VanessaHochwald committed on
Commit
fe57270
·
verified ·
1 Parent(s): 3f7c136

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -114
app.py CHANGED
@@ -1,93 +1,7 @@
1
- # import streamlit as st
2
- # from transformers import pipeline
3
- # from sentence_transformers import SentenceTransformer, util
4
- # import pdfplumber
5
-
6
- # # ---- App Setup ----
7
- # st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
8
- # st.title("Chatbot for Gender Strategy Document")
9
-
10
- # # ---- Helper Functions ----
11
- # def extract_text_from_pdf(pdf_path):
12
- # """Extracts text from a PDF file."""
13
- # text = ""
14
- # with pdfplumber.open(pdf_path) as pdf:
15
- # for page in pdf.pages:
16
- # text += page.extract_text()
17
- # return text
18
-
19
- # def preprocess_text(document_text):
20
- # """Processes the text, removes hard line breaks, and ensures clean paragraphs."""
21
- # # 1. Remove hyphenation and line breaks, but keep the word intact
22
- # document_text = re.sub(r'(?<=\S)-\n(?=\S)', '', document_text) # Remove hyphenation and \n
23
- # # 2. Merge hard line breaks that occur between two words without hyphenation into a single space
24
- # document_text = re.sub(r'(?<=\S)\n(?=\S)', ' ', document_text)
25
- # # 3. Remove unnecessary whitespace at the beginning and end of the text
26
- # document_text = document_text.strip()
27
- # # 4. Optional: Reduce multiple consecutive spaces to a single space
28
- # document_text = re.sub(r'\s{2,}', ' ', document_text)
29
- # # Return the processed text
30
- # standardized_text = document_text
31
- # return standardized_text
32
-
33
-
34
- # def semantic_search(query, corpus, model):
35
- # """Performs semantic search to find the most relevant text in the corpus."""
36
- # query_embedding = model.encode(query, convert_to_tensor=True)
37
- # corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
38
-
39
- # scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
40
- # best_match_idx = scores.argmax().item()
41
- # return corpus[best_match_idx], scores[best_match_idx].item()
42
-
43
- # # ---- Load PDF and Extract Text ----
44
- # @st.cache_data
45
- # def load_pdf_and_prepare_embeddings(pdf_path):
46
- # """Loads a PDF, extracts text, standardizes formatting, splits into chunks, and prepares embeddings."""
47
- # document_text = extract_text_from_pdf(pdf_path)
48
- # standardized_text = preprocess_text(document_text)
49
- # chunks = standardized_text.split("\n\n") # Splitting text into chunks by paragraphs
50
- # model = SentenceTransformer('all-MiniLM-L6-v2')
51
- # return chunks, model
52
-
53
- # pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
54
- # chunks, embedding_model = load_pdf_and_prepare_embeddings(pdf_path)
55
-
56
- # # ---- User Input Section ----
57
- # st.sidebar.header("Ask a Question")
58
- # query = st.sidebar.text_area("Type your question here:")
59
-
60
- # if st.sidebar.button("Submit"):
61
- # if query.strip() == "":
62
- # st.sidebar.error("Please enter a question.")
63
- # else:
64
- # with st.spinner("Searching for the best answer..."):
65
- # answer, score = semantic_search(query, chunks, embedding_model)
66
- # st.write("### Your Question:")
67
- # st.write(query)
68
- # st.write("### Best Match:")
69
- # st.write(answer)
70
- # st.write(f"**Relevance Score:** {score:.2f}")
71
-
72
- # # ---- Info Section ----
73
- # with st.expander("ℹ️ - About this app"):
74
- # st.write(
75
- # """
76
- # This chatbot allows users to ask questions about the Gender Strategy document.
77
- # It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.
78
-
79
- # - The document is pre-loaded and processed into searchable chunks.
80
- # - The model ranks the relevance of the results based on cosine similarity.
81
-
82
- # For feedback or improvements, please contact the developer.
83
- # """
84
- # )
85
-
86
  import streamlit as st
87
  from transformers import pipeline
88
  from sentence_transformers import SentenceTransformer, util
89
  import pdfplumber
90
- import re # Import für Regular Expressions
91
 
92
  # ---- App Setup ----
93
  st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
@@ -102,26 +16,19 @@ def extract_text_from_pdf(pdf_path):
102
  text += page.extract_text()
103
  return text
104
 
105
-
106
  def preprocess_text(document_text):
107
  """Processes the text, removes hard line breaks, and ensures clean paragraphs."""
108
- # 1. Entferne Bindestriche am Zeilenende, aber erhalte das Wort intakt
109
- document_text = re.sub(r'(?<=\S)-\n(?=\S)', '', document_text)
110
-
111
- # 2. Identifiziere harte Zeilenumbrüche zwischen Wörtern und ersetze sie durch Leerzeichen
112
  document_text = re.sub(r'(?<=\S)\n(?=\S)', ' ', document_text)
113
-
114
- # 3. Ersetze mehrere Leerzeilen durch einen einzigen doppelten Zeilenumbruch für saubere Absätze
115
- document_text = re.sub(r'\n{2,}', '\n\n', document_text)
116
-
117
- # 4. Entferne überflüssige Leerzeichen am Anfang und Ende
118
  document_text = document_text.strip()
119
-
120
- # 5. Reduziere mehrere Leerzeichen innerhalb eines Absatzes auf ein einziges Leerzeichen
121
  document_text = re.sub(r'\s{2,}', ' ', document_text)
122
-
123
- # Rückgabe des verarbeiteten Textes
124
- return document_text
125
 
126
 
127
  def semantic_search(query, corpus, model):
@@ -137,15 +44,14 @@ def semantic_search(query, corpus, model):
137
  @st.cache_data
138
  def load_pdf_and_prepare_embeddings(pdf_path):
139
  """Loads a PDF, extracts text, standardizes formatting, splits into chunks, and prepares embeddings."""
140
- raw_text = extract_text_from_pdf(pdf_path) # Extract raw text from PDF
141
- processed_text = preprocess_text(raw_text) # Preprocess the raw text
142
- chunks = processed_text.split("\n\n") # Splitting text into chunks by paragraphs
143
  model = SentenceTransformer('all-MiniLM-L6-v2')
144
- return chunks, model, raw_text, processed_text
145
 
146
- # ---- Main Application Logic ----
147
  pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
148
- chunks, embedding_model, raw_text, processed_text = load_pdf_and_prepare_embeddings(pdf_path)
149
 
150
  # ---- User Input Section ----
151
  st.sidebar.header("Ask a Question")
@@ -163,21 +69,17 @@ if st.sidebar.button("Submit"):
163
  st.write(answer)
164
  st.write(f"**Relevance Score:** {score:.2f}")
165
 
166
- # ---- Before & After Section ----
167
- st.write("## Original vs Processed Text")
168
- with st.expander("View Original Text"):
169
- st.text(raw_text[:2000]) # Display the first 2000 characters of the raw text
170
- with st.expander("View Processed Text"):
171
- st.text(processed_text[:2000]) # Display the first 2000 characters of the processed text
172
-
173
  # ---- Info Section ----
174
  with st.expander("ℹ️ - About this app"):
175
  st.write(
176
  """
177
  This chatbot allows users to ask questions about the Gender Strategy document.
178
  It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.
 
179
  - The document is pre-loaded and processed into searchable chunks.
180
  - The model ranks the relevance of the results based on cosine similarity.
 
181
  For feedback or improvements, please contact the developer.
182
  """
183
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  from transformers import pipeline
3
  from sentence_transformers import SentenceTransformer, util
4
  import pdfplumber
 
5
 
6
  # ---- App Setup ----
7
  st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
 
16
  text += page.extract_text()
17
  return text
18
 
 
19
  def preprocess_text(document_text):
20
  """Processes the text, removes hard line breaks, and ensures clean paragraphs."""
21
+ # 1. Remove hyphenation and line breaks, but keep the word intact
22
+ document_text = re.sub(r'(?<=\S)-\n(?=\S)', '', document_text) # Remove hyphenation and \n
23
+ # 2. Merge hard line breaks that occur between two words without hyphenation into a single space
 
24
  document_text = re.sub(r'(?<=\S)\n(?=\S)', ' ', document_text)
25
+ # 3. Remove unnecessary whitespace at the beginning and end of the text
 
 
 
 
26
  document_text = document_text.strip()
27
+ # 4. Optional: Reduce multiple consecutive spaces to a single space
 
28
  document_text = re.sub(r'\s{2,}', ' ', document_text)
29
+ # Return the processed text
30
+ standardized_text = document_text
31
+ return standardized_text
32
 
33
 
34
  def semantic_search(query, corpus, model):
 
44
  @st.cache_data
45
  def load_pdf_and_prepare_embeddings(pdf_path):
46
  """Loads a PDF, extracts text, standardizes formatting, splits into chunks, and prepares embeddings."""
47
+ document_text = extract_text_from_pdf(pdf_path)
48
+ standardized_text = preprocess_text(document_text)
49
+ chunks = standardized_text.split("\n\n") # Splitting text into chunks by paragraphs
50
  model = SentenceTransformer('all-MiniLM-L6-v2')
51
+ return chunks, model
52
 
 
53
  pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
54
+ chunks, embedding_model = load_pdf_and_prepare_embeddings(pdf_path)
55
 
56
  # ---- User Input Section ----
57
  st.sidebar.header("Ask a Question")
 
69
  st.write(answer)
70
  st.write(f"**Relevance Score:** {score:.2f}")
71
 
 
 
 
 
 
 
 
72
  # ---- Info Section ----
73
  with st.expander("ℹ️ - About this app"):
74
  st.write(
75
  """
76
  This chatbot allows users to ask questions about the Gender Strategy document.
77
  It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.
78
+
79
  - The document is pre-loaded and processed into searchable chunks.
80
  - The model ranks the relevance of the results based on cosine similarity.
81
+
82
  For feedback or improvements, please contact the developer.
83
  """
84
  )
85
+