Spaces:

GIZ
/

gender-strategy-chatbot-giz

Running

App Files Files Community

VanessaHochwald committed on Dec 27, 2024

Commit

fe57270

verified ·

1 Parent(s): 3f7c136

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -114

app.py CHANGED Viewed

@@ -1,93 +1,7 @@
-# import streamlit as st
-# from transformers import pipeline
-# from sentence_transformers import SentenceTransformer, util
-# import pdfplumber
-# # ---- App Setup ----
-# st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
-# st.title("Chatbot for Gender Strategy Document")
-# # ---- Helper Functions ----
-# def extract_text_from_pdf(pdf_path):
-#     """Extracts text from a PDF file."""
-#     text = ""
-#     with pdfplumber.open(pdf_path) as pdf:
-#         for page in pdf.pages:
-#             text += page.extract_text()
-#     return text
-# def preprocess_text(document_text):
-#     """Processes the text, removes hard line breaks, and ensures clean paragraphs."""
-#     # 1. Remove hyphenation and line breaks, but keep the word intact
-#     document_text = re.sub(r'(?<=\S)-\n(?=\S)', '', document_text)  # Remove hyphenation and \n
-#     # 2. Merge hard line breaks that occur between two words without hyphenation into a single space
-#     document_text = re.sub(r'(?<=\S)\n(?=\S)', ' ', document_text)
-#     # 3. Remove unnecessary whitespace at the beginning and end of the text
-#     document_text = document_text.strip()
-#     # 4. Optional: Reduce multiple consecutive spaces to a single space
-#     document_text = re.sub(r'\s{2,}', ' ', document_text)
-#     # Return the processed text
-#     standardized_text = document_text
-#     return standardized_text
-# def semantic_search(query, corpus, model):
-#     """Performs semantic search to find the most relevant text in the corpus."""
-#     query_embedding = model.encode(query, convert_to_tensor=True)
-#     corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
-#     scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
-#     best_match_idx = scores.argmax().item()
-#     return corpus[best_match_idx], scores[best_match_idx].item()
-# # ---- Load PDF and Extract Text ----
-# @st.cache_data
-# def load_pdf_and_prepare_embeddings(pdf_path):
-#     """Loads a PDF, extracts text, standardizes formatting, splits into chunks, and prepares embeddings."""
-#     document_text = extract_text_from_pdf(pdf_path)
-#     standardized_text = preprocess_text(document_text)
-#     chunks = standardized_text.split("\n\n")  # Splitting text into chunks by paragraphs
-#     model = SentenceTransformer('all-MiniLM-L6-v2')
-#     return chunks, model
-# pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
-# chunks, embedding_model = load_pdf_and_prepare_embeddings(pdf_path)
-# # ---- User Input Section ----
-# st.sidebar.header("Ask a Question")
-# query = st.sidebar.text_area("Type your question here:")
-# if st.sidebar.button("Submit"):
-#     if query.strip() == "":
-#         st.sidebar.error("Please enter a question.")
-#     else:
-#         with st.spinner("Searching for the best answer..."):
-#             answer, score = semantic_search(query, chunks, embedding_model)
-#             st.write("### Your Question:")
-#             st.write(query)
-#             st.write("### Best Match:")
-#             st.write(answer)
-#             st.write(f"**Relevance Score:** {score:.2f}")
-# # ---- Info Section ----
-# with st.expander("ℹ️ - About this app"):
-#     st.write(
-#         """
-#         This chatbot allows users to ask questions about the Gender Strategy document.
-#         It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.
-#         - The document is pre-loaded and processed into searchable chunks.
-#         - The model ranks the relevance of the results based on cosine similarity.
-#         For feedback or improvements, please contact the developer.
-#         """
-#     )
 import streamlit as st
 from transformers import pipeline
 from sentence_transformers import SentenceTransformer, util
 import pdfplumber
-import re  # Import für Regular Expressions
 # ---- App Setup ----
 st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
@@ -102,26 +16,19 @@ def extract_text_from_pdf(pdf_path):
             text += page.extract_text()
     return text
 def preprocess_text(document_text):
     """Processes the text, removes hard line breaks, and ensures clean paragraphs."""
-    # 1. Entferne Bindestriche am Zeilenende, aber erhalte das Wort intakt
-    document_text = re.sub(r'(?<=\S)-\n(?=\S)', '', document_text)
-    # 2. Identifiziere harte Zeilenumbrüche zwischen Wörtern und ersetze sie durch Leerzeichen
     document_text = re.sub(r'(?<=\S)\n(?=\S)', ' ', document_text)
-    # 3. Ersetze mehrere Leerzeilen durch einen einzigen doppelten Zeilenumbruch für saubere Absätze
-    document_text = re.sub(r'\n{2,}', '\n\n', document_text)
-    # 4. Entferne überflüssige Leerzeichen am Anfang und Ende
     document_text = document_text.strip()
-    # 5. Reduziere mehrere Leerzeichen innerhalb eines Absatzes auf ein einziges Leerzeichen
     document_text = re.sub(r'\s{2,}', ' ', document_text)
-    # Rückgabe des verarbeiteten Textes
-    return document_text
 def semantic_search(query, corpus, model):
@@ -137,15 +44,14 @@ def semantic_search(query, corpus, model):
 @st.cache_data
 def load_pdf_and_prepare_embeddings(pdf_path):
     """Loads a PDF, extracts text, standardizes formatting, splits into chunks, and prepares embeddings."""
-    raw_text = extract_text_from_pdf(pdf_path)  # Extract raw text from PDF
-    processed_text = preprocess_text(raw_text)  # Preprocess the raw text
-    chunks = processed_text.split("\n\n")  # Splitting text into chunks by paragraphs
     model = SentenceTransformer('all-MiniLM-L6-v2')
-    return chunks, model, raw_text, processed_text
-# ---- Main Application Logic ----
 pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
-chunks, embedding_model, raw_text, processed_text = load_pdf_and_prepare_embeddings(pdf_path)
 # ---- User Input Section ----
 st.sidebar.header("Ask a Question")
@@ -163,21 +69,17 @@ if st.sidebar.button("Submit"):
             st.write(answer)
             st.write(f"**Relevance Score:** {score:.2f}")
-# ---- Before & After Section ----
-st.write("## Original vs Processed Text")
-with st.expander("View Original Text"):
-    st.text(raw_text[:2000])  # Display the first 2000 characters of the raw text
-with st.expander("View Processed Text"):
-    st.text(processed_text[:2000])  # Display the first 2000 characters of the processed text
 # ---- Info Section ----
 with st.expander("ℹ️ - About this app"):
     st.write(
         """
         This chatbot allows users to ask questions about the Gender Strategy document.
         It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.
         - The document is pre-loaded and processed into searchable chunks.
         - The model ranks the relevance of the results based on cosine similarity.
         For feedback or improvements, please contact the developer.
         """
     )

 import streamlit as st
 from transformers import pipeline
 from sentence_transformers import SentenceTransformer, util
 import pdfplumber
 # ---- App Setup ----
 st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
             text += page.extract_text()
     return text
 def preprocess_text(document_text):
     """Processes the text, removes hard line breaks, and ensures clean paragraphs."""
+    # 1. Remove hyphenation and line breaks, but keep the word intact
+    document_text = re.sub(r'(?<=\S)-\n(?=\S)', '', document_text)  # Remove hyphenation and \n
+    # 2. Merge hard line breaks that occur between two words without hyphenation into a single space
     document_text = re.sub(r'(?<=\S)\n(?=\S)', ' ', document_text)
+    # 3. Remove unnecessary whitespace at the beginning and end of the text
     document_text = document_text.strip()
+    # 4. Optional: Reduce multiple consecutive spaces to a single space
     document_text = re.sub(r'\s{2,}', ' ', document_text)
+    # Return the processed text
+    standardized_text = document_text
+    return standardized_text
 def semantic_search(query, corpus, model):
 @st.cache_data
 def load_pdf_and_prepare_embeddings(pdf_path):
     """Loads a PDF, extracts text, standardizes formatting, splits into chunks, and prepares embeddings."""
+    document_text = extract_text_from_pdf(pdf_path)
+    standardized_text = preprocess_text(document_text)
+    chunks = standardized_text.split("\n\n")  # Splitting text into chunks by paragraphs
     model = SentenceTransformer('all-MiniLM-L6-v2')
+    return chunks, model
 pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
+chunks, embedding_model = load_pdf_and_prepare_embeddings(pdf_path)
 # ---- User Input Section ----
 st.sidebar.header("Ask a Question")
             st.write(answer)
             st.write(f"**Relevance Score:** {score:.2f}")
 # ---- Info Section ----
 with st.expander("ℹ️ - About this app"):
     st.write(
         """
         This chatbot allows users to ask questions about the Gender Strategy document.
         It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.
         - The document is pre-loaded and processed into searchable chunks.
         - The model ranks the relevance of the results based on cosine similarity.
         For feedback or improvements, please contact the developer.
         """
     )