tareeb23 commited on
Commit
bf71408
·
verified ·
1 Parent(s): c9a0222

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -56
app.py CHANGED
@@ -1,27 +1,12 @@
1
  import streamlit as st
2
  from transformers import pipeline
3
  import re
4
- from collections import Counter
5
- import string
6
  import docx2txt
7
  from io import BytesIO
8
 
9
  @st.cache_resource
10
  def load_qa_pipeline():
11
- return pipeline("question-answering", model="tareeb23/Roberta_SQUAD_V2")
12
-
13
- def normalize_answer(s):
14
- """Lower text and remove punctuation, articles and extra whitespace."""
15
- def remove_articles(text):
16
- return re.sub(r'\b(a|an|the)\b', ' ', text)
17
- def white_space_fix(text):
18
- return ' '.join(text.split())
19
- def remove_punc(text):
20
- exclude = set(string.punctuation)
21
- return ''.join(ch for ch in text if ch not in exclude)
22
- def lower(text):
23
- return text.lower()
24
- return white_space_fix(remove_articles(remove_punc(lower(s))))
25
 
26
  def chunk_text(text, chunk_size=1000):
27
  sentences = re.split(r'(?<=[.!?])\s+', text)
@@ -40,22 +25,24 @@ def chunk_text(text, chunk_size=1000):
40
 
41
  return chunks
42
 
43
- def highlight_text(text, start_indices, chunk_size):
44
- highlighted_text = text
45
- offset = 0
46
- for i, start in enumerate(start_indices):
47
- actual_start = start + (i * 7) # 7 is the length of the highlight tag
48
- chunk_index = start // chunk_size
49
- actual_start += chunk_index * chunk_size
50
- highlighted_text = (
51
- highlighted_text[:actual_start + offset] +
52
- "<mark>" +
53
- highlighted_text[actual_start + offset:actual_start + offset + 10] +
54
- "</mark>" +
55
- highlighted_text[actual_start + offset + 10:]
56
- )
57
- offset += 13 # Length of "<mark></mark>"
58
- return highlighted_text
 
 
59
 
60
  def main():
61
  st.title("Document Search Engine")
@@ -76,36 +63,34 @@ def main():
76
  st.session_state['context'] = context
77
 
78
  # Search input and button
79
- col1, col2 = st.columns([3, 1])
80
  with col1:
81
  question = st.text_input("Enter your search query:")
82
  with col2:
83
- search_button = st.button("Search")
 
 
84
 
85
- if search_button:
86
  if context and question:
87
- chunks = chunk_text(context)
88
- results = []
89
- for i, chunk in enumerate(chunks):
90
- result = qa_pipeline(question=question, context=chunk)
91
- result['chunk_index'] = i
92
- results.append(result)
93
-
94
- # Sort results by score and get top 3
95
- top_results = sorted(results, key=lambda x: x['score'], reverse=True)[:3]
96
-
97
- st.subheader("Top 3 Results:")
98
- for i, result in enumerate(top_results, 1):
99
- st.write(f"{i}. Answer: {result['answer']}")
100
- st.write(f" Confidence: {result['score']:.2f}")
101
-
102
- # Highlight answers in the context
103
- chunk_size = 1000 # Make sure this matches the chunk_size in chunk_text function
104
- start_indices = [result['start'] + (result['chunk_index'] * chunk_size) for result in top_results]
105
- highlighted_context = highlight_text(context, start_indices, chunk_size)
106
 
107
- st.subheader("Context with Highlighted Answers:")
108
- st.markdown(highlighted_context, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  else:
110
  st.warning("Please provide both context and search query.")
111
 
 
1
  import streamlit as st
2
  from transformers import pipeline
3
  import re
 
 
4
  import docx2txt
5
  from io import BytesIO
6
 
7
@st.cache_resource
def load_qa_pipeline():
    """Build the extractive question-answering pipeline exactly once.

    The result is cached by Streamlit (`st.cache_resource`) so the model is
    not reloaded on every script rerun.
    """
    qa = pipeline(
        "question-answering",
        model="tareeb23/Roberta_SQUAD_V2",
        tokenizer="tareeb23/Roberta_SQUAD_V2",
    )
    return qa
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  def chunk_text(text, chunk_size=1000):
12
  sentences = re.split(r'(?<=[.!?])\s+', text)
 
25
 
26
  return chunks
27
 
28
def get_top_answers(qa_pipeline, question, context, top_k=3, score_limit=0.1):
    """Run extractive QA over the context chunk-by-chunk and rank the answers.

    Args:
        qa_pipeline: A Hugging Face question-answering pipeline.
        question: The query string.
        context: The full document text to search.
        top_k: Maximum number of answers to return (default 3).
        score_limit: Minimum confidence score an answer must reach (default 0.1).

    Returns:
        At most ``top_k`` pipeline result dicts, highest score first, each
        augmented with ``chunk_index`` and ``chunk_start`` (the chunk's
        character offset within the original context).
    """
    results = []
    offset = 0  # running character offset of the current chunk in `context`
    for index, chunk in enumerate(chunk_text(context)):
        result = qa_pipeline(question=question, context=chunk)
        result['chunk_index'] = index
        # Use the actual cumulative offset instead of assuming fixed
        # 1000-char chunks: chunk_text splits on sentence boundaries, so
        # chunk lengths vary and `index * 1000` drifts.  NOTE(review): still
        # approximate if chunk_text drops inter-sentence whitespace when
        # joining — confirm against chunk_text's join behavior.
        result['chunk_start'] = offset
        offset += len(chunk)
        results.append(result)

    # Drop low-confidence answers, then keep the top_k best by score.
    qualifying = [r for r in results if r['score'] >= score_limit]
    return sorted(qualifying, key=lambda r: r['score'], reverse=True)[:top_k]
43
+
44
def highlight_answer(text, answer, start):
    """Wrap one occurrence of `answer` inside `text` in Markdown bold markers.

    `start` is the expected character offset of the answer.  Because the
    chunk offsets computed upstream are approximate, the slice at `start`
    may not actually contain the answer; blindly splicing there would
    overwrite unrelated text.  Instead, the nearest real occurrence is
    located with str.find/str.rfind, and if the answer is absent the text
    is returned unchanged.
    """
    if not answer:
        return text
    end = start + len(answer)
    # Fast path: the offset is exact — identical to the naive splice.
    if text[start:end] == answer:
        return text[:start] + "**" + answer + "**" + text[end:]
    # Offset was approximate — find the real occurrence closest to `start`.
    after = text.find(answer, start)
    before = text.rfind(answer, 0, end)  # highest match starting at or before `start`
    candidates = [i for i in (after, before) if i != -1]
    if not candidates:
        return text  # answer not present; do not corrupt the text
    pos = min(candidates, key=lambda i: abs(i - start))
    return text[:pos] + "**" + answer + "**" + text[pos + len(answer):]
46
 
47
  def main():
48
  st.title("Document Search Engine")
 
63
  st.session_state['context'] = context
64
 
65
  # Search input and button
66
+ col1, col2, col3 = st.columns([3, 1, 1])
67
  with col1:
68
  question = st.text_input("Enter your search query:")
69
  with col2:
70
+ top_k = st.number_input("Top K results", min_value=1, max_value=10, value=3)
71
+ with col3:
72
+ score_limit = st.number_input("Score limit", min_value=0.0, max_value=1.0, value=0.1, step=0.05)
73
 
74
+ if st.button("Search"):
75
  if context and question:
76
+ top_results = get_top_answers(qa_pipeline, question, context, top_k=top_k, score_limit=score_limit)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ if top_results:
79
+ st.subheader(f"Top {len(top_results)} Results:")
80
+ for i, result in enumerate(top_results, 1):
81
+ st.write(f"{i}. Answer: {result['answer']}")
82
+ st.write(f" Confidence: {result['score']:.4f}")
83
+ st.write(f" Start Index in Original Context: {result['chunk_start'] + result['start']}")
84
+ st.write(f" Chunk Index: {result['chunk_index']}")
85
+
86
+ st.subheader("Context with Highlighted Answers:")
87
+ highlighted_context = context
88
+ for result in reversed(top_results): # Reverse to avoid messing up indices
89
+ start = result['chunk_start'] + result['start']
90
+ highlighted_context = highlight_answer(highlighted_context, result['answer'], start)
91
+ st.markdown(highlighted_context)
92
+ else:
93
+ st.warning("No results found above the score limit.")
94
  else:
95
  st.warning("Please provide both context and search query.")
96