Spaces:

ugaray96
/

neural-search

Sleeping

App Files Community

ugmSorcero committed on Sep 23, 2022

Commit

843bc9e

1 Parent(s): 27e0350

Fixes indexing problem and adds split id

Browse files

Files changed (5) hide show

core/pipelines.py +2 -0
core/search_index.py +1 -0
interface/components.py +21 -18
interface/config.py +5 -1
interface/pages.py +3 -1

core/pipelines.py CHANGED Viewed

@@ -19,6 +19,8 @@ def keyword_search(index="documents", split_word_length=100):
       - Documents that have more lexical overlap with the query are more likely to be relevant
       - Words that occur in fewer documents are more significant than words that occur in many documents
     """
     document_store = InMemoryDocumentStore(index=index)
     keyword_retriever = TfidfRetriever(document_store=(document_store))

       - Documents that have more lexical overlap with the query are more likely to be relevant
       - Words that occur in fewer documents are more significant than words that occur in many documents
+    :warning: **(HAYSTACK BUG) Keyword Search doesn't work if you reindex:** Please refresh page in order to reindex
     """
     document_store = InMemoryDocumentStore(index=index)
     keyword_retriever = TfidfRetriever(document_store=(document_store))

core/search_index.py CHANGED Viewed

@@ -45,6 +45,7 @@ def search(queries, pipeline):
                     "score": res.score,
                     "id": res.meta["id"],
                     "fragment_id": res.id,
                 }
             )
         if not score_is_empty:

                     "score": res.score,
                     "id": res.meta["id"],
                     "fragment_id": res.id,
+                    "meta": res.meta
                 }
             )
         if not score_is_empty:

interface/components.py CHANGED Viewed

@@ -42,11 +42,15 @@ def component_select_pipeline(container):
                 "index_pipeline": index_pipeline,
                 "doc": pipeline_funcs[index_pipe].__doc__,
             }
 def component_show_pipeline(pipeline, pipeline_name):
     """Draw the pipeline"""
-    with st.expander("Show pipeline"):
         if pipeline["doc"] is not None:
             st.markdown(pipeline["doc"])
         fig = get_pipeline_graph(pipeline[pipeline_name])
@@ -59,41 +63,41 @@ def component_show_search_result(container, results):
             st.markdown(f"### Match {idx+1}")
             st.markdown(f"**Text**: {document['text']}")
             st.markdown(f"**Document**: {document['id']}")
             if document["score"] is not None:
                 st.markdown(f"**Score**: {document['score']:.3f}")
             st.markdown("---")
-def component_text_input(container):
     """Draw the Text Input widget"""
     with container:
         texts = []
-        doc_id = 1
         with st.expander("Enter documents"):
             while True:
                 text = st.text_input(f"Document {doc_id}", key=doc_id)
                 if text != "":
-                    texts.append({"text": text})
                     doc_id += 1
                     st.markdown("---")
                 else:
                     break
         corpus = [
-            {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(texts)
         ]
-        return corpus
-def component_article_url(container):
     """Draw the Article URL widget"""
     with container:
         urls = []
-        doc_id = 1
         with st.expander("Enter URLs"):
             while True:
                 url = st.text_input(f"URL {doc_id}", key=doc_id)
                 if url != "":
-                    urls.append({"text": extract_text_from_url(url)})
                     doc_id += 1
                     st.markdown("---")
                 else:
@@ -101,19 +105,18 @@ def component_article_url(container):
         for idx, doc in enumerate(urls):
             with st.expander(f"Preview URL {idx}"):
-                st.write(doc)
         corpus = [
-            {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
         ]
-        return corpus
-def component_file_input(container):
     """Draw the extract text from file widget"""
     with container:
         files = []
-        doc_id = 1
         with st.expander("Enter Files"):
             while True:
                 file = st.file_uploader(
@@ -122,7 +125,7 @@ def component_file_input(container):
                 if file != None:
                     extracted_text = extract_text_from_file(file)
                     if extracted_text != None:
-                        files.append({"text": extracted_text})
                         doc_id += 1
                         st.markdown("---")
                     else:
@@ -132,9 +135,9 @@ def component_file_input(container):
         for idx, doc in enumerate(files):
             with st.expander(f"Preview File {idx}"):
-                st.write(doc)
         corpus = [
-            {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
         ]
-        return corpus

                 "index_pipeline": index_pipeline,
                 "doc": pipeline_funcs[index_pipe].__doc__,
             }
+            st.session_state['doc_id'] = 0
 def component_show_pipeline(pipeline, pipeline_name):
     """Draw the pipeline"""
+    expander_text = "Show pipeline"
+    if pipeline["doc"] is not None and "BUG" in pipeline["doc"]:
+        expander_text += "  ⚠️"
+    with st.expander(expander_text):
         if pipeline["doc"] is not None:
             st.markdown(pipeline["doc"])
         fig = get_pipeline_graph(pipeline[pipeline_name])
             st.markdown(f"### Match {idx+1}")
             st.markdown(f"**Text**: {document['text']}")
             st.markdown(f"**Document**: {document['id']}")
+            if '_split_id' in document['meta']:
+                st.markdown(f"**Document Chunk**: {document['meta']['_split_id']}")
             if document["score"] is not None:
                 st.markdown(f"**Score**: {document['score']:.3f}")
             st.markdown("---")
+def component_text_input(container, doc_id):
     """Draw the Text Input widget"""
     with container:
         texts = []
         with st.expander("Enter documents"):
             while True:
                 text = st.text_input(f"Document {doc_id}", key=doc_id)
                 if text != "":
+                    texts.append({"text": text, 'doc_id': doc_id})
                     doc_id += 1
                     st.markdown("---")
                 else:
                     break
         corpus = [
+            {"text": doc["text"], "id": doc["doc_id"]} for doc in texts
         ]
+        return corpus, doc_id
+def component_article_url(container, doc_id):
     """Draw the Article URL widget"""
     with container:
         urls = []
         with st.expander("Enter URLs"):
             while True:
                 url = st.text_input(f"URL {doc_id}", key=doc_id)
                 if url != "":
+                    urls.append({"text": extract_text_from_url(url), 'doc_id': doc_id})
                     doc_id += 1
                     st.markdown("---")
                 else:
         for idx, doc in enumerate(urls):
             with st.expander(f"Preview URL {idx}"):
+                st.write(doc['text'])
         corpus = [
+            {"text": doc["text"], "id": doc["doc_id"]} for doc in urls
         ]
+        return corpus, doc_id
+def component_file_input(container, doc_id):
     """Draw the extract text from file widget"""
     with container:
         files = []
         with st.expander("Enter Files"):
             while True:
                 file = st.file_uploader(
                 if file != None:
                     extracted_text = extract_text_from_file(file)
                     if extracted_text != None:
+                        files.append({"text": extracted_text, 'doc_id': doc_id})
                         doc_id += 1
                         st.markdown("---")
                     else:
         for idx, doc in enumerate(files):
             with st.expander(f"Preview File {idx}"):
+                st.write(doc['text'])
         corpus = [
+            {"text": doc["text"], "id": doc["doc_id"]} for doc in files
         ]
+        return corpus, doc_id

interface/config.py CHANGED Viewed

@@ -1,7 +1,11 @@
 from interface.pages import page_landing_page, page_search, page_index
 # Define default Session Variables over the whole session.
-session_state_variables = {"pipeline": None, "pipeline_func_parameters": []}
 # Define Pages for the demo
 pages = {

 from interface.pages import page_landing_page, page_search, page_index
 # Define default Session Variables over the whole session.
+session_state_variables = {
+    "pipeline": None,
+    "pipeline_func_parameters": [],
+    "doc_id": 0
+}
 # Define Pages for the demo
 pages = {

interface/pages.py CHANGED Viewed

@@ -81,7 +81,8 @@ def page_index(container):
         clear_index = st.sidebar.checkbox('Clear Index', True)
-        corpus = input_funcs[selected_input][0](container)
         if len(corpus) > 0:
             index_results = None
@@ -91,5 +92,6 @@ def page_index(container):
                     st.session_state["pipeline"]["index_pipeline"],
                     clear_index
                 )
             if index_results:
                 st.write(index_results)

         clear_index = st.sidebar.checkbox('Clear Index', True)
+        doc_id = st.session_state['doc_id']
+        corpus, doc_id = input_funcs[selected_input][0](container, doc_id)
         if len(corpus) > 0:
             index_results = None
                     st.session_state["pipeline"]["index_pipeline"],
                     clear_index
                 )
+                st.session_state['doc_id'] = doc_id
             if index_results:
                 st.write(index_results)