ugmSorcero committed
Commit 0f09d43
Parent: 1fd11ba

Audio gets deleted when changing pipelines
core/pipelines.py CHANGED

@@ -25,8 +25,6 @@ def keyword_search(index="documents", split_word_length=100, audio_output=False)
 
     - Documents that have more lexical overlap with the query are more likely to be relevant
     - Words that occur in fewer documents are more significant than words that occur in many documents
-
-    :warning: **(HAYSTACK BUG) Keyword Search doesn't work if you reindex:** Please refresh page in order to reindex
     """
     document_store = InMemoryDocumentStore(index=index)
     keyword_retriever = TfidfRetriever(document_store=(document_store))
@@ -67,7 +65,7 @@ def dense_passage_retrieval(
     split_word_length=100,
     query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
     passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
-    audio_output=False
+    audio_output=False,
 ):
     """
     **Dense Passage Retrieval Pipeline**
@@ -104,7 +102,7 @@ def dense_passage_retrieval(
     index_pipeline.add_node(
         document_store, name="DocumentStore", inputs=["DPRRetriever"]
     )
-
+
     if audio_output:
         doc2speech = DocumentToSpeech(
             model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
@@ -123,6 +121,7 @@ def dense_passage_retrieval_ranker(
     query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
     passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
     ranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
+    audio_output=False,
 ):
     """
     **Dense Passage Retrieval Ranker Pipeline**
@@ -142,5 +141,14 @@ def dense_passage_retrieval_ranker(
     ranker = SentenceTransformersRanker(model_name_or_path=ranker_model)
 
     search_pipeline.add_node(ranker, name="Ranker", inputs=["DPRRetriever"])
+
+    if audio_output:
+        doc2speech = DocumentToSpeech(
+            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
+            generated_audio_dir=Path(data_path + "audio"),
+        )
+        search_pipeline.add_node(
+            doc2speech, name="DocumentToSpeech", inputs=["Ranker"]
+        )
 
     return search_pipeline, index_pipeline
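For context, a minimal usage sketch of the `audio_output` flag added to the ranker pipeline. The function name, defaults, and node wiring come from the diff above; the `run` call and query string are assumptions about how the caller drives a standard Haystack pipeline, not code from this commit:

```python
# Sketch only: assumes the repo layout in this commit and a working Haystack install.
from core.pipelines import dense_passage_retrieval_ranker

search_pipeline, index_pipeline = dense_passage_retrieval_ranker(audio_output=True)
# With audio_output=True the search pipeline now ends in a DocumentToSpeech node,
# so results should carry generated audio stored under data_path + "audio".
result = search_pipeline.run(query="Why are rare words more significant?")
```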
interface/components.py CHANGED

@@ -1,5 +1,5 @@
 import streamlit as st
-from interface.utils import get_pipelines, extract_text_from_url, extract_text_from_file
+from interface.utils import get_pipelines, extract_text_from_url, extract_text_from_file, reset_vars_data
 from interface.draw_pipelines import get_pipeline_graph
 
 
@@ -42,7 +42,7 @@ def component_select_pipeline(container):
             "index_pipeline": index_pipeline,
             "doc": pipeline_funcs[index_pipe].__doc__,
         }
-        st.session_state["doc_id"] = 0
+        reset_vars_data()
 
 
 def component_show_pipeline(pipeline, pipeline_name):
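(Design note: the inline `st.session_state["doc_id"] = 0` is folded into `reset_vars_data()`, so resetting the document counter and clearing the on-disk audio directory happen together whenever the user switches pipelines; the helper is defined in `interface/utils.py` below.)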
interface/utils.py CHANGED

@@ -1,5 +1,8 @@
 from io import StringIO
+import os
+import shutil
 import core.pipelines as pipelines_functions
+from core.pipelines import data_path
 from inspect import getmembers, isfunction, signature
 from newspaper import Article
 from PyPDF2 import PdfFileReader
@@ -22,6 +25,11 @@ def get_pipelines():
     ]
     return pipeline_names, pipeline_funcs, pipeline_func_parameters
 
+def reset_vars_data():
+    st.session_state["doc_id"] = 0
+    # Delete data files
+    shutil.rmtree(data_path)
+    os.makedirs(data_path, exist_ok=True)
 
 @st.experimental_memo
 def extract_text_from_url(url: str):
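One caveat worth noting: `shutil.rmtree` raises `FileNotFoundError` when the target directory does not exist, e.g. on a fresh checkout before any audio has been generated. A defensive variant of the new helper, as a sketch rather than the committed code (`st` and `data_path` are the module-level imports shown in the diff):

```python
import os
import shutil

def reset_vars_data():
    st.session_state["doc_id"] = 0
    # Delete generated data files; ignore_errors tolerates a missing directory
    shutil.rmtree(data_path, ignore_errors=True)
    os.makedirs(data_path, exist_ok=True)
```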