Spaces:
Runtime error
Runtime error
ugmSorcero
committed on
Commit
•
0f09d43
1
Parent(s):
1fd11ba
Audio gets deleted when changing pipelines
Browse files
- core/pipelines.py +12 -4
- interface/components.py +2 -2
- interface/utils.py +8 -0
core/pipelines.py
CHANGED
@@ -25,8 +25,6 @@ def keyword_search(index="documents", split_word_length=100, audio_output=False)
|
|
25 |
|
26 |
- Documents that have more lexical overlap with the query are more likely to be relevant
|
27 |
- Words that occur in fewer documents are more significant than words that occur in many documents
|
28 |
-
|
29 |
-
:warning: **(HAYSTACK BUG) Keyword Search doesn't work if you reindex:** Please refresh page in order to reindex
|
30 |
"""
|
31 |
document_store = InMemoryDocumentStore(index=index)
|
32 |
keyword_retriever = TfidfRetriever(document_store=(document_store))
|
@@ -67,7 +65,7 @@ def dense_passage_retrieval(
|
|
67 |
split_word_length=100,
|
68 |
query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
|
69 |
passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
|
70 |
-
audio_output=False
|
71 |
):
|
72 |
"""
|
73 |
**Dense Passage Retrieval Pipeline**
|
@@ -104,7 +102,7 @@ def dense_passage_retrieval(
|
|
104 |
index_pipeline.add_node(
|
105 |
document_store, name="DocumentStore", inputs=["DPRRetriever"]
|
106 |
)
|
107 |
-
|
108 |
if audio_output:
|
109 |
doc2speech = DocumentToSpeech(
|
110 |
model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
|
@@ -123,6 +121,7 @@ def dense_passage_retrieval_ranker(
|
|
123 |
query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
|
124 |
passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
|
125 |
ranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
|
|
|
126 |
):
|
127 |
"""
|
128 |
**Dense Passage Retrieval Ranker Pipeline**
|
@@ -142,5 +141,14 @@ def dense_passage_retrieval_ranker(
|
|
142 |
ranker = SentenceTransformersRanker(model_name_or_path=ranker_model)
|
143 |
|
144 |
search_pipeline.add_node(ranker, name="Ranker", inputs=["DPRRetriever"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
|
146 |
return search_pipeline, index_pipeline
|
|
|
25 |
|
26 |
- Documents that have more lexical overlap with the query are more likely to be relevant
|
27 |
- Words that occur in fewer documents are more significant than words that occur in many documents
|
|
|
|
|
28 |
"""
|
29 |
document_store = InMemoryDocumentStore(index=index)
|
30 |
keyword_retriever = TfidfRetriever(document_store=(document_store))
|
|
|
65 |
split_word_length=100,
|
66 |
query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
|
67 |
passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
|
68 |
+
audio_output=False,
|
69 |
):
|
70 |
"""
|
71 |
**Dense Passage Retrieval Pipeline**
|
|
|
102 |
index_pipeline.add_node(
|
103 |
document_store, name="DocumentStore", inputs=["DPRRetriever"]
|
104 |
)
|
105 |
+
|
106 |
if audio_output:
|
107 |
doc2speech = DocumentToSpeech(
|
108 |
model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
|
|
|
121 |
query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
|
122 |
passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
|
123 |
ranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
|
124 |
+
audio_output=False,
|
125 |
):
|
126 |
"""
|
127 |
**Dense Passage Retrieval Ranker Pipeline**
|
|
|
141 |
ranker = SentenceTransformersRanker(model_name_or_path=ranker_model)
|
142 |
|
143 |
search_pipeline.add_node(ranker, name="Ranker", inputs=["DPRRetriever"])
|
144 |
+
|
145 |
+
if audio_output:
|
146 |
+
doc2speech = DocumentToSpeech(
|
147 |
+
model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
|
148 |
+
generated_audio_dir=Path(data_path + "audio"),
|
149 |
+
)
|
150 |
+
search_pipeline.add_node(
|
151 |
+
doc2speech, name="DocumentToSpeech", inputs=["Ranker"]
|
152 |
+
)
|
153 |
|
154 |
return search_pipeline, index_pipeline
|
interface/components.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import streamlit as st
|
2 |
-
from interface.utils import get_pipelines, extract_text_from_url, extract_text_from_file
|
3 |
from interface.draw_pipelines import get_pipeline_graph
|
4 |
|
5 |
|
@@ -42,7 +42,7 @@ def component_select_pipeline(container):
|
|
42 |
"index_pipeline": index_pipeline,
|
43 |
"doc": pipeline_funcs[index_pipe].__doc__,
|
44 |
}
|
45 |
-
|
46 |
|
47 |
|
48 |
def component_show_pipeline(pipeline, pipeline_name):
|
|
|
1 |
import streamlit as st
|
2 |
+
from interface.utils import get_pipelines, extract_text_from_url, extract_text_from_file, reset_vars_data
|
3 |
from interface.draw_pipelines import get_pipeline_graph
|
4 |
|
5 |
|
|
|
42 |
"index_pipeline": index_pipeline,
|
43 |
"doc": pipeline_funcs[index_pipe].__doc__,
|
44 |
}
|
45 |
+
reset_vars_data()
|
46 |
|
47 |
|
48 |
def component_show_pipeline(pipeline, pipeline_name):
|
interface/utils.py
CHANGED
@@ -1,5 +1,8 @@
|
|
1 |
from io import StringIO
|
|
|
|
|
2 |
import core.pipelines as pipelines_functions
|
|
|
3 |
from inspect import getmembers, isfunction, signature
|
4 |
from newspaper import Article
|
5 |
from PyPDF2 import PdfFileReader
|
@@ -22,6 +25,11 @@ def get_pipelines():
|
|
22 |
]
|
23 |
return pipeline_names, pipeline_funcs, pipeline_func_parameters
|
24 |
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
@st.experimental_memo
|
27 |
def extract_text_from_url(url: str):
|
|
|
1 |
from io import StringIO
|
2 |
+
import os
|
3 |
+
import shutil
|
4 |
import core.pipelines as pipelines_functions
|
5 |
+
from core.pipelines import data_path
|
6 |
from inspect import getmembers, isfunction, signature
|
7 |
from newspaper import Article
|
8 |
from PyPDF2 import PdfFileReader
|
|
|
25 |
]
|
26 |
return pipeline_names, pipeline_funcs, pipeline_func_parameters
|
27 |
|
28 |
+
def reset_vars_data():
    """Reset the session document counter and empty the data directory.

    Sets ``st.session_state["doc_id"]`` to 0, removes ``data_path`` together
    with everything inside it, and then recreates ``data_path`` as an empty
    directory.
    """
    st.session_state["doc_id"] = 0
    # Delete data files. If the directory does not exist there is nothing to
    # delete, and the makedirs call below creates it, so a missing path must
    # not abort the reset.
    try:
        shutil.rmtree(data_path)
    except FileNotFoundError:
        pass
    os.makedirs(data_path, exist_ok=True)
|
33 |
|
34 |
@st.experimental_memo
|
35 |
def extract_text_from_url(url: str):
|