|
from haystack.document_stores import InMemoryDocumentStore |
|
import pandas as pd |
|
import gradio as gr |
|
|
|
def _clean_filenames(files, dirname):
    """Turn raw file paths into space-separated search text.

    Strips everything up to and including the first directory after
    ``dirname``, removes parentheses and the final filename component,
    then converts separators to spaces and drops leading zeros in
    numeric tokens (e.g. ``' 007'`` -> ``' 7'``).

    NOTE: the regex flags are explicit because pandas >= 2.0 defaults
    ``Series.str.replace`` to ``regex=False``; without them the patterns
    would be matched literally and the cleaning would silently do nothing.
    """
    return (
        files
        .str.replace(rf'.*{dirname}/[^/]+/', '', regex=True)  # drop prefix incl. candidate dir
        .str.replace(r'[()]', '', regex=True)                  # strip parentheses
        .str.replace(r'/[^/]+$', '', regex=True)               # drop trailing filename
        .str.replace('/', ' ', regex=False)
        .str.replace('-', ' ', regex=False)
        .str.replace(r' 0+', ' ', regex=True)                  # strip leading zeros after a space
    )


df = pd.read_parquet('df.parquet')

dirname = 'lot3'

df['fileclean'] = _clean_filenames(df.file, dirname)

candidats = pd.read_parquet('candidats.parquet')

df2 = pd.read_parquet('df2.parquet')

# Cast the shared columns to str on BOTH sides so the merge keys compare
# equal regardless of the dtypes stored in the parquet files.
for c in df2.columns:
    candidats[c] = candidats[c].astype(str)
    df2[c] = df2[c].astype(str)

candidats = candidats.merge(df2)
|
|
|
# Index each distinct cleaned file name as a BM25 document, then wire a
# retrieval pipeline around the store.
from haystack.nodes import BM25Retriever
from haystack.pipelines import DocumentSearchPipeline

document_store = InMemoryDocumentStore(use_bm25=True)

unique_files = df.drop_duplicates(subset=['fileclean'])
documents = unique_files.rename(columns={'fileclean': 'content'}).to_dict(orient='records')
document_store.write_documents(documents)

retriever = BM25Retriever(document_store=document_store)
pipeline = DocumentSearchPipeline(retriever=retriever)
|
|
|
def semanticsearch(query):
    """Run a BM25 search for *query* and return the top-10 hits.

    Parameters
    ----------
    query : str
        Free-text query passed to the retrieval pipeline.

    Returns
    -------
    pandas.DataFrame
        One row per retrieved document: its metadata columns plus a
        ``score`` column with the retriever's relevance score.
    """
    output = pipeline.run(
        query=query,
        params={"Retriever": {"top_k": 10}},
        debug=False,
    )
    rows = []
    for document in output['documents']:
        # Copy the metadata: mutating document.meta in place would pollute
        # the documents held by the in-memory store across queries.
        row = dict(document.meta)
        row['score'] = document.score
        rows.append(row)
    return pd.DataFrame(rows)
|
|
|
# Gradio UI: pick a candidate text from an alphabetically sorted dropdown
# and display the BM25 hits as a table.
choices = candidats.sort_values(by='text').text.tolist()

demo = gr.Interface(
    fn=semanticsearch,
    inputs=[gr.Dropdown(choices)],
    outputs=[gr.Dataframe()],
)

if __name__ == "__main__":
    demo.launch()