"""BM25 search over cleaned file paths, exposed as a Gradio dropdown-to-table UI.

Loads pre-built parquet files (df.parquet, candidats.parquet, df2.parquet),
indexes the cleaned file names into an in-memory BM25 store, and serves a
search interface returning the top-10 matches with their scores.
"""

import gradio as gr
import pandas as pd
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever
from haystack.pipelines import DocumentSearchPipeline

df = pd.read_parquet('df.parquet')
dirname = 'lot3'

# Normalize raw file paths into searchable text:
# strip the leading ".../lot3/<dir>/" prefix, drop parentheses, drop the
# trailing filename component, turn separators into spaces, and collapse
# zero-padded numbers (" 007" -> " 7").
# NOTE: regex=True is required — pandas >= 1.4 defaults str.replace to
# literal matching, which would silently break these patterns.
df['fileclean'] = (
    df.file.str.replace(f'.*{dirname}/[^/]+/', '', regex=True)
    .str.replace(r'[\(\)]', '', regex=True)
    .str.replace('/[^/]+$', '', regex=True)
    .str.replace('/', ' ', regex=False)
    .str.replace('-', ' ', regex=False)
    .str.replace(' 0+', ' ', regex=True)
)

candidats = pd.read_parquet('candidats.parquet')
df2 = pd.read_parquet('df2.parquet')

# Cast the shared join columns to str on both sides so the merge keys match
# regardless of the dtypes stored in the parquet files.
for c in df2.columns:
    candidats[c] = candidats[c].astype(str)
    df2[c] = df2[c].astype(str)
candidats = candidats.merge(df2)

# Index one document per distinct cleaned path; the cleaned text becomes the
# BM25 "content" field, all other columns travel along as document metadata.
document_store = InMemoryDocumentStore(use_bm25=True)
docs = (
    df.drop_duplicates(subset=['fileclean'])
    .rename(columns={'fileclean': 'content'})
    .to_dict(orient='records')
)
document_store.write_documents(docs)

retriever = BM25Retriever(document_store=document_store)
pipeline = DocumentSearchPipeline(retriever=retriever)


def semanticsearch(query):
    """Run the BM25 pipeline on *query* and return the top-10 hits.

    Returns a DataFrame with one row per retrieved document: its metadata
    columns plus a 'score' column.
    """
    prediction = pipeline.run(
        query=query,
        params={"Retriever": {"top_k": 10}},
        debug=False,
    )
    rows = []
    for document in prediction['documents']:
        # Copy the metadata so we never mutate the document stored in the
        # index (the original code wrote 'score' into document.meta itself).
        row = dict(document.meta)
        row['score'] = document.score
        rows.append(row)
    return pd.DataFrame(rows)


demo = gr.Interface(
    semanticsearch,
    [
        gr.Dropdown(candidats.sort_values(by='text').text.tolist()),
    ],
    [gr.Dataframe()],
)

if __name__ == "__main__":
    demo.launch()