import logging
import os
import zipfile

import faiss
import gradio as gr
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# NOTE(review): with the level set to ERROR the logging.info() calls below are
# suppressed; lower the level to INFO if the start-up messages should be shown.
logging.basicConfig(level=logging.ERROR)

# Queries shorter than this many characters do not trigger a search.
MIN_QUERY_LENGTH = 3
# Number of nearest neighbours requested for each UI query.
TOP_K = 2

# Unpack the bundled archive into the working directory (presumably it holds
# "faiss.index" and "faiss.lookup.csv" -- confirm against files.zip).
# NOTE: the existence guard below is disabled, so the archive is re-extracted
# on every start.
# if not os.path.exists("faiss.index"):
with zipfile.ZipFile("files.zip", "r") as z:
    z.extractall()

pr_number = 14
logging.info("Loading embedding model")
model = SentenceTransformer(
    "intfloat/multilingual-e5-small",
    revision=f"refs/pr/{pr_number}",
    backend="openvino",
)


class FaissIndex:
    """Text search over a FAISS index backed by a CSV lookup table.

    Row ``i`` of the lookup table is treated as the document for vector ``i``
    of the index (rows are selected positionally with ``DataFrame.iloc``).
    The table must contain an ``answer`` column.
    """

    def __init__(
        self,
        model: SentenceTransformer,
        data_path: str = "faiss.lookup.csv",
        index_path="faiss.index",
    ):
        """Load the lookup table and the FAISS index from disk.

        Args:
            model: Embedding model used to encode queries.
            data_path: Path of the CSV lookup table.
            index_path: Path of the serialized FAISS index.
        """
        self.model = model
        self.df = pd.read_csv(data_path)
        self.index = faiss.read_index(index_path)

    def _nearest(self, embedding, k=5):
        """Return ``(distances, indices)`` of the ``k`` nearest vectors.

        ``embedding`` is converted to a single float32 row vector before it is
        handed to the index.
        """
        vector = np.array(embedding).astype("float32").reshape(1, -1)
        return self.index.search(vector, k)

    def extract_docs(self, indices, k):
        """Map FAISS result indices to a list of unique answer strings.

        Args:
            indices: Index array as returned by the FAISS ``search`` call;
                only the first row is used.
            k: Unused; kept so existing callers keep working.

        Returns:
            The answers of the matched rows in result order, with duplicate
            answers removed (first occurrence wins).
        """
        # FAISS pads the result with -1 when fewer than k neighbours exist;
        # iloc would interpret -1 as "last row" and return an unrelated answer.
        hits = [int(i) for i in indices[0] if i >= 0]
        answers = self.df.iloc[hits]["answer"].values
        # dict.fromkeys de-duplicates while preserving the ranking order.
        return [f"{answer}" for answer in dict.fromkeys(answers)]

    def search(self, query: str, k: int = 5):
        """Return the unique answers closest to the text ``query``.

        Args:
            query: Free-text search string.
            k: Number of nearest neighbours to request from the index.

        Returns:
            A list of at most ``k`` answer strings.
        """
        # The query text is prefixed with "query: " before encoding
        # (the prefix convention of the E5 model family).
        enc = self.model.encode(["query: " + query])
        _, indices = self._nearest(enc, k)
        return self.extract_docs(indices, k)


logging.info("Loading FAISS index")
index = FaissIndex(model)


def query_faiss_index(søketekst):
    """Query the FAISS index and return the best matches as one string.

    Args:
        søketekst (str): The search text to query the FAISS index with.

    Returns:
        str: Up to ``TOP_K`` unique answers separated by double newlines, or
        an empty string when the search text is missing or shorter than
        ``MIN_QUERY_LENGTH`` characters.
    """
    if søketekst is None or len(søketekst) < MIN_QUERY_LENGTH:
        return ""
    results = index.search(søketekst, k=TOP_K)
    return "\n\n".join(results)


# Alternative gr.Interface layout, kept for reference (disabled):
# iface = gr.Interface(
#     fn=query_faiss_index,
#     inputs=gr.Textbox(lines=2, placeholder="Søk etter info i SIKT", interactive=True, min_width="30vw"),
#     outputs=gr.Textbox(label="Søkeresultater", type="text", lines=20, min_width="70vw"),
#     title="SIKT-FAQ",
#     description="Semantisk søk i SIKT med Openvino.",
#     live=True
# )

with gr.Blocks() as blocks:
    gr.Markdown("## SIKT-FAQ")
    with gr.Row():
        box_search = gr.Textbox(
            label="Søk etter informasjon i SIKT",
            lines=1,
            placeholder="Innlogging i FEIDE...",
            interactive=True,
        )
    with gr.Row():
        box_output = gr.Textbox(label="Søkeresultater", type="text", lines=20)
    # Re-run the search on every change of the search box.
    box_search.change(
        fn=query_faiss_index,
        inputs=box_search,
        outputs=box_output,
        max_batch_size=1,
    )

# Launch the Gradio app
blocks.launch()

# if __name__ == "__main__":
#     iface.launch()