"""Gradio demo: semantic search over embedded Nepali Wikipedia articles.

A query is embedded with the ``jangedoo/all-MiniLM-L6-v2-nepali``
sentence-transformers model and matched against a pre-built index loaded
from ``./data/knn_index``. Selecting a row in the result table runs a second
search that uses the selected article's text as the query.
"""

import easyknn
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer

# Columns of every result table, in display order. ``search`` builds its
# output from this list and ``search_duplicate_news`` uses it to locate the
# article text in a selected row, so the two can never drift apart.
RESULT_COLUMNS = ["id", "distance", "title", "text"]
TEXT_COLUMN_INDEX = RESULT_COLUMNS.index("text")

# Upper bound on the number of characters of article text used as a query
# when looking for similar articles.
MAX_SIMILAR_QUERY_CHARS = 1500

model = SentenceTransformer("jangedoo/all-MiniLM-L6-v2-nepali")
knn = easyknn.EasyKNN.load("./data/knn_index")


def search(query: str, k: int = 5) -> pd.DataFrame:
    """Return the ``k`` indexed articles closest to ``query``.

    Args:
        query: Free-text search query.
        k: Number of results to return. Coerced to ``int`` because the value
            comes from a UI slider and may not arrive as a plain integer.

    Returns:
        A DataFrame with the columns in ``RESULT_COLUMNS``. ``distance`` holds
        the score reported by the index, rounded to two decimals. A blank
        query yields an empty DataFrame with the same columns.
    """
    # Embedding an empty string would still return k arbitrary neighbours;
    # show an empty table instead.
    if not query or not query.strip():
        return pd.DataFrame(columns=RESULT_COLUMNS)

    query_embeddings = model.encode(
        query, normalize_embeddings=True, convert_to_numpy=True
    )
    items, scores = knn.neighbors(query_embeddings, k=int(k))

    df = pd.DataFrame(items)
    # NOTE(review): assumes ``scores`` is array-like with a ``round`` method
    # (e.g. a NumPy array) -- confirm against easyknn.
    df["distance"] = scores.round(2)
    return df[RESULT_COLUMNS]


def search_duplicate_news(evt: gr.SelectData) -> pd.DataFrame:
    """Find articles similar to the row selected in the results table.

    The text cell of the selected row is stripped of every ``" ..."``
    occurrence, cut to ``MAX_SIMILAR_QUERY_CHARS`` characters and used as the
    query for a 10-result search.

    Args:
        evt: Gradio selection event; ``evt.row_value`` is read as the list of
            cell values of the selected row, ordered as ``RESULT_COLUMNS``.

    Returns:
        The DataFrame produced by ``search`` for the selected article's text.
    """
    text = evt.row_value[TEXT_COLUMN_INDEX]
    # NOTE(review): " ..." is presumably a truncation marker present in the
    # indexed text -- confirm against the indexing script.
    text = text.replace(" ...", "")[:MAX_SIMILAR_QUERY_CHARS]
    return search(text, k=10)


with gr.Blocks() as demo:
    gr.Markdown(
        """
    ## Demo of [jangedoo/all-MiniLM-L6-v2-nepali](https://huggingface.co/jangedoo/all-MiniLM-L6-v2-nepali) model.
    5,000 [Nepali Wikipedia articles](https://huggingface.co/datasets/wikimedia/wikipedia/viewer/20231101.ne) have been embedded using this model.
    FAISS library is used for similarity search and the embeddings have been quantized to 8bit integers to tradeoff performance vs resource usage.

    You can use **Nepali** as well as **English** for your queries. However, English queries are kind of hit-and-miss.
    """
    )
    gr.Markdown("Enter a search query and select number of docs you want to return")

    with gr.Row():
        query = gr.Textbox(placeholder="query")
        num_results = gr.Slider(
            minimum=1, maximum=10, value=5, step=1, label="Number of results"
        )

    # Clickable sample queries that fill the query textbox.
    examples = gr.Examples(
        [
            "विद्युत् प्राधिकरण",
            "capital city",
            "विद्यादेवी भण्डारी",
            "सवारी दुर्घटना",
            "वैदेशिक रोजगार",
            "prime minister",
        ],
        query,
    )

    btn = gr.Button("Search")
    out = gr.DataFrame(headers=["article", "distance"])

    gr.Markdown(
        "**Select an article above to see similar articles.** content from 'text' is used for similarity search"
    )
    duplicate_news = gr.DataFrame(headers=["article", "distance"])

    # Event wiring: the button runs the search; selecting a result row runs a
    # follow-up search for articles similar to the selected one.
    btn.click(fn=search, inputs=[query, num_results], outputs=out)
    out.select(search_duplicate_news, outputs=duplicate_news)

demo.launch(debug=True)