Spaces:
Sleeping
Sleeping
import easyknn | |
import gradio as gr | |
import pandas as pd | |
from sentence_transformers import SentenceTransformer | |
# Identifier of the sentence-embedding model used to encode search queries.
EMBEDDING_MODEL_NAME = "jangedoo/all-MiniLM-L6-v2-nepali"
# Relative path of the saved nearest-neighbour index loaded at start-up.
KNN_INDEX_PATH = "./data/knn_index"

# Both resources are loaded once at import time and shared by every request.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
knn = easyknn.EasyKNN.load(KNN_INDEX_PATH)
def search(query: str, k=5):
    """Return the ``k`` indexed articles nearest to ``query``.

    The query is encoded with the module-level ``model`` (normalized
    embedding, returned as a NumPy array) and looked up in the module-level
    ``knn`` index.

    Args:
        query: Free-text search query.
        k: Number of neighbours to request. Coerced to ``int`` because a UI
            slider may hand the value over as a float.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``distance``,
        ``title`` and ``text`` (in that order), one row per neighbour.
        The frame is empty when ``query`` is ``None`` or blank.
    """
    result_columns = ["id", "distance", "title", "text"]

    # A blank query carries no information to embed; return an empty table
    # instead of arbitrary neighbours.
    if query is None or not query.strip():
        return pd.DataFrame(columns=result_columns)

    query_embeddings = model.encode(
        query, normalize_embeddings=True, convert_to_numpy=True
    )
    items, scores = knn.neighbors(query_embeddings, k=int(k))

    df = pd.DataFrame(items)
    # Two decimals are enough for display.
    df["distance"] = scores.round(2)
    # Fix the column order: search_duplicate_news reads "text" by position.
    return df[result_columns]
def search_duplicate_news(evt: gr.SelectData):
    """Find articles similar to the row selected in the results table.

    The text cell of the selected row is cleaned of every ``" ..."``
    occurrence (presumably a truncation marker -- confirm against the index
    build), capped at 1500 characters and used as the query for ``search``
    with ``k=10``.

    Args:
        evt: Selection event supplied by Gradio; ``evt.row_value`` holds the
            cells of the selected row.

    Returns:
        The DataFrame produced by ``search``, or an empty frame with the same
        columns when the selected row has no usable text cell.
    """
    # Position of the "text" column in the frame returned by search().
    text_column = 3

    row = evt.row_value
    has_text = (
        row is not None
        and len(row) > text_column
        and isinstance(row[text_column], str)
    )
    # Selecting a cell of an empty/placeholder row must not crash the
    # handler (short row -> IndexError, non-string cell -> AttributeError).
    if not has_text:
        return pd.DataFrame(columns=["id", "distance", "title", "text"])

    snippet = row[text_column].replace(" ...", "")[:1500]
    if not snippet.strip():
        return pd.DataFrame(columns=["id", "distance", "title", "text"])

    return search(snippet, k=10)
# Page layout and event wiring for the demo.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        ## Demo of [jangedoo/all-MiniLM-L6-v2-nepali](https://huggingface.co/jangedoo/all-MiniLM-L6-v2-nepali) model.
        5,000 [Nepali Wikipedia articles](https://huggingface.co/datasets/wikimedia/wikipedia/viewer/20231101.ne) have been embedded using this model.
        FAISS library is used for similarity search and the embeddings have been quantized to 8bit integers to tradeoff performance vs resource usage.
        You can use **Nepali** as well as **English** for your queries. However, English queries are kind of hit-and-miss.
        """
    )
    gr.Markdown("Enter a search query and select number of docs you want to return")

    # Query text and result count side by side.
    with gr.Row():
        query = gr.Textbox(placeholder="query")
        num_results = gr.Slider(
            minimum=1, maximum=10, value=5, step=1, label="Number of results"
        )

    # Clicking an example fills the query textbox.
    examples = gr.Examples(
        [
            "विद्युत् प्राधिकरण",
            "capital city",
            "विद्यादेवी भण्डारी",
            "सवारी दुर्घटना",
            "वैदेशिक रोजगार",
            "prime minister",
        ],
        query,
    )
    btn = gr.Button("Search")

    # Headers mirror the columns returned by search() so the empty table
    # matches the populated one.
    out = gr.DataFrame(headers=["id", "distance", "title", "text"])
    gr.Markdown(
        "**Select an article above to see similar articles.** content from 'text' is used for similarity search"
    )
    duplicate_news = gr.DataFrame(headers=["id", "distance", "title", "text"])

    # Search button -> primary results; selecting a result row -> similar articles.
    btn.click(fn=search, inputs=[query, num_results], outputs=out)
    out.select(search_duplicate_news, outputs=duplicate_news)

demo.launch(debug=True)