Sanjaya Subedi
update easyknn and demo article
b67d211
import easyknn
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used by search() to encode query text.
# Loaded once at import time so every request reuses the same instance.
model = SentenceTransformer("jangedoo/all-MiniLM-L6-v2-nepali")
# Nearest-neighbour index loaded from disk; search() queries it via
# knn.neighbors(...).
knn = easyknn.EasyKNN.load("./data/knn_index")
def search(query: str, k=5):
    """Return the ``k`` indexed articles most similar to ``query``.

    The query is encoded with the module-level ``model`` (with
    ``normalize_embeddings=True`` and ``convert_to_numpy=True``) and looked up
    in the module-level ``knn`` index.

    Args:
        query: Free-text search string.
        k: Number of neighbours to return. Coerced to ``int`` because UI
            widgets such as a slider may deliver the value as a float.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``distance``,
        ``title`` and ``text`` (in that order). ``distance`` holds the scores
        reported by the index, rounded to two decimals. A blank or
        whitespace-only query yields an empty frame with the same columns
        instead of embedding an empty string.
    """
    columns = ["id", "distance", "title", "text"]

    # Nothing meaningful to embed: return an empty, correctly-shaped table.
    if not query or not query.strip():
        return pd.DataFrame(columns=columns)

    query_embeddings = model.encode(
        query, normalize_embeddings=True, convert_to_numpy=True
    )
    items, scores = knn.neighbors(query_embeddings, k=int(k))

    df = pd.DataFrame(items)
    df["distance"] = scores.round(2)
    # Fix the column order; callers index rows positionally (text is last).
    return df[columns]
def search_duplicate_news(evt: gr.SelectData):
    """Look up articles similar to the row selected in the results table.

    Reads the cell at index 3 of the selected row (the ``text`` column in the
    order produced by ``search``), strips every ``" ..."`` occurrence, keeps
    at most the first 1500 characters and runs ``search`` with ``k=10``.

    Args:
        evt: Selection event supplied by Gradio; ``evt.row_value`` holds the
            values of the selected row.

    Returns:
        Whatever ``search`` returns for the cleaned text.
    """
    selected_text = evt.row_value[3]
    without_ellipsis = selected_text.replace(" ...", "")
    snippet = without_ellipsis[:1500]
    return search(snippet, k=10)
# Build the demo UI: a query box with a result-count slider, a results table,
# and a second table listing articles similar to whichever result row the
# user selects.
with gr.Blocks() as demo:
    # Column headers match the frame returned by search(); the selection
    # handler reads the "text" cell by position (index 3), so the declared
    # headers are kept in the same order.
    result_headers = ["id", "distance", "title", "text"]

    gr.Markdown(
        """
## Demo of [jangedoo/all-MiniLM-L6-v2-nepali](https://huggingface.co/jangedoo/all-MiniLM-L6-v2-nepali) model.
5,000 [Nepali Wikipedia articles](https://huggingface.co/datasets/wikimedia/wikipedia/viewer/20231101.ne) have been embedded using this model.
FAISS library is used for similarity search and the embeddings have been quantized to 8bit integers to tradeoff performance vs resource usage.
You can use **Nepali** as well as **English** for your queries. However, English queries are kind of hit-and-miss.
"""
    )
    gr.Markdown("Enter a search query and select number of docs you want to return")

    # Query text and number of results side by side.
    with gr.Row():
        query = gr.Textbox(placeholder="query")
        num_results = gr.Slider(
            minimum=1, maximum=10, value=5, step=1, label="Number of results"
        )

    # Clickable sample queries that populate the query textbox.
    examples = gr.Examples(
        [
            "विद्युत् प्राधिकरण",
            "capital city",
            "विद्यादेवी भण्डारी",
            "सवारी दुर्घटना",
            "वैदेशिक रोजगार",
            "prime minister",
        ],
        query,
    )

    btn = gr.Button("Search")
    out = gr.DataFrame(headers=result_headers)
    gr.Markdown(
        "**Select an article above to see similar articles.** content from 'text' is used for similarity search"
    )
    duplicate_news = gr.DataFrame(headers=result_headers)

    # Wire events: the button runs the search; selecting a result row runs a
    # follow-up search based on that row's text.
    btn.click(fn=search, inputs=[query, num_results], outputs=out)
    out.select(search_duplicate_news, outputs=duplicate_news)

demo.launch(debug=True)