Spaces:
Sleeping
Sleeping
File size: 2,366 Bytes
ed813a0 b67d211 ed813a0 b67d211 ed813a0 b67d211 ed813a0 b67d211 ed813a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import easyknn
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer
# Identifier of the sentence-embedding model used to encode search queries.
MODEL_NAME = "jangedoo/all-MiniLM-L6-v2-nepali"
# Location of the previously built nearest-neighbour index.
KNN_INDEX_PATH = "./data/knn_index"

# Both objects are created once at import time and shared by every request.
model = SentenceTransformer(MODEL_NAME)
knn = easyknn.EasyKNN.load(KNN_INDEX_PATH)
def search(query: str, k=5):
    """Return the ``k`` indexed articles most similar to ``query``.

    The query is encoded with the module-level ``model`` (normalised
    embedding, returned as a NumPy array) and looked up in the module-level
    ``knn`` index.

    Args:
        query: Free-text search query.
        k: Number of neighbours to request. Coerced to ``int`` because UI
            number widgets (e.g. a slider) may deliver the value as a float.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``distance``,
        ``title`` and ``text`` (in that order), one row per neighbour.
        ``distance`` holds the scores returned by the index, rounded to two
        decimals. The frame is empty when ``query`` is missing or blank, or
        when the index returns no neighbours.
    """
    columns = ["id", "distance", "title", "text"]

    # Embedding an empty string would still return arbitrary neighbours, so
    # short-circuit instead of showing meaningless results.
    if not query or not query.strip():
        return pd.DataFrame(columns=columns)

    query_embeddings = model.encode(
        query, normalize_embeddings=True, convert_to_numpy=True
    )
    items, scores = knn.neighbors(query_embeddings, k=int(k))

    df = pd.DataFrame(items)
    if df.empty:
        # Selecting the columns below would raise KeyError on an empty frame.
        return pd.DataFrame(columns=columns)

    df["distance"] = scores.round(2)
    return df[columns]
def search_duplicate_news(evt: gr.SelectData):
    """Find articles similar to the row selected in a results table.

    Reads the fourth cell of the selected row (index 3, which is the
    ``text`` column in the frame produced by ``search``), removes every
    occurrence of " ...", keeps at most the first 1500 characters and runs
    ``search`` on that snippet asking for 10 neighbours.
    """
    selected_text = evt.row_value[3]
    cleaned_text = selected_text.replace(" ...", "")
    snippet = cleaned_text[:1500]
    return search(snippet, k=10)
# Gradio UI: a query box with a result-count slider, a results table, and a
# second table that lists articles similar to whichever result row is selected.
with gr.Blocks() as demo:
    gr.Markdown(
        """
## Demo of [jangedoo/all-MiniLM-L6-v2-nepali](https://huggingface.co/jangedoo/all-MiniLM-L6-v2-nepali) model.
5,000 [Nepali Wikipedia articles](https://huggingface.co/datasets/wikimedia/wikipedia/viewer/20231101.ne) have been embedded using this model.
FAISS library is used for similarity search and the embeddings have been quantized to 8bit integers to tradeoff performance vs resource usage.
You can use **Nepali** as well as **English** for your queries. However, English queries are kind of hit-and-miss.
"""
    )
    gr.Markdown("Enter a search query and select number of docs you want to return")

    with gr.Row():
        query = gr.Textbox(placeholder="query")
        num_results = gr.Slider(
            minimum=1, maximum=10, value=5, step=1, label="Number of results"
        )

    # Clicking an example copies it into the query textbox.
    examples = gr.Examples(
        [
            "विद्युत् प्राधिकरण",
            "capital city",
            "विद्यादेवी भण्डारी",
            "सवारी दुर्घटना",
            "वैदेशिक रोजगार",
            "prime minister",
        ],
        query,
    )
    btn = gr.Button("Search")

    # Placeholder headers mirror the columns of the frame returned by
    # `search`, so the empty tables match what is shown after a query.
    out = gr.DataFrame(headers=["id", "distance", "title", "text"])
    gr.Markdown(
        "**Select an article above to see similar articles.** content from 'text' is used for similarity search"
    )
    duplicate_news = gr.DataFrame(headers=["id", "distance", "title", "text"])

    # Search button fills the first table; selecting a row in it fills the second.
    btn.click(fn=search, inputs=[query, num_results], outputs=out)
    out.select(search_duplicate_news, outputs=duplicate_news)

demo.launch(debug=True)
|