File size: 698 Bytes
ed813a0
 
 
 
 
b67d211
ed813a0
b67d211
ed813a0
 
b67d211
 
 
ed813a0
b67d211
 
 
 
ed813a0
 
 
b67d211
ed813a0
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import datasets
import easyknn
from sentence_transformers import SentenceTransformer

# Build a KNN index over a sample of Nepali Wikipedia articles and save it.

# Open the "20231101.ne" config of wikimedia/wikipedia in streaming mode.
wiki_stream = datasets.load_dataset(
    "wikimedia/wikipedia",
    "20231101.ne",
    split="train",
    streaming=True,
)
# Keep only the first 5000 rows, held in a list because they are iterated
# several times below (texts, urls, and as the indexed items themselves).
ds = list(wiki_stream.take(5000))

# Column-wise views of the sampled rows; index i refers to the same article
# in `ds`, `texts` and `urls`.
texts = [article["text"] for article in ds]
urls = [article["url"] for article in ds]

# Only the first 1500 characters of each article are fed to the encoder.
snippets = [body[:1500] for body in texts]

model = SentenceTransformer("jangedoo/all-MiniLM-L6-v2-nepali")
# Request normalized embeddings returned as a NumPy array.
embeddings = model.encode(
    snippets,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

# Register every embedding under its article URL, with the full row attached
# as the item, then build the index via `from_builder_with_faiss`.
builder = easyknn.EmbeddingsIndexBuilder()
builder.add(
    embeddings=embeddings,
    item_keys=urls,
    items=ds,
)
knn = easyknn.EasyKNN.from_builder_with_faiss(builder=builder)

# Write the finished index to disk.
knn.save("./data/knn_index")