# nepali-minilm-demo / create_index.py
# Sanjaya Subedi
# update easyknn and demo article
# b67d211
# (repository page residue, not part of the script: raw / history blame / 698 Bytes)
import datasets
import easyknn
from sentence_transformers import SentenceTransformer
# Build and persist a nearest-neighbour index over sentence embeddings of
# Wikipedia articles (the "20231101.ne" configuration of wikimedia/wikipedia).

MAX_ARTICLES = 5000  # number of articles pulled from the stream
MAX_CHARS = 1500  # each article is cut to this many characters before encoding

# Open the dataset in streaming mode, then materialise only the first
# MAX_ARTICLES rows as a plain list.
article_stream = datasets.load_dataset(
    "wikimedia/wikipedia",
    "20231101.ne",
    split="train",
    streaming=True,
)
ds = list(article_stream.take(MAX_ARTICLES))

model = SentenceTransformer("jangedoo/all-MiniLM-L6-v2-nepali")

# Parallel lists: texts[i] and urls[i] come from the same row ds[i].
texts = [article["text"] for article in ds]
urls = [article["url"] for article in ds]

# Only the leading MAX_CHARS characters of each article are embedded.
truncated_texts = [body[:MAX_CHARS] for body in texts]
embeddings = model.encode(
    truncated_texts,
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True,
)

# Register each embedding under its article URL, with the full dataset row
# attached as the item, then build the index and write it to disk.
# NOTE(review): presumably the "./data" directory must already exist —
# confirm against easyknn's save() behaviour.
builder = easyknn.EmbeddingsIndexBuilder()
builder.add(embeddings=embeddings, item_keys=urls, items=ds)
knn = easyknn.EasyKNN.from_builder_with_faiss(builder=builder)
knn.save("./data/knn_index")