# Spaces: Sleeping  (page-status residue from the hosting site; not part of the script)
from pathlib import Path

import datasets
import easyknn
from sentence_transformers import SentenceTransformer
# Build a nearest-neighbour index over a sample of Nepali Wikipedia articles.
#
# Pipeline: take the first NUM_ARTICLES rows of the streamed dataset, embed a
# leading slice of each article's text with a sentence-transformer model, add
# the vectors to an EasyKNN builder keyed by article URL, build the index via
# `from_builder_with_faiss`, and save it to INDEX_PATH.

NUM_ARTICLES = 5000  # number of rows taken from the head of the stream
MAX_CHARS = 1500  # only this many leading characters of each article are embedded
INDEX_PATH = "./data/knn_index"  # location handed to `knn.save`

# streaming=True iterates the dataset lazily, so only the rows actually taken
# below are read rather than the whole dump being downloaded first.
ds = datasets.load_dataset(
    "wikimedia/wikipedia", "20231101.ne", split="train", streaming=True
)
# Materialise the sample: the rows are iterated twice below and also passed to
# the builder as the stored items.
ds = list(ds.take(NUM_ARTICLES))

model = SentenceTransformer("jangedoo/all-MiniLM-L6-v2-nepali")

texts = [row["text"] for row in ds]
urls = [row["url"] for row in ds]

embeddings = model.encode(
    [text[:MAX_CHARS] for text in texts],
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True,
)

builder = easyknn.EmbeddingsIndexBuilder()
# Each embedding is keyed by its article URL; the full dataset row is stored
# alongside it as the item.
builder.add(embeddings=embeddings, item_keys=urls, items=ds)
knn = easyknn.EasyKNN.from_builder_with_faiss(builder=builder)

# Make sure the parent directory exists before saving. This is a no-op when it
# is already there (or if `save` would create it itself).
Path(INDEX_PATH).parent.mkdir(parents=True, exist_ok=True)
knn.save(INDEX_PATH)