Spaces:
Sleeping
Sleeping
File size: 698 Bytes
ed813a0 b67d211 ed813a0 b67d211 ed813a0 b67d211 ed813a0 b67d211 ed813a0 b67d211 ed813a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
import datasets
import easyknn
from sentence_transformers import SentenceTransformer
# Build a nearest-neighbour index over Wikipedia article embeddings and
# persist it under ./data/knn_index.

# Open the "20231101.ne" configuration of wikimedia/wikipedia (train split)
# in streaming mode, then materialise only the first 5000 rows as a list.
wiki_stream = datasets.load_dataset(
    "wikimedia/wikipedia",
    "20231101.ne",
    split="train",
    streaming=True,
)
ds = list(wiki_stream.take(5000))

model = SentenceTransformer("jangedoo/all-MiniLM-L6-v2-nepali")

# Columns used below: the article text is what gets embedded, the URL is
# used as each item's key in the index.
texts = [article["text"] for article in ds]
urls = [article["url"] for article in ds]

# Only the first 1500 characters of every article are fed to the encoder.
snippets = [body[:1500] for body in texts]
embeddings = model.encode(
    snippets,
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True,
)

# Register the embeddings together with their keys and the full rows, build
# the FAISS-backed index from the builder, and write it to disk.
builder = easyknn.EmbeddingsIndexBuilder()
builder.add(embeddings=embeddings, item_keys=urls, items=ds)
knn = easyknn.EasyKNN.from_builder_with_faiss(builder=builder)
knn.save("./data/knn_index")
|