import glob
import json
from tokenizing import tokenize_doc, tokenize_doc_to_str, tokenize_text
from rank_bm25 import BM25Okapi
import os
import pickle
import numpy as np
from tqdm import tqdm
import torch
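# Ad-hoc script to sanity-check BM25 retrieval: load the processed corpus,
# load (or optionally rebuild) the pickled BM25 index, and print the
# top-scoring document for a test query.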
docs = []
base_path = "./Data"
bm25_path = os.path.join(base_path, "bm25.pkl")
tokenized_docs_path = os.path.join(base_path, "tokenized_docs.pkl")
# Load every JSON file whose name ends with '_processed'
for path in glob.glob(f"{base_path}/*_processed.json"):
    print(path)
    with open(path, 'r') as f:
        docs.extend(json.load(f))
index = 0
# Debug helper: locate a specific document in the corpus and inspect its tokens.
# for i, doc in enumerate(docs):
#     if 'body' in doc:
#         if doc['body'] == "I don't fuck the sandwich before eating it":
#             tokenized_doc = tokenize_doc(doc)
#             print(tokenized_doc)
#             index = i
# Load the prebuilt BM25 index from disk
with open(bm25_path, 'rb') as f:
    bm25 = pickle.load(f)
# One-time index build (uncomment to rebuild when the corpus changes):
# tokenized_docs = [tokenize_doc(doc) for doc in tqdm(docs, desc="Tokenizing documents")]
# bm25 = BM25Okapi(tokenized_docs)
# with open(tokenized_docs_path, 'wb') as f:
#     pickle.dump(tokenized_docs, f)
# with open(bm25_path, 'wb') as f:
#     pickle.dump(bm25, f)
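# Run a test query against the index.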
message = "tell me a joke about sandwich before eating it"
tokenized_message = tokenize_text(message)
print(tokenized_message)
# Score every document against the query and rank ascending (most relevant last).
scores = torch.tensor(bm25.get_scores(tokenized_message))
sorted_doc_indices = torch.argsort(scores)
# Print the single best match; widen the range to inspect more results.
for i in range(1, 2):
    print("Score:", scores[sorted_doc_indices[-i]])
    print(docs[sorted_doc_indices[-i]])
    print("Doc number:", sorted_doc_indices[-i].item())
# result_docs = [docs[i] for i in sorted_doc_indices[-30:] if scores[i] > 0]
# return result_docs[::-1]  # Reverse so the most relevant documents come first