let's save progress
models/all-MiniLM-L6-v2-q8/modules.json
DELETED
File without changes
models/all-MiniLM-L6-v2-q8/sentence_bert_config.json
DELETED
@@ -1,4 +0,0 @@
-{
-  "max_seq_length": 256,
-  "do_lower_case": false
-}
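Removing sentence_bert_config.json drops the stored sequence-length and casing settings for the quantized checkpoint. If the same cap is still wanted, it can be reapplied at load time; a minimal sketch, assuming the directory still loads as a standard sentence-transformers checkpoint:

from sentence_transformers import SentenceTransformer

# Reapply the limit the deleted config carried ("max_seq_length": 256).
model = SentenceTransformer("./models/all-MiniLM-L6-v2-q8")
model.max_seq_length = 256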
ort_sentence_transformers.py
ADDED
@@ -0,0 +1,44 @@
+from sentence_transformers import SentenceTransformer
+from scipy.spatial import distance
+
+
+device = "mps"
+# Load the models
+# so close, and yet so far
+providers = [
+    ('CoreMLExecutionProvider', {
+        'device_id': 0,
+    }),
+    'CPUExecutionProvider',
+]
+
+model1 = SentenceTransformer('./models/optimum/all-MiniLM-L6-v2', device=device,
+                             model_args={
+                                 "providers": providers
+                             })
+print("\033[91m", list(model1.modules()), "\033[0m")
+model2 = SentenceTransformer('./models/all-MiniLM-L6-v2', device=device)
+
+sentences = [
+    'This framework generates embeddings for each input sentence',
+    'Sentences are passed as a list of string.',
+    'The quick brown fox jumps over the lazy dog.'
+]
+
+
+
+# Get embeddings for each sentence from both models
+embeddings1 = model1.encode(sentences)
+embeddings2 = model2.encode(sentences)
+
+
+# Compute and print the cosine similarity for each sentence's embeddings from the two models
+for sentence, emb1, emb2 in zip(sentences, embeddings1, embeddings2):
+    sim = 1 - distance.cosine(emb1, emb2)  # cosine similarity is the complement of cosine distance
+    print(f"Sentence: {sentence}")
+    print(f"Cosine Similarity: {sim:.4f}")
+    print("")
+
+# print(model2.device)
+
+# should be working perfectly :))
run_mteb.py
CHANGED
@@ -0,0 +1,92 @@
+import argparse
+import logging
+import os
+import time
+
+from mteb import MTEB
+from sentence_transformers import SentenceTransformer
+
+logging.basicConfig(level=logging.INFO)
+
+logger = logging.getLogger("main")
+
+os.environ["HF_DATASETS_OFFLINE"] = "1"  # 1 for offline
+os.environ["TRANSFORMERS_OFFLINE"] = "1"  # 1 for offline
+os.environ["TRANSFORMERS_CACHE"] = "./transformers_cache/"
+os.environ["HF_DATASETS_CACHE"] = "./hf_datasets_cache/"
+os.environ["HF_MODULES_CACHE"] = "./hf_modules_cache/"
+os.environ["HF_METRICS_CACHE"] = "./hf_metrics_cache/"
+# os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+
+TASK_LIST_CLUSTERING = [
+    "ArxivClusteringP2P",
+    "ArxivClusteringS2S",
+    "BiorxivClusteringP2P",
+    "BiorxivClusteringS2S",
+    "MedrxivClusteringP2P",
+    "MedrxivClusteringS2S",
+    "RedditClustering",
+    "RedditClusteringP2P",
+    "StackExchangeClustering",
+    "StackExchangeClusteringP2P",
+    "TwentyNewsgroupsClustering",
+]
+
+TASK_LIST_PAIR_CLASSIFICATION = [
+    "SprintDuplicateQuestions",
+    "TwitterSemEval2015",
+    "TwitterURLCorpus",
+]
+
+TASK_LIST = TASK_LIST_CLUSTERING + TASK_LIST_PAIR_CLASSIFICATION
+
+
+def parse_args():
+    # Parse command line arguments
+    parser = argparse.ArgumentParser()
+    # parser.add_argument("--startid", type=int)
+    # parser.add_argument("--endid", type=int)
+
+    parser.add_argument("--modelpath", type=str, default="./models/")
+    parser.add_argument("--lang", type=str, default="en")
+    parser.add_argument("--taskname", type=str, default=None)
+    parser.add_argument("--batchsize", type=int, default=128)
+    parser.add_argument("--device", type=str, default="mps")  # sorry :>
+    args = parser.parse_args()
+    return args
+
+
+def main(args):
+    """
+    ex: python run_mteb.py --modelpath ./models/all-MiniLM-L6-v2
+    """
+    model = SentenceTransformer(args.modelpath, device=args.device)
+    model_name = args.modelpath.split("/")[-1].split("_")[-1]
+    if not model_name:
+        print("Model name is empty. Make sure not to end modelpath with a /.")
+        return
+
+    print(f"Running on {model._target_device} with model {model_name}.")
+
+    for task in TASK_LIST:
+        print("Running task: ", task)
+        # this args. notation seems anti-pythonic
+        evaluation = MTEB(tasks=[task], task_langs=[args.lang])
+        retries = 5
+        for attempt in range(retries):
+            try:
+                evaluation.run(model, output_folder=f"results/{model_name}", batch_size=args.batchsize, eval_splits=["test"])
+                break
+            except ConnectionError:
+                if attempt < retries - 1:
+                    print(f"Connection error occurred during task {task}. Waiting for 1 minute before retrying...")
+                    time.sleep(60)
+                else:
+                    print(f"Failed to execute task {task} after {retries} attempts due to connection errors.")
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
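Why the script warns about a trailing slash: model_name is the last path segment of --modelpath, so ending the path with "/" leaves it empty. A quick check of the same expression, plus a sample invocation using the flags the parser defines (the model path is an assumption):

# model_name derivation from --modelpath, as in main() above
print("./models/all-MiniLM-L6-v2".split("/")[-1].split("_")[-1])   # all-MiniLM-L6-v2
print("./models/all-MiniLM-L6-v2/".split("/")[-1].split("_")[-1])  # empty string

# Example invocation:
# python run_mteb.py --modelpath ./models/all-MiniLM-L6-v2 --device mps --batchsize 128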