varun4 committed on
Commit
a1f5641
1 Parent(s): f70e4f4

lets save progress

Browse files
models/all-MiniLM-L6-v2-q8/modules.json DELETED
File without changes
models/all-MiniLM-L6-v2-q8/sentence_bert_config.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "max_seq_length": 256,
3
- "do_lower_case": false
4
- }
 
 
 
 
 
ort_sentence_transformers.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from scipy.spatial import distance
3
+
4
+
5
+ device = "mps"
6
+ # Load the models
7
+ # so close, and yet, .! so far ~!~ ~
8
+ providers = [
9
+ ('CoreMLExecutionProvider', {
10
+ 'device_id': 0,
11
+ }),
12
+ 'CPUExecutionProvider',
13
+ ]
14
+
15
+ model1 = SentenceTransformer('./models/optimum/all-MiniLM-L6-v2', device=device,
16
+ model_args={
17
+ "providers": providers
18
+ })
19
+ print("\033[91m", model1.modules(), "\033[0m")
20
+ # model2 = SentenceTransformer('./models/all-MiniLM-L6-v2', device=device)
21
+
22
+ sentences = [
23
+ 'This framework generates embeddings for each input sentence',
24
+ 'Sentences are passed as a list of string.',
25
+ 'The quick brown fox jumps over the lazy dog.'
26
+ ]
27
+
28
+
29
+
30
+ # Get embeddings for each sentence from both models
31
+ embeddings1 = model1.encode(sentences)
32
+ # embeddings2 = model2.encode(sentences)
33
+
34
+
35
+ # Compute and print the cosine similarity for each sentence's embeddings from the two models
36
+ for sentence, emb1, emb2 in zip(sentences, embeddings1, range(3)):
37
+ sim = 1 - distance.cosine(emb1, emb2) # Cosine similarity is the complement of cosine distance
38
+ print(f"Sentence: {sentence}")
39
+ print(f"Cosine Similarity: {sim:.4f}")
40
+ print("")
41
+
42
+ # print(model2.device)
43
+
44
+ # should be working perfectly :))
run_mteb.py CHANGED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import time
5
+
6
+ from mteb import MTEB
7
+ from sentence_transformers import SentenceTransformer
8
+
9
+ logging.basicConfig(level=logging.INFO)
10
+
11
+ logger = logging.getLogger("main")
12
+
13
+ os.environ["HF_DATASETS_OFFLINE"] = "1" # 1 for offline
14
+ os.environ["TRANSFORMERS_OFFLINE"] = "1" # 1 for offline
15
+ os.environ["TRANSFORMERS_CACHE"] = "./transformers_cache/"
16
+ os.environ["HF_DATASETS_CACHE"] = "./hf_datasets_cache/"
17
+ os.environ["HF_MODULES_CACHE"] = "./hf_modules_cache/"
18
+ os.environ["HF_METRICS_CACHE"] = "./hf_metrics_cache/"
19
+ # os.environ["TOKENIZERS_PARALLELISM"] = "false"
20
+
21
+
22
+
23
+ TASK_LIST_CLUSTERING = [
24
+ "ArxivClusteringP2P",
25
+ "ArxivClusteringS2S",
26
+ "BiorxivClusteringP2P",
27
+ "BiorxivClusteringS2S",
28
+ "MedrxivClusteringP2P",
29
+ "MedrxivClusteringS2S",
30
+ "RedditClustering",
31
+ "RedditClusteringP2P",
32
+ "StackExchangeClustering",
33
+ "StackExchangeClusteringP2P",
34
+ "TwentyNewsgroupsClustering",
35
+ ]
36
+
37
+ TASK_LIST_PAIR_CLASSIFICATION = [
38
+ "SprintDuplicateQuestions",
39
+ "TwitterSemEval2015",
40
+ "TwitterURLCorpus",
41
+ ]
42
+
43
+ TASK_LIST = TASK_LIST_CLUSTERING + TASK_LIST_PAIR_CLASSIFICATION
44
+
45
+
46
+ def parse_args():
47
+ # Parse command line arguments
48
+ parser = argparse.ArgumentParser()
49
+ # parser.add_argument("--startid", type=int)
50
+ # parser.add_argument("--endid", type=int)
51
+
52
+ parser.add_argument("--modelpath", type=str, default="./models/")
53
+ parser.add_argument("--lang", type=str, default="en")
54
+ parser.add_argument("--taskname", type=str, default=None)
55
+ parser.add_argument("--batchsize", type=int, default=128)
56
+ parser.add_argument("--device", type=str, default="mps") # sorry :>
57
+ args = parser.parse_args()
58
+ return args
59
+
60
+
61
+ def main(args):
62
+ """
63
+ ex: python run_array.py --modelpath ./models/all-MiniLM-L6-v2
64
+ """
65
+ model = SentenceTransformer(args.modelpath, device=args.device)
66
+ model_name = args.modelpath.split("/")[-1].split("_")[-1]
67
+ if not model_name:
68
+ print(f"Model name is empty. Make sure not to end modelpath with a /")
69
+ return
70
+
71
+ print(f"Running on {model._target_device} with model {model_name}.")
72
+
73
+ for task in TASK_LIST:
74
+ print("Running task: ", task)
75
+ # this args. notation seems anti-pythonic
76
+ evaluation = MTEB(tasks=[task], task_langs=[args.lang])
77
+ retries = 5
78
+ for attempt in range(retries):
79
+ try:
80
+ evaluation.run(model, output_folder=f"results/{model_name}", batch_size=args.batchsize, eval_splits=["test"])
81
+ break
82
+ except ConnectionError:
83
+ if attempt < retries - 1:
84
+ print(f"Connection error occurred during task {task}. Waiting for 1 minute before retrying...")
85
+ time.sleep(60)
86
+ else:
87
+ print(f"Failed to execute task {task} after {retries} attempts due to connection errors.")
88
+
89
+
90
+ if __name__ == "__main__":
91
+ args = parse_args()
92
+ main(args)