Fix various snippets; add required safe_serialization
#2 - opened by tomaarsen (HF staff)
- README.md +3 -3
- sentence_bert_config.json +4 -1
README.md
CHANGED
@@ -2675,9 +2675,9 @@ from sentence_transformers import SentenceTransformer
|
|
2675 |
|
2676 |
matryoshka_dim = 512
|
2677 |
|
2678 |
-
model = SentenceTransformer(".", trust_remote_code=True)
|
2679 |
sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
|
2680 |
-
embeddings = model.encode(sentences)
|
2681 |
embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
|
2682 |
embeddings = embeddings[:, :matryoshka_dim]
|
2683 |
embeddings = F.normalize(embeddings, p=2, dim=1)
|
@@ -2699,7 +2699,7 @@ def mean_pooling(model_output, attention_mask):
|
|
2699 |
sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
|
2700 |
|
2701 |
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
2702 |
-
model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1', trust_remote_code=True)
|
2703 |
model.eval()
|
2704 |
|
2705 |
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
|
|
2675 |
|
2676 |
matryoshka_dim = 512
|
2677 |
|
2678 |
+
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
|
2679 |
sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
|
2680 |
+
embeddings = model.encode(sentences, convert_to_tensor=True)
|
2681 |
embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
|
2682 |
embeddings = embeddings[:, :matryoshka_dim]
|
2683 |
embeddings = F.normalize(embeddings, p=2, dim=1)
|
|
|
2699 |
sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
|
2700 |
|
2701 |
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
2702 |
+
model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True, safe_serialization=True)
|
2703 |
model.eval()
|
2704 |
|
2705 |
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
sentence_bert_config.json
CHANGED
@@ -1,4 +1,7 @@
|
|
1 |
{
|
2 |
"max_seq_length": 8192,
|
3 |
-
"do_lower_case": false
|
|
|
|
|
|
|
4 |
}
|
|
|
1 |
{
|
2 |
"max_seq_length": 8192,
|
3 |
+
"do_lower_case": false,
|
4 |
+
"model_args": {
|
5 |
+
"safe_serialization": true
|
6 |
+
}
|
7 |
}
|