Update embeddingModel/README.md

embeddingModel/README.md (+1, -49)
@@ -2634,57 +2634,9 @@ We compared the performance of the GTE models with other popular text embedding
 | [sentence-t5-base](https://huggingface.co/sentence-transformers/sentence-t5-base) | 0.22 | 768 | 512 | 55.27 | 40.21 | 85.18 | 53.09 | 33.63 | 81.14 | 31.39 | 69.81 |
 
 
-##
 
-Code example
 
-```python
-import torch.nn.functional as F
-from torch import Tensor
-from transformers import AutoTokenizer, AutoModel
-
-def average_pool(last_hidden_states: Tensor,
-                 attention_mask: Tensor) -> Tensor:
-    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
-    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
-
-input_texts = [
-    "what is the capital of China?",
-    "how to implement quick sort in python?",
-    "Beijing",
-    "sorting algorithms"
-]
-
-tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-base")
-model = AutoModel.from_pretrained("thenlper/gte-base")
-
-# Tokenize the input texts
-batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
-
-outputs = model(**batch_dict)
-embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
-
-# (Optionally) normalize embeddings
-embeddings = F.normalize(embeddings, p=2, dim=1)
-scores = (embeddings[:1] @ embeddings[1:].T) * 100
-print(scores.tolist())
-```
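Since the embeddings are L2-normalized just before scoring, the `embeddings[:1] @ embeddings[1:].T` line in the removed snippet computes cosine similarity scaled by 100, so the query should score highest against "Beijing". A minimal sketch of that equivalence, using random stand-in tensors rather than real model outputs (shapes and variable names here are mine, not from the model card):

```python
import torch
import torch.nn.functional as F

# Stand-in embeddings (batch of 4, GTE-base width 768); in the removed
# snippet these would come from average_pool over the last hidden state.
embeddings = torch.randn(4, 768)

# Route 1: normalize, then score with a matrix product, as the card does.
normed = F.normalize(embeddings, p=2, dim=1)
scores = (normed[:1] @ normed[1:].T) * 100  # shape (1, 3)

# Route 2: cosine similarity computed directly gives the same numbers.
cosine = F.cosine_similarity(embeddings[:1], embeddings[1:], dim=1) * 100  # shape (3,)
print(torch.allclose(scores.squeeze(0), cosine, atol=1e-5))  # True
```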
-
-Use with sentence-transformers:
-```python
-from sentence_transformers import SentenceTransformer
-from sentence_transformers.util import cos_sim
-
-sentences = ['That is a happy person', 'That is a very happy person']
-
-model = SentenceTransformer('thenlper/gte-base')
-embeddings = model.encode(sentences)
-print(cos_sim(embeddings[0], embeddings[1]))
-```
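As an aside on the sentence-transformers path above: `encode` also accepts a `normalize_embeddings=True` flag that returns unit-length vectors, after which a plain dot product reproduces `cos_sim`. A short sketch under that assumption, reusing the same checkpoint (the query/passage split is mine, not from the card):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('thenlper/gte-base')

queries = ['what is the capital of China?']
passages = ['Beijing', 'sorting algorithms']

# normalize_embeddings=True L2-normalizes the outputs, so the dot
# product below already equals the cosine similarity.
q = model.encode(queries, normalize_embeddings=True)
p = model.encode(passages, normalize_embeddings=True)
print(q @ p.T)  # the matching passage scores highest
```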
-
-### Limitation
-
-This model exclusively caters to English texts, and any lengthy texts will be truncated to a maximum of 512 tokens.
+##
 
 ### Citation
 
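The removed Limitation paragraph names two practical constraints: English-only input, and truncation at 512 tokens. A minimal pre-flight check for the length limit; the `fits_context` helper is hypothetical, only the tokenizer and the 512 threshold come from the card:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-base")

def fits_context(text: str, max_length: int = 512) -> bool:
    # Count tokens the way the model will see them, special tokens included.
    return len(tokenizer(text)["input_ids"]) <= max_length

print(fits_context("what is the capital of China?"))  # True
print(fits_context("word " * 600))                    # False: would be truncated
```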