Tom Aarsen committed
Commit fc60470
1 Parent(s): 976214b

Add Sentence Transformers integration with CLS pooling

1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+     "word_embedding_dimension": 768,
+     "pooling_mode_cls_token": true,
+     "pooling_mode_mean_tokens": false,
+     "pooling_mode_max_tokens": false,
+     "pooling_mode_mean_sqrt_len_tokens": false,
+     "pooling_mode_weightedmean_tokens": false,
+     "pooling_mode_lasttoken": false,
+     "include_prompt": true
+ }
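
The pooling config above enables only CLS-token pooling over the 768-dimensional token embeddings, matching the `[CLS]`-based usage in the README. As a rough illustration, assuming the standard `sentence_transformers.models.Pooling` API (and that its `"cls"` shorthand maps onto the flags above), the module can be exercised on a dummy batch:

```python
import torch
from sentence_transformers import models

# CLS-token pooling with 768-dim token embeddings, mirroring 1_Pooling/config.json
# (assumption: the "cls" shorthand sets pooling_mode_cls_token=True and disables the rest)
pooling = models.Pooling(word_embedding_dimension=768, pooling_mode="cls")

# Dummy batch of token embeddings: (batch, tokens, dim)
features = {
    "token_embeddings": torch.randn(2, 16, 768),
    "attention_mask": torch.ones(2, 16, dtype=torch.long),
}
out = pooling(features)

# CLS pooling keeps the first token of each sequence as the sentence embedding
print(out["sentence_embedding"].shape)  # expected: torch.Size([2, 768])
```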
README.md CHANGED
@@ -28,6 +28,30 @@ PubMedNCL: Working with biomedical papers? Try [PubMedNCL](https://huggingface.c
 
 ## How to use the pretrained model
 
+ ### Sentence Transformers
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ # Load the model
+ model = SentenceTransformer("malteos/scincl")
+
+ # Concatenate the title and abstract with the [SEP] token
+ papers = [
+     "BERT [SEP] We introduce a new language representation model called BERT",
+     "Attention is all you need [SEP] The dominant sequence transduction models are based on complex recurrent or convolutional neural networks",
+ ]
+ # Inference
+ embeddings = model.encode(papers)
+
+ # Compute the (cosine) similarity between embeddings
+ similarity = model.similarity(embeddings[0], embeddings[1])
+ print(similarity.item())
+ # => 0.8440517783164978
+ ```
+
+ ### Transformers
+
 ```python
 from transformers import AutoTokenizer, AutoModel
 
@@ -49,6 +73,12 @@ result = model(**inputs)
 
 # take the first token ([CLS] token) in the batch as the embedding
 embeddings = result.last_hidden_state[:, 0, :]
+
+ # calculate the similarity
+ embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+ similarity = (embeddings[0] @ embeddings[1].T)
+ print(similarity.item())
+ # => 0.8440518379211426
 ```
 
 ## Triplet Mining Parameters
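
Both README snippets embed papers with the same CLS-token pooling, so they should yield (near-)identical vectors; the two printed similarities (0.8440517783164978 vs. 0.8440518379211426) differ only by floating-point noise. A minimal sanity check along those lines, assuming both libraries are installed and the `malteos/scincl` checkpoint is reachable:

```python
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

papers = ["BERT [SEP] We introduce a new language representation model called BERT"]

# Path 1: Sentence Transformers (Transformer module + CLS pooling from 1_Pooling/config.json)
st_model = SentenceTransformer("malteos/scincl")
st_emb = st_model.encode(papers, convert_to_tensor=True).cpu()

# Path 2: plain Transformers, taking the [CLS] token embedding by hand
tokenizer = AutoTokenizer.from_pretrained("malteos/scincl")
hf_model = AutoModel.from_pretrained("malteos/scincl")
inputs = tokenizer(papers, padding=True, truncation=True, max_length=512, return_tensors="pt")
with torch.no_grad():
    hf_emb = hf_model(**inputs).last_hidden_state[:, 0, :]

# The two paths should agree up to floating-point noise
print(torch.allclose(st_emb, hf_emb, atol=1e-4))  # expected: True
```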
config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
+ {
+     "__version__": {
+         "sentence_transformers": "3.0.0",
+         "transformers": "4.41.2",
+         "pytorch": "2.3.0+cu121"
+     },
+     "prompts": {},
+     "default_prompt_name": null,
+     "similarity_fn_name": "cosine"
+ }
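
`config_sentence_transformers.json` records the library versions used for the export and sets cosine as the metric behind `model.similarity()` in the README example. A small sketch, assuming the `similarity_fn_name` handling introduced with Sentence Transformers 3.x, of inspecting or overriding that choice:

```python
from sentence_transformers import SentenceTransformer

# The exported config selects cosine similarity for model.similarity()
model = SentenceTransformer("malteos/scincl")
print(model.similarity_fn_name)  # expected: "cosine"

# Assumption: the metric can be swapped at load time, e.g. for raw dot-product scores
dot_model = SentenceTransformer("malteos/scincl", similarity_fn_name="dot")
emb = dot_model.encode([
    "BERT [SEP] We introduce a new language representation model called BERT",
    "Attention is all you need [SEP] The dominant sequence transduction models are based on complex recurrent or convolutional neural networks",
])
print(dot_model.similarity(emb[0], emb[1]).item())
```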
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+     {
+         "idx": 0,
+         "name": "0",
+         "path": "",
+         "type": "sentence_transformers.models.Transformer"
+     },
+     {
+         "idx": 1,
+         "name": "1",
+         "path": "1_Pooling",
+         "type": "sentence_transformers.models.Pooling"
+     }
+ ]
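
`modules.json` declares the two-stage pipeline that `SentenceTransformer("malteos/scincl")` assembles: a Transformer module loaded from the repository root, followed by the Pooling module configured in `1_Pooling`. A rough equivalent built by hand, assuming the standard `sentence_transformers.models` API:

```python
from sentence_transformers import SentenceTransformer, models

# Module 0: the underlying transformer, loaded from the repository root ("path": "")
transformer = models.Transformer("malteos/scincl", max_seq_length=512)

# Module 1: CLS-token pooling, mirroring 1_Pooling/config.json
pooling = models.Pooling(transformer.get_word_embedding_dimension(), pooling_mode="cls")

# Chain the modules in the order listed in modules.json
model = SentenceTransformer(modules=[transformer, pooling])
embeddings = model.encode(["BERT [SEP] We introduce a new language representation model called BERT"])
print(embeddings.shape)  # expected: (1, 768)
```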
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+     "max_seq_length": 512,
+     "do_lower_case": false
+ }
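
`sentence_bert_config.json` caps inputs at 512 tokens (the encoder's maximum) and keeps the tokenizer's casing. Assuming the usual `max_seq_length` attribute, the limit is exposed on the loaded model and can be lowered for faster encoding of short titles:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("malteos/scincl")

# Loaded from sentence_bert_config.json; longer "title [SEP] abstract" strings are truncated here
print(model.max_seq_length)  # expected: 512

# Optionally lower the limit (it cannot usefully exceed the encoder's 512 positions)
model.max_seq_length = 256
```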