Daniele Licari committed
Commit 2b977f0 · 1 Parent(s): a1e3769

Update README.md

Files changed (1)
  1. README.md +2 -76
README.md CHANGED
@@ -42,84 +42,10 @@ fill_mask("Il [MASK] ha chiesto revocarsi l'obbligo di pagamento")
  # {'sequence': "Il lavoratore ha chiesto revocarsi l'obbligo di pagamento",'score': 0.028993653133511543},
  # {'sequence': "Il Ministero ha chiesto revocarsi l'obbligo di pagamento", 'score': 0.025297977030277252}]
  ```
- here is how to use it for sentence similarity
- ```python
- from transformers import AutoTokenizer, AutoModel
- from sklearn.metrics.pairwise import cosine_similarity
- import torch
- import seaborn as sns
- import matplotlib.pyplot as plt
- from textwrap import wrap
-
- # Mean Pooling - take the attention mask into account for correct averaging
- def mean_pooling(model_output, attention_mask):
-     token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
-     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-     sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
-     sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-     return sum_embeddings / sum_mask
-
-
- # getting sentence embeddings
- def sentence_embeddings(sentences, model_name, max_length=512):
-     # load the tokenizer and the model
-     tokenizer = AutoTokenizer.from_pretrained(model_name)
-     model = AutoModel.from_pretrained(model_name)
-
-     # tokenize sentences
-     encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
-
-     # compute token embeddings
-     with torch.no_grad():
-         model_output = model(**encoded_input)
-
-     # perform pooling (mean pooling, in this case)
-     return mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
-
-
- def plot_similarity(sentences, model_name):
-     # get the sentence embeddings produced by the model
-     embeddings = sentence_embeddings(sentences, model_name)
-     # compute pairwise similarity scores using cosine similarity
-     corr = cosine_similarity(embeddings, embeddings)
-
-     # plot the similarity matrix as a heatmap
-     sns.set(font_scale=1.2)
-     labels = ['\n'.join(wrap(l, 40)) for l in sentences]  # wrap long sentences for the axis labels
-     g = sns.heatmap(
-         corr,
-         xticklabels=labels,
-         yticklabels=labels,
-         vmax=1,
-         cmap="YlOrRd")
-     g.set_xticklabels(labels, rotation=90)
-     model_short_name = model_name.split('/')[-1]
-     g.set_title(f"Semantic Textual Similarity ({model_short_name})")
-     plt.show()
-
- # Sentences to be compared
- sent = [
-     # 1. "The court shall pronounce the judgment for the dissolution or termination of the civil effects of marriage."
-     "Il tribunale pronuncia la sentenza per lo scioglimento o la cessazione degli effetti civili del matrimonio",
-
-     # 2. "having regard to Articles 1, 2, 3 No. 2(b) and 4 Paragraph 13 of Law No. 898 of December 1, 1970, as later amended."
-     # NOTE: Law Dec. 1, 1970 No. 898 is on divorce
-     "visti gli articoli 1, 2, 3 n. 2 lett. b) e 4 comma 13 della legge 1 dicembre 1970 n. 898 e successive modifiche",
-
-     # 3. "The plaintiff has lost the case."
-     "Il ricorrente ha perso la causa"
- ]
-
- # Perform Semantic Textual Similarity using 'Italian-Legal-BERT'
- model_name = "dlicari/Italian-Legal-BERT"
- plot_similarity(sent, model_name)
-
- # Perform Semantic Textual Similarity using 'bert-base-italian-xxl-cased'
- model_name = 'dbmdz/bert-base-italian-xxl-cased'
- plot_similarity(sent, model_name)
- ```
- The similarity is shown in a heat map. The final graph is a 3x3 matrix in which each entry [i, j] is colored according to the cosine similarity of the embeddings of sentences i and j.
- <img src="https://huggingface.co/dlicari/Italian-Legal-BERT/resolve/main/semantic_text_similarity.jpg" width="700"/>
+ In this [COLAB ITALIAN-LEGAL-BERT: Minimal Start for Italian Legal Downstream Tasks](https://colab.research.google.com/drive/1aXOmqr70fjm8lYgIoGJMZDsK0QRIL4Lt?authuser=3#scrollTo=UgRBs1spJ-pG)
+ you can see how to use it for sentence similarity, sentence classification and named entity recognition.
 
  <h2> Citation </h2>
  If you find our resource or paper is useful, please consider including the following citation in your paper.
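
As a rough illustration of the downstream usage the linked Colab covers (it is not part of the commit above), here is a minimal sketch of loading ITALIAN-LEGAL-BERT with a sequence-classification head. The `num_labels`, label names, and example sentence are placeholder assumptions; the classification head is newly initialized, so it must be fine-tuned on labeled data before its predictions are meaningful.

```python
# Hypothetical sketch: ITALIAN-LEGAL-BERT as a sentence classifier.
# Labels below are placeholders; the head is randomly initialized and needs fine-tuning.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "dlicari/Italian-Legal-BERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,                                 # assumed binary task
    id2label={0: "NOT_RELEVANT", 1: "RELEVANT"},  # placeholder label names
)

# Encode a sample sentence (reused from the similarity example) and classify it
inputs = tokenizer(
    "Il ricorrente ha perso la causa",
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[int(logits.argmax(dim=-1))])
```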