Daniele Licari
committed on
Commit
·
2b977f0
1
Parent(s):
a1e3769
Update README.md
Browse files
README.md
CHANGED
@@ -42,84 +42,10 @@ fill_mask("Il [MASK] ha chiesto revocarsi l'obbligo di pagamento")
|
|
42 |
# {'sequence': "Il lavoratore ha chiesto revocarsi l'obbligo di pagamento",'score': 0.028993653133511543},
|
43 |
# {'sequence': "Il Ministero ha chiesto revocarsi l'obbligo di pagamento", 'score': 0.025297977030277252}]
|
44 |
```
|
45 |
-
Here is how to use it for sentence similarity:
|
46 |
-
```python
|
47 |
-
from transformers import AutoTokenizer, AutoModel
|
48 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
49 |
-
import torch
|
50 |
-
import seaborn as sns
|
51 |
-
import matplotlib.pyplot as plt
|
52 |
-
from textwrap import wrap
|
53 |
-
|
54 |
-
#Mean Pooling - Take attention mask into account for correct averaging
|
55 |
-
def mean_pooling(model_output, attention_mask):
|
56 |
-
token_embeddings = model_output[0] #First element of model_output contains all token embeddings
|
57 |
-
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
|
58 |
-
sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
|
59 |
-
sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
60 |
-
return sum_embeddings / sum_mask
|
61 |
-
|
62 |
-
|
63 |
-
# Getting sentence embeddings
|
64 |
-
def sentence_embeddings(sentences, model_name, max_length=512):
|
65 |
-
# load models
|
66 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
67 |
-
model = AutoModel.from_pretrained(model_name)
|
68 |
-
|
69 |
-
#Tokenize sentences
|
70 |
-
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
|
71 |
-
|
72 |
-
#Compute token embeddings
|
73 |
-
with torch.no_grad():
|
74 |
-
model_output = model(**encoded_input)
|
75 |
-
|
76 |
-
#Perform pooling. In this case, mean pooling
|
77 |
-
return mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
|
78 |
|
|
|
|
|
79 |
|
80 |
-
def plot_similarity(sentences, model_name):
|
81 |
-
# Get sentence embeddings produced by the model
|
82 |
-
embeddings = sentence_embeddings(sentences, model_name)
|
83 |
-
# Compute similarity scores using cosine similarity
|
84 |
-
corr = cosine_similarity(embeddings, embeddings)
|
85 |
-
|
86 |
-
# Plot heatmap similarity
|
87 |
-
sns.set(font_scale=1.2)
|
88 |
-
labels = [ '\n'.join(wrap(l, 40)) for l in sentences] # for text axis labels wrapping
|
89 |
-
g = sns.heatmap(
|
90 |
-
corr,
|
91 |
-
xticklabels=labels,
|
92 |
-
yticklabels=labels,
|
93 |
-
vmax=1,
|
94 |
-
cmap="YlOrRd")
|
95 |
-
g.set_xticklabels(labels, rotation=90)
|
96 |
-
model_short_name = model_name.split('/')[-1]
|
97 |
-
g.set_title(f"Semantic Textual Similarity ({model_short_name})")
|
98 |
-
plt.show()
|
99 |
-
|
100 |
-
# Sentences to be compared
|
101 |
-
sent = [
|
102 |
-
# 1. "The court shall pronounce the judgment for the dissolution or termination of the civil effects of marriage."
|
103 |
-
"Il tribunale pronuncia la sentenza per lo scioglimento o la cessazione degli effetti civili del matrimonio",
|
104 |
-
|
105 |
-
# 2. "having regard to Articles 1, 2, 3 No. 2(b) and 4 Paragraph 13 of Law No. 898 of December 1, 1970, as later amended."
|
106 |
-
# NOTE: Law Dec. 1, 1970 No. 898 is on divorce
|
107 |
-
"visti gli articoli 1, 2, 3 n. 2 lett. b) e 4 comma 13 della legge 1 dicembre 1970 n. 898 e successive modifiche",
|
108 |
-
|
109 |
-
# 3. "The plaintiff has lost the case."
|
110 |
-
"Il ricorrente ha perso la causa"
|
111 |
-
]
|
112 |
-
|
113 |
-
# Perform Semantic Textual Similarity using 'Italian-Legal-BERT'
|
114 |
-
model_name = "dlicari/Italian-Legal-BERT"
|
115 |
-
plot_similarity(sent, model_name)
|
116 |
-
|
117 |
-
# Perform Semantic Textual Similarity using 'bert-base-italian-xxl-cased'
|
118 |
-
model_name = 'dbmdz/bert-base-italian-xxl-cased'
|
119 |
-
plot_similarity(sent, model_name)
|
120 |
-
```
|
121 |
-
The similarity is shown in a heat map. The final graph is a 3x3 matrix in which each entry [i, j] is colored according to the cosine similarity of the encodings for sentences i and j
|
122 |
-
<img src="https://huggingface.co/dlicari/Italian-Legal-BERT/resolve/main/semantic_text_similarity.jpg" width="700"/>
|
123 |
|
124 |
<h2> Citation </h2>
|
125 |
If you find our resource or paper useful, please consider including the following citation in your paper.
|
|
|
42 |
# {'sequence': "Il lavoratore ha chiesto revocarsi l'obbligo di pagamento",'score': 0.028993653133511543},
|
43 |
# {'sequence': "Il Ministero ha chiesto revocarsi l'obbligo di pagamento", 'score': 0.025297977030277252}]
|
44 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
+
In this [COLAB ITALIAN-LEGAL-BERT : Minimal Start for Italian Legal Downstream Tasks](https://colab.research.google.com/drive/1aXOmqr70fjm8lYgIoGJMZDsK0QRIL4Lt?authuser=3#scrollTo=UgRBs1spJ-pG)
|
47 |
+
you can see how to use it for sentence similarity, sentence classification, and named entity recognition.
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
<h2> Citation </h2>
|
51 |
If you find our resource or paper useful, please consider including the following citation in your paper.
|