Aidan Phillips committed
Commit d7d2fdd · Parent(s): 36599ed

accuracy scores much much better

Files changed: categories/accuracy.py (+9 -19)

categories/accuracy.py CHANGED
@@ -5,13 +5,15 @@ import numpy as np
 from scipy.spatial.distance import cosine
 from simalign import SentenceAligner
 from transformers import AutoModel, AutoTokenizer
+from laser_encoders import LaserEncoderPipeline
 
 # setup global variables on import (bad practice, but whatever)
 # --------------------------------------------------------------
 
 aligner = SentenceAligner(model="distilbert-base-multilingual-cased", layer=6)
-
-
+
+de_encoder = LaserEncoderPipeline(lang="deu_Latn")
+en_encoder = LaserEncoderPipeline(lang="eng_Latn")
 
 
 def accuracy(src_sentence: str, trg_sentence: str) -> dict:
@@ -66,23 +68,11 @@ def __get_bertscore(src_sentence: str, trg_sentence: str) -> float:
         float: The BERTScore.
     """
     # Tokenize and generate embeddings
-    inputs_src = tokenizer(
-        src_sentence, return_tensors="pt", padding=True, truncation=True
-    )
-    inputs_trg = tokenizer(
-        trg_sentence, return_tensors="pt", padding=True, truncation=True
-    )
-
-    with torch.no_grad():
-        outputs_src = model(**inputs_src)
-        outputs_trg = model(**inputs_trg)
-
-    # Get sentence embeddings by averaging token embeddings (from last hidden state)
-    src_embedding = torch.mean(outputs_src.last_hidden_state, dim=1).squeeze().numpy()
-    trg_embedding = torch.mean(outputs_trg.last_hidden_state, dim=1).squeeze().numpy()
+    emb_src = de_encoder.encode_sentences([src_sentence])[0]
+    emb_tgt = en_encoder.encode_sentences([trg_sentence])[0]
 
     # Calculate cosine similarity (1 - cosine distance)
-    similarity = 1 - cosine(src_embedding, trg_embedding)
+    similarity = 1 - cosine(emb_src, emb_tgt)
 
     return similarity
 
@@ -122,8 +112,8 @@ def __get_alignment_score(src_sentence: str, trg_sentence: str) -> list:
     # Each method has a list of pairs indicating the indexes of aligned words (The alignments are zero-indexed).
     alignments = aligner.get_word_aligns(src_list, trg_list)
 
-    src_aligns = {x[0] for x in alignments["
-    trg_aligns = {x[1] for x in alignments["
+    src_aligns = {x[0] for x in alignments["mwmf"]}
+    trg_aligns = {x[1] for x in alignments["mwmf"]}
 
     mistranslations = []
     for i in range(len(src_list)):
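
The core of the change: sentence similarity is now computed with LASER embeddings, which are trained so that translations of the same sentence land close together in one shared vector space across languages. Below is a minimal, self-contained sketch of the new path, assuming the laser_encoders package is installed; the helper name cross_lingual_similarity and the example sentences are illustrative, not from the repo.

from scipy.spatial.distance import cosine
from laser_encoders import LaserEncoderPipeline

# One encoder per language; LASER downloads the models on first use.
# "deu_Latn" / "eng_Latn" are the NLLB-style language codes the diff uses.
de_encoder = LaserEncoderPipeline(lang="deu_Latn")
en_encoder = LaserEncoderPipeline(lang="eng_Latn")


def cross_lingual_similarity(src_de: str, trg_en: str) -> float:
    # encode_sentences takes a list of sentences and returns one vector each.
    emb_src = de_encoder.encode_sentences([src_de])[0]
    emb_tgt = en_encoder.encode_sentences([trg_en])[0]
    # scipy's cosine() is a distance, so similarity = 1 - distance.
    return 1 - cosine(emb_src, emb_tgt)


print(cross_lingual_similarity("Der Hund schläft.", "The dog sleeps."))

Loading the two pipelines at module level mirrors the file's existing global-setup pattern (as the comment in the diff admits, "bad practice, but whatever").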
|
|
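For contrast, the deleted code mean-pooled token states from a multilingual transformer. The diff does not show which checkpoint the removed globals loaded, so the sketch below assumes distilbert-base-multilingual-cased (the checkpoint the aligner already uses) purely for illustration. Mean-pooled hidden states are not trained as a cross-lingual sentence-similarity space, which is the most plausible reason the LASER switch made the scores "much much better".

import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer

# Assumed checkpoint; the removed globals are not visible in the diff.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
model = AutoModel.from_pretrained("distilbert-base-multilingual-cased")


def mean_pooled_similarity(src_sentence: str, trg_sentence: str) -> float:
    inputs_src = tokenizer(src_sentence, return_tensors="pt", padding=True, truncation=True)
    inputs_trg = tokenizer(trg_sentence, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs_src = model(**inputs_src)
        outputs_trg = model(**inputs_trg)
    # Sentence vector = average of token vectors from the last hidden state.
    src_embedding = torch.mean(outputs_src.last_hidden_state, dim=1).squeeze().numpy()
    trg_embedding = torch.mean(outputs_trg.last_hidden_state, dim=1).squeeze().numpy()
    return 1 - cosine(src_embedding, trg_embedding)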
|
|
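The last hunk pins the alignment lookup to simalign's "mwmf" method. get_word_aligns returns a dict with one list of zero-indexed (source, target) index pairs per matching method; with default settings, "mwmf" (maximum weight maximal matching), "inter", and "itermax" are all present. A small sketch of the pattern, with made-up sentences; the printed pairs are only an example of the output shape.

from simalign import SentenceAligner

# Same configuration as the module-level aligner in the diff.
aligner = SentenceAligner(model="distilbert-base-multilingual-cased", layer=6)

src_list = ["Der", "Hund", "schläft"]
trg_list = ["The", "dog", "is", "sleeping"]

# One list of zero-indexed (src, trg) index pairs per matching method.
alignments = aligner.get_word_aligns(src_list, trg_list)
print(alignments["mwmf"])  # e.g. [(0, 0), (1, 1), (2, 2), (2, 3)]

# Source/target positions that were aligned to anything at all.
src_aligns = {x[0] for x in alignments["mwmf"]}
trg_aligns = {x[1] for x in alignments["mwmf"]}

# A source word that never appears in any pair has no counterpart in the
# translation, which is what the mistranslations loop in the diff goes on
# to check position by position.
unaligned = [w for i, w in enumerate(src_list) if i not in src_aligns]
print(unaligned)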
|
|