Aidan Phillips committed
Commit d7d2fdd · Parent: 36599ed

accuracy scores much much better

Files changed (1):
  categories/accuracy.py (+9 -19)
categories/accuracy.py CHANGED
@@ -5,13 +5,15 @@ import numpy as np
 from scipy.spatial.distance import cosine
 from simalign import SentenceAligner
 from transformers import AutoModel, AutoTokenizer
+from laser_encoders import LaserEncoderPipeline
 
 # setup global variables on import (bad practice, but whatever)
 # --------------------------------------------------------------
 
 aligner = SentenceAligner(model="distilbert-base-multilingual-cased", layer=6)
-tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
-model = AutoModel.from_pretrained("distilbert-base-multilingual-cased")
+
+de_encoder = LaserEncoderPipeline(lang="deu_Latn")
+en_encoder = LaserEncoderPipeline(lang="eng_Latn")
 
 
 def accuracy(src_sentence: str, trg_sentence: str) -> dict:
@@ -66,23 +68,11 @@ def __get_bertscore(src_sentence: str, trg_sentence: str) -> float:
         float: The BERTScore.
     """
     # Tokenize and generate embeddings
-    inputs_src = tokenizer(
-        src_sentence, return_tensors="pt", padding=True, truncation=True
-    )
-    inputs_trg = tokenizer(
-        trg_sentence, return_tensors="pt", padding=True, truncation=True
-    )
-
-    with torch.no_grad():
-        outputs_src = model(**inputs_src)
-        outputs_trg = model(**inputs_trg)
-
-    # Get sentence embeddings by averaging token embeddings (from last hidden state)
-    src_embedding = torch.mean(outputs_src.last_hidden_state, dim=1).squeeze().numpy()
-    trg_embedding = torch.mean(outputs_trg.last_hidden_state, dim=1).squeeze().numpy()
+    emb_src = de_encoder.encode_sentences([src_sentence])[0]
+    emb_tgt = en_encoder.encode_sentences([trg_sentence])[0]
 
     # Calculate cosine similarity (1 - cosine distance)
-    similarity = 1 - cosine(src_embedding, trg_embedding)
+    similarity = 1 - cosine(emb_src, emb_tgt)
 
     return similarity
 
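Note on the embedding swap above: mean-pooled DistilBERT token embeddings were never trained for cross-lingual sentence similarity, so German/English pairs scored low regardless of translation quality. LASER embeddings map all languages into one shared space, which makes the cosine between a German source and an English target directly meaningful. (The function is still called __get_bertscore and its docstring still promises "The BERTScore", even though the value is now a LASER cosine similarity; renaming it would be a natural follow-up.) A minimal standalone sketch of the new scoring path, with an illustrative helper name and example sentences that are not from the repo:

    from laser_encoders import LaserEncoderPipeline
    from scipy.spatial.distance import cosine

    # One encoder per language; both map into the shared LASER space.
    de_encoder = LaserEncoderPipeline(lang="deu_Latn")
    en_encoder = LaserEncoderPipeline(lang="eng_Latn")

    def similarity(src: str, trg: str) -> float:
        # encode_sentences takes a list of sentences, returns one vector per entry
        emb_src = de_encoder.encode_sentences([src])[0]
        emb_trg = en_encoder.encode_sentences([trg])[0]
        # scipy's cosine() is a distance, so 1 - distance gives similarity
        return 1 - cosine(emb_src, emb_trg)

    # A faithful pair should score noticeably higher than an unrelated one.
    print(similarity("Der Hund schläft.", "The dog is sleeping."))

One side effect worth noting: the module is now hard-wired to a German source and an English target, since each LaserEncoderPipeline is bound to a single language at construction time.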
 
 
@@ -122,8 +112,8 @@ def __get_alignment_score(src_sentence: str, trg_sentence: str) -> list:
     # Each method has a list of pairs indicating the indexes of aligned words (The alignments are zero-indexed).
     alignments = aligner.get_word_aligns(src_list, trg_list)
 
-    src_aligns = {x[0] for x in alignments["inter"]}
-    trg_aligns = {x[1] for x in alignments["inter"]}
+    src_aligns = {x[0] for x in alignments["mwmf"]}
+    trg_aligns = {x[1] for x in alignments["mwmf"]}
 
     mistranslations = []
     for i in range(len(src_list)):
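Note on the alignment change: simalign returns one list of aligned index pairs per matching method. "inter" keeps only the intersection of the forward and backward argmax alignments, which is precise but sparse, so many correctly translated words were left unaligned and then flagged as mistranslations. "mwmf" solves a maximum-weight matching over the token similarity matrix and yields much denser alignments, which is the main driver behind the improved accuracy scores. A small sketch of how the two methods can differ; the sentences are illustrative and the alignments in the comment are indicative only, since actual output depends on the model:

    from simalign import SentenceAligner

    aligner = SentenceAligner(model="distilbert-base-multilingual-cased", layer=6)

    src_list = ["Der", "kleine", "Hund", "schläft"]
    trg_list = ["The", "little", "dog", "is", "sleeping"]

    # The default matching methods expose "mwmf", "inter", and "itermax".
    alignments = aligner.get_word_aligns(src_list, trg_list)
    for method in ("inter", "mwmf"):
        # e.g. "inter" may yield only [(0, 0), (2, 2)], while "mwmf"
        # typically also pairs "kleine"/"little" and "schläft"/"sleeping"
        print(method, sorted(alignments[method]))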