JohnnyBoy00 committed
Commit: b116166
Parent(s): 0df4b7b
Upload evaluation.py

evaluation.py ADDED (+215 -0)
@@ -0,0 +1,215 @@
import numpy as np
import torch

from evaluate import load as load_metric

from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm

MAX_TARGET_LENGTH = 128

# load evaluation metrics
sacrebleu = load_metric('sacrebleu')
rouge = load_metric('rouge')
meteor = load_metric('meteor')
bertscore = load_metric('bertscore')

# use gpu if it's available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def flatten_list(l):
    """
    Utility function to convert a list of lists into a flattened list

    Params:
        l (list of lists): list to be flattened
    Returns:
        A flattened list with the elements of the original list
    """
    return [item for sublist in l for item in sublist]

def parse_float(value):
    """
    Utility function to parse a string into a float

    Params:
        value (string): value to be converted to float
    Returns:
        The float representation of the given string, or -1 if the string
        could not be converted to a float
    """
    try:
        return float(value)
    except ValueError:
        return -1

def extract_scores(predictions):
    """
    Utility function to extract the scores from the predictions of the model

    Params:
        predictions (list): complete model predictions
    Returns:
        scores (list): extracted scores from the model's predictions
    """
    scores = []
    # iterate through predictions and try to extract the predicted score;
    # if no score can be extracted, it is set to -1
    for pred in predictions:
        # the score is expected to precede the 'Feedback:' marker
        score = parse_float(pred.split('Feedback:', 1)[0].strip())
        # fall back to the first whitespace-separated token if parsing failed
        if score == -1:
            score = parse_float(pred.split(' ', 1)[0].strip())
        scores.append(score)

    return scores

def extract_feedback(predictions):
    """
    Utility function to extract the feedback from the predictions of the model

    Params:
        predictions (list): complete model predictions
    Returns:
        feedback (list): extracted feedback from the model's predictions
    """
    feedback = []
    # iterate through predictions and try to extract the predicted feedback
    for pred in predictions:
        try:
            # the feedback is expected to follow the first colon
            fb = pred.split(':', 1)[1]
        except IndexError:
            try:
                # otherwise, take everything after the first whitespace
                fb = pred.split(' ', 1)[1]
            except IndexError:
                # if neither is present, keep the whole prediction
                fb = pred
        feedback.append(fb.strip())

    return feedback

def compute_mse(predictions, labels):
    """
    Utility function to compute the mean squared error of the
    score predictions in relation to the golden label scores

    Params:
        predictions (list): model score predictions
        labels (list): golden label scores
    Returns:
        (float, int): mse of valid samples and number of invalid samples
    """
    # get indexes of valid score predictions
    # (i.e., where the score is greater than zero)
    idx = np.where(np.array(predictions) > 0)

    # get size of the golden labels list and of
    # the valid predictions array
    labels_size = len(labels)
    valid_predictions_size = idx[0].size

    # only compute mse if valid score predictions were generated,
    # otherwise set mse to 1
    if valid_predictions_size > 0:
        # calculate mse from labels and predictions
        valid_predictions = np.array(predictions)[idx]
        score_labels = np.array(labels)[idx]
        mse = mean_squared_error(score_labels, valid_predictions)

        # cap mse at 1
        if mse > 1:
            return 1, labels_size - valid_predictions_size

        # return computed mse and number of invalid samples
        return mse, labels_size - valid_predictions_size
    else:
        return 1, labels_size - valid_predictions_size

def compute_metrics(predictions, labels):
    """
    Compute evaluation metrics from the predictions of the model

    Params:
        predictions (list): complete model predictions
        labels (list): decoded golden labels
    Returns:
        results (dict): dictionary with the computed evaluation metrics
    """
    # extract feedback and scores from the model's predictions
    predicted_feedback = extract_feedback(predictions)
    predicted_scores = extract_scores(predictions)

    # extract feedback and scores from the golden labels
    reference_feedback = [x.split('Feedback:', 1)[1].strip() for x in labels]
    reference_scores = [float(x.split('Feedback:', 1)[0].strip()) for x in labels]

    # compute HuggingFace metrics
    sacrebleu_score = sacrebleu.compute(predictions=predicted_feedback, references=[[x] for x in reference_feedback])['score']
    rouge_score = rouge.compute(predictions=predicted_feedback, references=reference_feedback)['rouge2']
    meteor_score = meteor.compute(predictions=predicted_feedback, references=reference_feedback)['meteor']
    bert_score = bertscore.compute(
        predictions=predicted_feedback,
        references=reference_feedback,
        lang='de',
        model_type='bert-base-multilingual-cased',
        rescale_with_baseline=True)

    # compute mse of score predictions
    mse, _ = compute_mse(predicted_scores, reference_scores)

    results = {
        'sacrebleu': sacrebleu_score,
        'rouge': rouge_score,
        'meteor': meteor_score,
        'bert_score': np.array(bert_score['f1']).mean().item(),
        'mse': mse
    }

    return results

def evaluate(model, tokenizer, dataloader):
    """
    Evaluate model on the given dataset

    Params:
        model (PreTrainedModel): seq2seq model
        tokenizer (PreTrainedTokenizer): tokenizer from HuggingFace
        dataloader (torch Dataloader): dataloader of the dataset to be used for evaluation
    Returns:
        results (dict): dictionary with the computed evaluation metrics
        predictions (list): list of the decoded predictions of the model
    """
    decoded_preds, decoded_labels = [], []

    model.eval()
    # iterate through the batches in the dataloader
    for batch in tqdm(dataloader):
        with torch.no_grad():
            batch = {k: v.to(device) for k, v in batch.items()}
            # generate tokens from batch
            generated_tokens = model.generate(
                batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_length=MAX_TARGET_LENGTH
            )
            # get golden labels from batch
            labels_batch = batch['labels']

            # decode model predictions and golden labels
            decoded_preds_batch = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            decoded_labels_batch = tokenizer.batch_decode(labels_batch, skip_special_tokens=True)

            decoded_preds.append(decoded_preds_batch)
            decoded_labels.append(decoded_labels_batch)

    # convert predictions and golden labels into flattened lists
    predictions = flatten_list(decoded_preds)
    labels = flatten_list(decoded_labels)

    # compute metrics based on predictions and golden labels
    results = compute_metrics(predictions, labels)

    return results, predictions
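
For context, a minimal usage sketch of this module is given below. It is not part of the uploaded file: the checkpoint name, the toy dataset, and the '<score> Feedback: <text>' target format are assumptions inferred from the splitting logic in extract_scores, extract_feedback, and compute_metrics. The metric loading at import time additionally assumes the sacrebleu, rouge_score, nltk, and bert_score packages are installed.

# hypothetical usage sketch, not part of the commit
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from evaluation import evaluate, device

MODEL_NAME = 'google/mt5-base'  # placeholder checkpoint, assumption

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

# toy example: one (input, target) pair in the assumed '<score> Feedback: <text>' format
inputs = tokenizer(['Bewerte die Antwort: ...'], padding=True, truncation=True, return_tensors='pt')
targets = tokenizer(['0.5 Feedback: Die Antwort ist teilweise korrekt.'],
                    padding=True, truncation=True, return_tensors='pt')

# the default collate function stacks these dicts into the batches evaluate() expects
dataset = [{'input_ids': inputs['input_ids'][0],
            'attention_mask': inputs['attention_mask'][0],
            'labels': targets['input_ids'][0]}]
dataloader = DataLoader(dataset, batch_size=1)

results, predictions = evaluate(model, tokenizer, dataloader)
print(results)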