Upload metrics.py with huggingface_hub
metrics.py CHANGED (+51 -14)
@@ -5,9 +5,7 @@ from dataclasses import field
 from typing import Any, Dict, Generator, List, Optional

 import evaluate
-import nltk
 import numpy
-from editdistance import eval

 from .dataclass import InternalField
 from .operator import (
@@ -19,8 +17,6 @@ from .operator import (
 from .operators import CopyFields
 from .stream import MultiStream, Stream

-nltk.download("punkt")
-

 def abstract_factory():
     return {}
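The two hunks above drop the eager `import nltk`, `from editdistance import eval`, and the import-time `nltk.download("punkt")`; later hunks in this commit re-introduce both dependencies lazily inside the metrics' `prepare()` methods, so importing metrics.py no longer requires them. A minimal sketch of that deferred-setup pattern (the class name here is illustrative, not from the diff):

# Sketch only: heavy imports/downloads happen once per metric in prepare(),
# not at module import time.
class LazyNltkMetric:
    def prepare(self):
        import nltk  # imported only when the metric is actually prepared

        nltk.download("punkt")
        self.sent_tokenize = nltk.sent_tokenize

    def compute(self, text: str):
        # prepare() must have been called before compute()
        return self.sent_tokenize(text)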
@@ -65,7 +61,8 @@ class GlobalMetric(SingleStreamOperator, Metric):
             try:
                 instance_score = self._compute([refs], [pred])
             except:
-                instance_score = {"score": None}
+                instance_score = {"score": None, "score_name": self.main_score}
+
                 if isinstance(self.main_score, str) and self.main_score is not None:
                     instance_score[self.main_score] = None

@@ -86,6 +83,7 @@ class GlobalMetric(SingleStreamOperator, Metric):
     def _compute(self, references: List[List[str]], predictions: List[str]) -> dict:
         result = self.compute(references, predictions)
         result["score"] = result[self.main_score]
+        result["score_name"] = self.main_score
         return result

     @abstractmethod
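The recurring `result["score_name"] = self.main_score` additions in this commit make every result dict record which metric the aggregate `score` came from. An illustration of the resulting shape, with made-up values and assuming a metric whose main_score is "f1":

# Illustrative only: result dict shape after this change (values are made up).
result = {"f1": 0.82, "f1_macro": 0.80}
main_score = "f1"

result["score"] = result[main_score]   # 0.82
result["score_name"] = main_score      # "f1"

assert result == {"f1": 0.82, "f1_macro": 0.80, "score": 0.82, "score_name": "f1"}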
@@ -131,6 +129,7 @@ class InstanceMetric(SingleStreamOperator, Metric):
             global_score[field] = mean([instance["score"]["instance"][field] for instance in instances])
             if field == self.main_score:
                 global_score["score"] = global_score[field]
+                global_score["score_name"] = self.main_score

         for instance in instances:
             yield instance
@@ -138,6 +137,7 @@ class InstanceMetric(SingleStreamOperator, Metric):
     def _compute(self, references: List[List[str]], predictions: List[str]) -> dict:
         result = self.compute(references=references, predictions=predictions)
         result["score"] = result[self.main_score]
+        result["score_name"] = self.main_score
         return result

     @abstractmethod
@@ -147,7 +147,6 @@ class InstanceMetric(SingleStreamOperator, Metric):

 class Squad(GlobalMetric):
     _metric = None
-    reduction_map = {"mean": ["f1"]}
     main_score = "f1"
     metric = "squad"

@@ -172,6 +171,7 @@ class SingleReferenceInstanceMetric(InstanceMetric):
     def _compute(self, references: List[str], prediction: str) -> dict:
         result = self.compute(references[0], prediction)
         result["score"] = result[self.main_score]
+        result["score_name"] = self.main_score
         return result

     @abstractmethod
@@ -288,6 +288,7 @@ class F1MultiLabel(GlobalMetric):
     _metric = None
     main_score = "f1_macro"
     average = None  # Report per class then aggregate by mean
+    classes_to_ignore = ["none"]

     def prepare(self):
         super(F1MultiLabel, self).prepare()
@@ -314,17 +315,41 @@ class F1MultiLabel(GlobalMetric):
             len(reference) == 1 for reference in references
         ), "Only a single reference per prediction is allowed in F1 metric"
         references = [reference[0] for reference in references]
-        labels =
+        labels = [
+            l
+            for l in set([label for reference in references for label in reference])
+            if l not in self.classes_to_ignore
+        ]
+        # if no classes are left then F1 is not defined
+        # (e.g. only "none" in references)
+        if len(labels) == 0:
+            return {self.main_score: float("nan")}
+
         for label in labels:
             self.add_str_to_id(label)
         formatted_references = [self.get_one_hot_vector(reference) for reference in references]
         formatted_predictions = [self.get_one_hot_vector(prediction) for prediction in predictions]
+
+        # There is odd behavior in scikit-learn that when passing a one-hot vector with a single
+        # element, it is treated a class identifier. Therefore, we add labels=[1] to limit to only
+        # to this class.
+        if len(labels) == 1:
+            labels_param = [1]
+        else:
+            labels_param = None
+
         result = self._metric.compute(
-            predictions=formatted_predictions,
+            predictions=formatted_predictions,
+            references=formatted_references,
+            average=self.average,
+            labels=labels_param,
         )
         if isinstance(result["f1"], numpy.ndarray):
             from statistics import mean

+            assert len(result["f1"]) == len(
+                labels
+            ), f'F1 result ({result["f1"]}) has more entries than labels ({labels})'
             final_result = {self.main_score: mean(result["f1"])}
             for i, label in enumerate(labels):
                 final_result["f1_" + label] = result["f1"][i]
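The hunk above filters out labels listed in `classes_to_ignore` before building the one-hot vectors, and then works around the single-label case described in the code comment. For illustration only, here is that workaround expressed directly against scikit-learn (the diff itself goes through the wrapped `self._metric.compute`, which is an assumption about what it ultimately calls): with one surviving label, the one-hot vectors collapse to a single 0/1 column that scikit-learn reads as binary class ids, so pinning labels=[1] scores only the "label present" class.

# Hedged illustration of the labels=[1] workaround for a single remaining label.
from sklearn.metrics import f1_score

references  = [[1], [1], [0]]   # one-hot vectors over a single surviving label
predictions = [[1], [0], [0]]

per_class = f1_score(references, predictions, average=None, labels=[1])
print(per_class)  # one entry: the F1 of the single remaining label (here 2/3)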
@@ -348,24 +373,36 @@ class Rouge(HuggingfaceMetric):
     main_score = "rougeL"
     scale = 1.0

+    def prepare(self):
+        super().prepare()
+        import nltk
+
+        nltk.download("punkt")
+        self.sent_tokenize = nltk.sent_tokenize
+
     def compute(self, references, predictions):
-        predictions = ["\n".join(
-        references = [["\n".join(
+        predictions = ["\n".join(self.sent_tokenize(prediction.strip())) for prediction in predictions]
+        references = [["\n".join(self.sent_tokenize(r.strip())) for r in reference] for reference in references]
         return super().compute(references, predictions)


-# Computes chat edit distance, ignoring
+# Computes chat edit distance, ignoring whitespace
 class CharEditDistanceAccuracy(SingleReferenceInstanceMetric):
     reduction_map = {"mean": ["char_edit_dist_accuracy"]}
     main_score = "char_edit_dist_accuracy"

+    def prepare(self):
+        import editdistance
+
+        self.eval = editdistance.eval
+
     def compute(self, reference, prediction: str) -> dict:
-        formatted_prediction = "
-        formatted_reference = "
+        formatted_prediction = "".join(prediction.split())
+        formatted_reference = "".join(reference.split())
         max_length = max(len(formatted_reference), len(formatted_prediction))
         if max_length == 0:
             return 0
-        edit_dist = eval(formatted_reference, formatted_prediction)
+        edit_dist = self.eval(formatted_reference, formatted_prediction)
         return {"char_edit_dist_accuracy": (1 - edit_dist / max_length)}


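A small worked example of the char_edit_dist_accuracy score defined above, using the editdistance package the metric now imports in prepare() (the reference/prediction strings are made up):

import editdistance

reference  = "kitten sat"
prediction = "kitten sit"

ref  = "".join(reference.split())    # whitespace is ignored
pred = "".join(prediction.split())

max_length = max(len(ref), len(pred))            # 9
edit_dist = editdistance.eval(ref, pred)         # 1 substitution: "a" -> "i"
accuracy = 1 - edit_dist / max_length            # 1 - 1/9 ~= 0.889
print(accuracy)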