Elron committed on
Commit
11723f3
·
1 Parent(s): 018f80f

Upload metrics.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. metrics.py +51 -14
metrics.py CHANGED
@@ -5,9 +5,7 @@ from dataclasses import field
5
  from typing import Any, Dict, Generator, List, Optional
6
 
7
  import evaluate
8
- import nltk
9
  import numpy
10
- from editdistance import eval
11
 
12
  from .dataclass import InternalField
13
  from .operator import (
@@ -19,8 +17,6 @@ from .operator import (
19
  from .operators import CopyFields
20
  from .stream import MultiStream, Stream
21
 
22
- nltk.download("punkt")
23
-
24
 
25
  def abstract_factory():
26
  return {}
@@ -65,7 +61,8 @@ class GlobalMetric(SingleStreamOperator, Metric):
65
  try:
66
  instance_score = self._compute([refs], [pred])
67
  except:
68
- instance_score = {"score": None}
 
69
  if isinstance(self.main_score, str) and self.main_score is not None:
70
  instance_score[self.main_score] = None
71
 
@@ -86,6 +83,7 @@ class GlobalMetric(SingleStreamOperator, Metric):
86
  def _compute(self, references: List[List[str]], predictions: List[str]) -> dict:
87
  result = self.compute(references, predictions)
88
  result["score"] = result[self.main_score]
 
89
  return result
90
 
91
  @abstractmethod
@@ -131,6 +129,7 @@ class InstanceMetric(SingleStreamOperator, Metric):
131
  global_score[field] = mean([instance["score"]["instance"][field] for instance in instances])
132
  if field == self.main_score:
133
  global_score["score"] = global_score[field]
 
134
 
135
  for instance in instances:
136
  yield instance
@@ -138,6 +137,7 @@ class InstanceMetric(SingleStreamOperator, Metric):
138
  def _compute(self, references: List[List[str]], predictions: List[str]) -> dict:
139
  result = self.compute(references=references, predictions=predictions)
140
  result["score"] = result[self.main_score]
 
141
  return result
142
 
143
  @abstractmethod
@@ -147,7 +147,6 @@ class InstanceMetric(SingleStreamOperator, Metric):
147
 
148
  class Squad(GlobalMetric):
149
  _metric = None
150
- reduction_map = {"mean": ["f1"]}
151
  main_score = "f1"
152
  metric = "squad"
153
 
@@ -172,6 +171,7 @@ class SingleReferenceInstanceMetric(InstanceMetric):
172
  def _compute(self, references: List[str], prediction: str) -> dict:
173
  result = self.compute(references[0], prediction)
174
  result["score"] = result[self.main_score]
 
175
  return result
176
 
177
  @abstractmethod
@@ -288,6 +288,7 @@ class F1MultiLabel(GlobalMetric):
288
  _metric = None
289
  main_score = "f1_macro"
290
  average = None # Report per class then aggregate by mean
 
291
 
292
  def prepare(self):
293
  super(F1MultiLabel, self).prepare()
@@ -314,17 +315,41 @@ class F1MultiLabel(GlobalMetric):
314
  len(reference) == 1 for reference in references
315
  ), "Only a single reference per prediction is allowed in F1 metric"
316
  references = [reference[0] for reference in references]
317
- labels = list(set([label for reference in references for label in reference]))
 
 
 
 
 
 
 
 
 
318
  for label in labels:
319
  self.add_str_to_id(label)
320
  formatted_references = [self.get_one_hot_vector(reference) for reference in references]
321
  formatted_predictions = [self.get_one_hot_vector(prediction) for prediction in predictions]
 
 
 
 
 
 
 
 
 
322
  result = self._metric.compute(
323
- predictions=formatted_predictions, references=formatted_references, average=self.average
 
 
 
324
  )
325
  if isinstance(result["f1"], numpy.ndarray):
326
  from statistics import mean
327
 
 
 
 
328
  final_result = {self.main_score: mean(result["f1"])}
329
  for i, label in enumerate(labels):
330
  final_result["f1_" + label] = result["f1"][i]
@@ -348,24 +373,36 @@ class Rouge(HuggingfaceMetric):
348
  main_score = "rougeL"
349
  scale = 1.0
350
 
 
 
 
 
 
 
 
351
  def compute(self, references, predictions):
352
- predictions = ["\n".join(nltk.sent_tokenize(prediction.strip())) for prediction in predictions]
353
- references = [["\n".join(nltk.sent_tokenize(r.strip())) for r in reference] for reference in references]
354
  return super().compute(references, predictions)
355
 
356
 
357
- # Computes char edit distance, ignoring repeating whitespace
358
  class CharEditDistanceAccuracy(SingleReferenceInstanceMetric):
359
  reduction_map = {"mean": ["char_edit_dist_accuracy"]}
360
  main_score = "char_edit_dist_accuracy"
361
 
 
 
 
 
 
362
  def compute(self, reference, prediction: str) -> dict:
363
- formatted_prediction = " ".join(prediction.split())
364
- formatted_reference = " ".join(reference.split())
365
  max_length = max(len(formatted_reference), len(formatted_prediction))
366
  if max_length == 0:
367
  return 0
368
- edit_dist = eval(formatted_reference, formatted_prediction)
369
  return {"char_edit_dist_accuracy": (1 - edit_dist / max_length)}
370
 
371
 
 
5
  from typing import Any, Dict, Generator, List, Optional
6
 
7
  import evaluate
 
8
  import numpy
 
9
 
10
  from .dataclass import InternalField
11
  from .operator import (
 
17
  from .operators import CopyFields
18
  from .stream import MultiStream, Stream
19
 
 
 
20
 
21
  def abstract_factory():
22
  return {}
 
61
  try:
62
  instance_score = self._compute([refs], [pred])
63
  except:
64
+ instance_score = {"score": None, "score_name": self.main_score}
65
+
66
  if isinstance(self.main_score, str) and self.main_score is not None:
67
  instance_score[self.main_score] = None
68
 
 
83
  def _compute(self, references: List[List[str]], predictions: List[str]) -> dict:
84
  result = self.compute(references, predictions)
85
  result["score"] = result[self.main_score]
86
+ result["score_name"] = self.main_score
87
  return result
88
 
89
  @abstractmethod
 
129
  global_score[field] = mean([instance["score"]["instance"][field] for instance in instances])
130
  if field == self.main_score:
131
  global_score["score"] = global_score[field]
132
+ global_score["score_name"] = self.main_score
133
 
134
  for instance in instances:
135
  yield instance
 
137
  def _compute(self, references: List[List[str]], predictions: List[str]) -> dict:
138
  result = self.compute(references=references, predictions=predictions)
139
  result["score"] = result[self.main_score]
140
+ result["score_name"] = self.main_score
141
  return result
142
 
143
  @abstractmethod
 
147
 
148
  class Squad(GlobalMetric):
149
  _metric = None
 
150
  main_score = "f1"
151
  metric = "squad"
152
 
 
171
  def _compute(self, references: List[str], prediction: str) -> dict:
172
  result = self.compute(references[0], prediction)
173
  result["score"] = result[self.main_score]
174
+ result["score_name"] = self.main_score
175
  return result
176
 
177
  @abstractmethod
 
288
  _metric = None
289
  main_score = "f1_macro"
290
  average = None # Report per class then aggregate by mean
291
+ classes_to_ignore = ["none"]
292
 
293
  def prepare(self):
294
  super(F1MultiLabel, self).prepare()
 
315
  len(reference) == 1 for reference in references
316
  ), "Only a single reference per prediction is allowed in F1 metric"
317
  references = [reference[0] for reference in references]
318
+ labels = [
319
+ l
320
+ for l in set([label for reference in references for label in reference])
321
+ if l not in self.classes_to_ignore
322
+ ]
323
+ # if no classes are left then F1 is not defined
324
+ # (e.g. only "none" in references)
325
+ if len(labels) == 0:
326
+ return {self.main_score: float("nan")}
327
+
328
  for label in labels:
329
  self.add_str_to_id(label)
330
  formatted_references = [self.get_one_hot_vector(reference) for reference in references]
331
  formatted_predictions = [self.get_one_hot_vector(prediction) for prediction in predictions]
332
+
333
+ # There is odd behavior in scikit-learn that when passing a one-hot vector with a single
334
+ # element, it is treated as a class identifier. Therefore, we add labels=[1] to limit the computation
335
+ # to this class.
336
+ if len(labels) == 1:
337
+ labels_param = [1]
338
+ else:
339
+ labels_param = None
340
+
341
  result = self._metric.compute(
342
+ predictions=formatted_predictions,
343
+ references=formatted_references,
344
+ average=self.average,
345
+ labels=labels_param,
346
  )
347
  if isinstance(result["f1"], numpy.ndarray):
348
  from statistics import mean
349
 
350
+ assert len(result["f1"]) == len(
351
+ labels
352
+ ), f'F1 result ({result["f1"]}) has more entries than labels ({labels})'
353
  final_result = {self.main_score: mean(result["f1"])}
354
  for i, label in enumerate(labels):
355
  final_result["f1_" + label] = result["f1"][i]
 
373
  main_score = "rougeL"
374
  scale = 1.0
375
 
376
+ def prepare(self):
377
+ super().prepare()
378
+ import nltk
379
+
380
+ nltk.download("punkt")
381
+ self.sent_tokenize = nltk.sent_tokenize
382
+
383
  def compute(self, references, predictions):
384
+ predictions = ["\n".join(self.sent_tokenize(prediction.strip())) for prediction in predictions]
385
+ references = [["\n".join(self.sent_tokenize(r.strip())) for r in reference] for reference in references]
386
  return super().compute(references, predictions)
387
 
388
 
389
+ # Computes char edit distance, ignoring whitespace
390
  class CharEditDistanceAccuracy(SingleReferenceInstanceMetric):
391
  reduction_map = {"mean": ["char_edit_dist_accuracy"]}
392
  main_score = "char_edit_dist_accuracy"
393
 
394
+ def prepare(self):
395
+ import editdistance
396
+
397
+ self.eval = editdistance.eval
398
+
399
  def compute(self, reference, prediction: str) -> dict:
400
+ formatted_prediction = "".join(prediction.split())
401
+ formatted_reference = "".join(reference.split())
402
  max_length = max(len(formatted_reference), len(formatted_prediction))
403
  if max_length == 0:
404
  return 0
405
+ edit_dist = self.eval(formatted_reference, formatted_prediction)
406
  return {"char_edit_dist_accuracy": (1 - edit_dist / max_length)}
407
 
408