Elron committed on
Commit
0db93dd
·
1 Parent(s): 5a833c3

Upload metrics.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. metrics.py +219 -5
metrics.py CHANGED
@@ -1,9 +1,23 @@
 
1
  from abc import ABC, abstractmethod
2
  from dataclasses import dataclass, field
3
- from typing import Any, Dict, List, Generator
4
 
5
- from .operator import SingleStreamOperator, StreamInstanceOperator
6
- from .stream import Stream
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
 
9
  def absrtact_factory():
@@ -21,6 +35,7 @@ class UpdateStream(StreamInstanceOperator):
21
  instance.update(self.update)
22
  return instance
23
 
 
24
  # TODO: currently we have two classes with this name: metric.Metric and metrics.Metric...
25
  class Metric(ABC):
26
  @property
@@ -30,7 +45,7 @@ class Metric(ABC):
30
 
31
 
32
  class GlobalMetric(SingleStreamOperator, Metric):
33
- def process(self, stream: Stream):
34
  references = []
35
  predictions = []
36
  global_score = {}
@@ -113,7 +128,7 @@ class InstanceMetric(SingleStreamOperator, Metric):
113
  yield instance
114
 
115
  def _compute(self, references: List[List[str]], predictions: List[str]) -> dict:
116
- result = self.compute(references, predictions)
117
  result["score"] = result[self.main_score]
118
  return result
119
 
@@ -122,6 +137,29 @@ class InstanceMetric(SingleStreamOperator, Metric):
122
  pass
123
 
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  class SingleReferenceInstanceMetric(InstanceMetric):
126
  def _compute(self, references: List[str], prediction: str) -> dict:
127
  result = self.compute(references[0], prediction)
@@ -139,3 +177,179 @@ class Accuracy(SingleReferenceInstanceMetric):
139
 
140
  def compute(self, reference, prediction: str) -> dict:
141
  return {"accuracy": float(str(reference) == str(prediction))}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uuid
2
  from abc import ABC, abstractmethod
3
  from dataclasses import dataclass, field
4
+ from typing import Any, Dict, Generator, List, Optional
5
 
6
+ import evaluate
7
+ import nltk
8
+ import numpy
9
+
10
+ from .operator import (
11
+ MultiStreamOperator,
12
+ SequntialOperator,
13
+ SingleStreamOperator,
14
+ StreamingOperator,
15
+ StreamInstanceOperator,
16
+ )
17
+ from .operators import CopyFields
18
+ from .stream import MultiStream, Stream
19
+
20
+ nltk.download("punkt")
21
 
22
 
23
  def absrtact_factory():
 
35
  instance.update(self.update)
36
  return instance
37
 
38
+
39
  # TODO: currently we have two classes with this name: metric.Metric and metrics.Metric...
40
  class Metric(ABC):
41
  @property
 
45
 
46
 
47
  class GlobalMetric(SingleStreamOperator, Metric):
48
+ def process(self, stream: Stream, stream_name: str = None) -> Generator:
49
  references = []
50
  predictions = []
51
  global_score = {}
 
128
  yield instance
129
 
130
  def _compute(self, references: List[List[str]], predictions: List[str]) -> dict:
131
+ result = self.compute(references=references, predictions=predictions)
132
  result["score"] = result[self.main_score]
133
  return result
134
 
 
137
  pass
138
 
139
 
140
class Squad(GlobalMetric):
    """Global metric that delegates scoring to the ``evaluate`` metric named by ``metric``.

    The metric handle is loaded in ``prepare`` and kept in ``_metric``;
    ``compute`` reshapes plain predictions/references into the record
    layout passed to that metric.
    """

    _metric = None
    reduction_map = {"mean": ["f1"]}
    main_score = "f1"
    metric = "squad"

    def prepare(self):
        """Run the parent preparation, then load the metric named by ``self.metric``."""
        super().prepare()
        self._metric = evaluate.load(self.metric)

    def compute(self, references: List[List[str]], predictions: List[str]) -> dict:
        """Score ``predictions`` against ``references`` with the loaded metric.

        Args:
            references: for each example, the list of reference answer texts.
            predictions: for each example, the predicted answer text.

        Returns:
            Whatever the loaded metric's ``compute`` returns.
        """
        # One random, dash-free identifier per prediction; the reference at the
        # same index is tagged with the same identifier.
        ids = [uuid.uuid4().hex for _ in range(len(predictions))]

        formatted_predictions = [
            {"prediction_text": text, "id": example_id} for example_id, text in zip(ids, predictions)
        ]
        # NOTE(review): answer_start is always [-1] - presumably a placeholder
        # because no answer offsets are available here; confirm.
        formatted_references = [
            {"answers": {"answer_start": [-1], "text": texts}, "id": ids[index]}
            for index, texts in enumerate(references)
        ]

        return self._metric.compute(predictions=formatted_predictions, references=formatted_references)
161
+
162
+
163
  class SingleReferenceInstanceMetric(InstanceMetric):
164
  def _compute(self, references: List[str], prediction: str) -> dict:
165
  result = self.compute(references[0], prediction)
 
177
 
178
  def compute(self, reference, prediction: str) -> dict:
179
  return {"accuracy": float(str(reference) == str(prediction))}
180
+
181
+
182
class MetricPipeline(MultiStreamOperator, Metric):
    """Multi-stream operator that wraps a metric with pre- and post-processing steps.

    ``process`` applies, in order: every step in ``preprocess_steps``, the
    ``metric`` itself, every step in ``postpreprocess_steps``, and finally a
    ``CopyFields`` step that copies the ``main_score`` entries to the generic
    ``score`` entries at both instance and global level.
    """

    # Name of the score that is copied to "score/instance/score" and "score/global/score".
    main_score: str = None
    # Operators applied before the metric.
    preprocess_steps: Optional[List[StreamingOperator]] = field(default_factory=list)
    # Operators applied after the metric (before the score copy).
    postpreprocess_steps: Optional[List[StreamingOperator]] = field(default_factory=list)
    # The wrapped metric; it is called directly on the multi stream.
    metric: Metric = None

    def verify(self):
        """Check that ``main_score`` has been configured."""
        assert self.main_score is not None, "main_score is not set"

    def prepare(self):
        """Run the parent preparation and build the score-copying step."""
        super().prepare()
        score_aliases = [
            [f"score/{level}/{self.main_score}", f"score/{level}/score"] for level in ("instance", "global")
        ]
        self.prepare_score = CopyFields(field_to_field=score_aliases, use_query=True)

    def process(self, multi_stream: MultiStream) -> MultiStream:
        """Pass ``multi_stream`` through every stage of the pipeline and return the result."""
        stages = [
            *self.preprocess_steps,
            self.metric,
            *self.postpreprocess_steps,
            self.prepare_score,
        ]
        for stage in stages:
            multi_stream = stage(multi_stream)
        return multi_stream
209
+
210
+
211
class HuggingfaceMetric(GlobalMetric):
    """Global metric that delegates to the ``evaluate`` metric named by ``metric_name``.

    Float-valued entries of the result are divided by ``scale`` whenever
    ``scale`` differs from 1.0.
    """

    # Name handed to ``evaluate.load``.
    metric_name: str = None
    main_score: str = None
    # Divisor applied to float results; 1.0 leaves the result untouched.
    scale: float = 1.0

    def prepare(self):
        """Run the parent preparation, then load the metric named by ``metric_name``."""
        super().prepare()
        self.metric = evaluate.load(self.metric_name)

    def compute(self, references: List[List[str]], predictions: List[str]) -> dict:
        """Compute the loaded metric and rescale its float entries.

        Args:
            references: for each example, the list of reference texts.
            predictions: for each example, the predicted text.

        Returns:
            The result of the loaded metric, with float values divided by
            ``scale`` when ``scale`` is not 1.0.
        """
        scores = self.metric.compute(predictions=predictions, references=references)
        if self.scale == 1.0:
            return scores

        # Only existing keys are reassigned, so iterating while updating is safe.
        for name in scores:
            value = scores[name]
            if isinstance(value, float):
                scores[name] = value / self.scale
        return scores
227
+
228
+
229
class F1(GlobalMetric):
    """Single-label F1 computed through the ``evaluate`` metric named by ``metric``.

    String labels are mapped to integer ids before being passed to the
    metric. When the metric returns a numpy array under ``"f1"``, the main
    score is the mean of that array and every label gets its own
    ``"f1_<label>"`` entry; otherwise the single value is reported under
    ``main_score``.
    """

    _metric = None
    main_score = "f1_macro"
    average = None  # Report per class then aggregate by mean
    metric = "f1"

    def prepare(self):
        """Run the parent preparation, then load the metric named by ``self.metric``."""
        super().prepare()
        self._metric = evaluate.load(self.metric)

    def get_str_id(self, str):
        """Return the integer id of label ``str``, assigning the next free id on first sight."""
        if str not in self.str_to_id:
            new_id = len(self.str_to_id)
            self.str_to_id[str] = new_id
            self.id_to_str[new_id] = str
        return self.str_to_id[str]

    def compute(self, references: List[List[str]], predictions: List[str]) -> dict:
        """Compute F1 for ``predictions`` against single-reference ``references``.

        Args:
            references: for each example, a list holding exactly one label.
            predictions: for each example, the predicted label.

        Returns:
            A dict with ``main_score`` and, when per-class scores are
            returned by the metric, one ``"f1_<label>"`` entry per reference label.

        Raises:
            AssertionError: if any example does not have exactly one reference.
        """
        assert all(
            len(reference) == 1 for reference in references
        ), "Only a single reference per prediction is allowed in F1 metric"

        # The label <-> id tables are rebuilt on every call.
        self.str_to_id = {}
        self.id_to_str = {}

        # References are registered first, so `labels` below contains only the
        # ids of labels that occur in the references; labels that appear only
        # in predictions still receive an id but are not scored as classes.
        formatted_references = [self.get_str_id(reference[0]) for reference in references]
        formatted_predictions = [self.get_str_id(prediction) for prediction in predictions]
        labels = list(set(formatted_references))

        result = self._metric.compute(
            predictions=formatted_predictions, references=formatted_references, labels=labels, average=self.average
        )

        if isinstance(result["f1"], numpy.ndarray):
            from statistics import mean

            final_result = {self.main_score: mean(result["f1"])}
            # Per-class scores are read in the same order as `labels`.
            for i, label in enumerate(labels):
                final_result["f1_" + self.id_to_str[label]] = result["f1"][i]
        else:
            final_result = {self.main_score: result["f1"]}
        return final_result
268
+
269
+
270
class F1Micro(F1):
    """``F1`` variant that reports ``f1_micro`` and passes ``average="micro"`` to the metric."""

    main_score = "f1_micro"
    average = "micro"
273
+
274
+
275
class F1Macro(F1):
    """``F1`` variant that reports ``f1_macro``; ``average`` is inherited unchanged from ``F1``."""

    main_score = "f1_macro"
277
+
278
+
279
class F1MultiLabel(GlobalMetric):
    """Multi-label F1 computed through the ``evaluate`` "f1" metric in "multilabel" mode.

    Each reference is a list of labels; each prediction is one string in
    which labels are joined by ``seperator``. Both are converted to one-hot
    vectors over the labels found in the references before scoring.
    """

    _metric = None
    main_score = "f1_macro"
    average = None  # Report per class then aggregate by mean
    seperator = ","

    def prepare(self):
        """Run the parent preparation, then load the multilabel F1 metric."""
        super().prepare()
        self._metric = evaluate.load("f1", "multilabel")

    def add_str_to_id(self, str):
        """Register label ``str`` under the next free integer id if it is not known yet."""
        if str not in self.str_to_id:
            new_id = len(self.str_to_id)
            self.str_to_id[str] = new_id
            self.id_to_str[new_id] = str

    def get_one_hot_vector(self, labels: List[str]):
        """Return a 0/1 list over the known labels, with 1 for every label in ``labels``.

        Labels that were never registered are ignored.
        """
        result = [0] * len(self.str_to_id)
        for label in labels:
            if label in self.str_to_id:
                result[self.str_to_id[label]] = 1
        return result

    def compute(self, references: List[List[str]], predictions: List[str]) -> dict:
        """Compute multi-label F1 for ``predictions`` against ``references``.

        Args:
            references: for each example, the list of reference labels.
            predictions: for each example, a string of labels joined by
                ``seperator``; each label is stripped of surrounding whitespace.

        Returns:
            A dict with ``main_score`` and, when per-class scores are
            returned by the metric, one ``"f1_<label>"`` entry per reference label.

        Raises:
            AssertionError: if a reference label contains ``seperator``.
        """
        # The label <-> id tables are rebuilt on every call.
        self.str_to_id = {}
        self.id_to_str = {}

        labels = list({label for reference in references for label in reference})
        for label in labels:
            # A label containing the separator could never be matched by a
            # prediction, because predictions are split on it.
            assert (
                self.seperator not in label
            ), f"Reference label ({label}) can not contain multi label seperator ({self.seperator}) "
            self.add_str_to_id(label)

        formatted_references = [self.get_one_hot_vector(reference) for reference in references]
        split_predictions = [
            [label.strip() for label in prediction.split(self.seperator)] for prediction in predictions
        ]
        formatted_predictions = [self.get_one_hot_vector(prediction) for prediction in split_predictions]

        result = self._metric.compute(
            predictions=formatted_predictions, references=formatted_references, average=self.average
        )

        if isinstance(result["f1"], numpy.ndarray):
            from statistics import mean

            final_result = {self.main_score: mean(result["f1"])}
            # Ids were assigned in the order of `labels`, so position i in the
            # score array belongs to labels[i].
            for i, label in enumerate(labels):
                final_result["f1_" + label] = result["f1"][i]
        else:
            final_result = {self.main_score: result["f1"]}
        return final_result
329
+
330
+
331
class F1MicroMultiLabel(F1MultiLabel):
    """``F1MultiLabel`` variant that reports ``f1_micro`` and passes ``average="micro"`` to the metric."""

    main_score = "f1_micro"
    average = "micro"
334
+
335
+
336
class F1MacroMultiLabel(F1MultiLabel):
    """``F1MultiLabel`` variant that reports ``f1_macro`` with ``average`` set to ``None``."""

    main_score = "f1_macro"
    average = None
339
+
340
+
341
class Rouge(HuggingfaceMetric):
    """ROUGE through ``HuggingfaceMetric``, with ``rougeL`` as the main score.

    Before scoring, every text is stripped and re-joined with one sentence
    per line, using ``nltk.sent_tokenize`` to find the sentences.
    """

    metric_name = "rouge"
    main_score = "rougeL"
    scale = 1.0

    def compute(self, references, predictions):
        """Put each sentence on its own line in all texts, then defer to the parent ``compute``.

        Args:
            references: for each example, the list of reference texts.
            predictions: for each example, the predicted text.

        Returns:
            The result of the parent ``compute`` on the reformatted texts.
        """

        def one_sentence_per_line(text):
            # NOTE(review): presumably needed for the line-based ROUGE variants - confirm.
            return "\n".join(nltk.sent_tokenize(text.strip()))

        predictions = [one_sentence_per_line(prediction) for prediction in predictions]
        references = [[one_sentence_per_line(text) for text in group] for group in references]
        return super().compute(references, predictions)
350
+
351
+
352
class Bleu(HuggingfaceMetric):
    """BLEU through ``HuggingfaceMetric``: loads the "bleu" metric and reports its "bleu" value unscaled."""

    metric_name = "bleu"
    main_score = "bleu"
    scale = 1.0