File size: 2,599 Bytes
7f7285f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# -*- coding:utf-8  -*-

"""
@Author             : Bao
@Date               : 2020/8/24
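@Desc               : Scores FEVER predictions against a gold JSONL file: per-label precision/recall/F1, evidence precision/recall/F1, label accuracy and FEVER score.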
@Last modified by   : Bao
@Last modified date : 2020/9/1
"""

import os
import json
import numpy as np
from collections import defaultdict
import tensorflow as tf
from sklearn.metrics import precision_recall_fscore_support
try:
    # Package-relative import: used when this module is imported as part of its package.
    from .scorer import fever_score
except ImportError:
    # Fallback for running this file directly as a script, where the relative
    # import fails with ImportError. Only ImportError is caught so that
    # unrelated failures (and KeyboardInterrupt/SystemExit) are not masked.
    from scorer import fever_score


# Base directory taken from the PJ_HOME environment variable. It is read at
# import time, so importing this module raises KeyError when PJ_HOME is unset.
prefix = os.environ['PJ_HOME']


class FeverScorer:
    """Score FEVER predictions against a gold JSONL file.

    Predictions and gold annotations are joined on their ``id`` field and
    the merged records are scored twice: per-label precision/recall/F1 via
    scikit-learn (``label_score``) and evidence/FEVER metrics via
    ``fever_score``.
    """

    def __init__(self):
        # Mapping between integer ids and FEVER label strings. The key order of
        # ``label2id`` fixes the order of the labels reported by ``label_score``.
        self.id2label = {2: 'SUPPORTS', 0: 'REFUTES', 1: 'NOT ENOUGH INFO'}
        self.label2id = {value: key for key, value in self.id2label.items()}

    @staticmethod
    def _read_jsonl(path):
        """Yield one parsed JSON object per non-blank line of ``path``."""
        with tf.io.gfile.GFile(path) as fin:
            for line in fin:
                # Skip blank lines (e.g. trailing newlines) instead of letting
                # json.loads fail on an empty document.
                if line.strip():
                    yield json.loads(line)

    def get_scores(self, predicted_file, actual_file=f'{prefix}/data/fever/shared_task_dev.jsonl'):
        """Compute all metrics for a prediction file.

        Args:
            predicted_file: Path of a JSONL file with one prediction per line.
                Each record needs an ``id``; ``label_score`` additionally reads
                ``predicted_label`` from it.
            actual_file: Path of the gold JSONL file. Each record needs
                ``id``, ``evidence`` and ``label``.

        Returns:
            A dict with ``'<label> Precision'``, ``'<label> Recall'`` and
            ``'<label> F1'`` for every label, plus ``'Evidence Precision'``,
            ``'Evidence Recall'``, ``'Evidence F1'``, ``'FEVER Score'`` and
            ``'Label Accuracy'``.

        Raises:
            KeyError: If, after joining the two files, any record lacks a
                ``label`` or a ``predicted_label`` (typically because the ids
                of the two files do not match).
        """
        id2results = {}

        # Later duplicates of an id overwrite earlier ones.
        for record in self._read_jsonl(predicted_file):
            id2results[record['id']] = record

        for gold in self._read_jsonl(actual_file):
            guid = gold['id']
            evidence = gold['evidence']
            label = gold['label']
            # Gold values always win over same-named fields in the prediction.
            merged = id2results.setdefault(guid, {})
            merged['evidence'] = evidence
            merged['label'] = label

        # Fail early with an actionable message; without this check the same
        # situation surfaces as a bare KeyError deep inside label_score.
        incomplete = [
            guid for guid, record in id2results.items()
            if 'label' not in record or 'predicted_label' not in record
        ]
        if incomplete:
            raise KeyError(
                '{} record(s) lack a gold label or a predicted_label; '
                'prediction and gold ids probably do not match. '
                'First offending ids: {}'.format(len(incomplete), incomplete[:5])
            )

        records = list(id2results.values())
        results = self.label_score(records)
        # fever_score is unpacked as (score, accuracy, precision, recall, f1).
        score, accuracy, precision, recall, f1 = fever_score(records)
        results.update({
            'Evidence Precision': precision,
            'Evidence Recall': recall,
            'Evidence F1': f1,
            'FEVER Score': score,
            'Label Accuracy': accuracy
        })

        return results

    def label_score(self, results):
        """Compute per-label precision, recall and F1.

        Args:
            results: Sequence of records, each providing ``label`` (gold) and
                ``predicted_label``.

        Returns:
            A dict mapping ``'<label> Precision'``, ``'<label> Recall'`` and
            ``'<label> F1'`` to the corresponding score for every label in
            ``self.label2id``.
        """
        truth = np.array([v['label'] for v in results])
        prediction = np.array([v['predicted_label'] for v in results])
        labels = list(self.label2id)
        # Passing ``labels`` pins the order of the returned arrays, so index i
        # below always refers to labels[i].
        p, r, f, _ = precision_recall_fscore_support(truth, prediction, labels=labels)
        scores = {}
        for i, label in enumerate(labels):
            scores['{} Precision'.format(label)] = p[i]
            scores['{} Recall'.format(label)] = r[i]
            scores['{} F1'.format(label)] = f[i]

        return scores


if __name__ == '__main__':
    # Command-line entry point: score a prediction file against the default
    # gold file and print the metrics as indented JSON.
    import argparse

    parser = argparse.ArgumentParser(
        description='Score a FEVER prediction file and print the metrics as JSON.'
    )
    # The argument is required: without it get_scores would receive None and
    # fail while opening the file instead of reporting a usage error.
    parser.add_argument("--predicted_file", '-i', type=str, required=True,
                        help='path of the JSONL file containing the predictions')
    args = parser.parse_args()

    scorer = FeverScorer()
    results = scorer.get_scores(args.predicted_file)
    print(json.dumps(results, ensure_ascii=False, indent=4))