Spaces:

LightChen2333
/

OpenSLU

Runtime error

File size: 14,217 Bytes

'''
Author: Qiguang Chen
Date: 2023-01-11 10:39:26
LastEditors: Qiguang Chen
LastEditTime: 2023-02-17 19:39:22
Description: Metric calculation class

'''
from collections import Counter
from typing import List, Dict

import numpy as np
from sklearn.metrics import f1_score

from common.utils import InputData, OutputData


class Evaluator(object):
    """Library of static evaluation-metric functions for slot filling and intent detection.

    Supported metrics:
        - slot_f1
        - intent_acc
        - exactly_match_accuracy (reported under the key "EMA")
        - intent_f1 (macro averaged by default)
            - macro_intent_f1
            - micro_intent_f1
    """

    # (previous tag, current tag) pairs that open a new chunk regardless of the chunk type.
    _CHUNK_START_TRANSITIONS = frozenset({
        ('B', 'B'), ('I', 'B'), ('O', 'B'), ('O', 'I'),
        ('E', 'E'), ('E', 'I'), ('O', 'E'),
    })

    # (previous tag, current tag) pairs that close the running chunk regardless of the chunk type.
    _CHUNK_END_TRANSITIONS = frozenset({
        ('B', 'B'), ('B', 'O'), ('I', 'B'), ('I', 'O'),
        ('E', 'E'), ('E', 'I'), ('E', 'O'),
    })

    @staticmethod
    def exactly_match_accuracy(pred_slot: List[List[str or int]],
                               real_slot: List[List[str or int]],
                               pred_intent: List[List[str or int] or str or int],
                               real_intent: List[List[str or int] or str or int]) -> float:
        """Compute sentence-level accuracy: a sentence counts only if both its slots and its intent match.

        Slots and intents may be represented either as strings or as integer indexes.
        Multi-intent entries (lists) are compared as sets, so the order of intents is ignored.
        The four inputs are iterated in lockstep with ``zip``, so extra items in a longer
        input are ignored.

        Args:
            pred_slot (List[List[str or int]]): predicted sequence of slot list.
            real_slot (List[List[str or int]]): golden sequence of slot list.
            pred_intent (List[List[str or int] or str or int]): predicted intent list / predicted multi intent list.
            real_intent (List[List[str or int] or str or int]): golden intent list / golden multi intent list.

        Returns:
            float: exactly match accuracy score; 0.0 when there is nothing to evaluate.
        """
        total_count, correct_count = 0, 0
        for p_slot, r_slot, p_intent, r_intent in zip(pred_slot, real_slot, pred_intent, real_intent):
            if isinstance(p_intent, list):
                # Multi intent: order of the predicted intents must not matter.
                p_intent, r_intent = set(p_intent), set(r_intent)
            if p_slot == r_slot and p_intent == r_intent:
                correct_count += 1
            total_count += 1

        # Guard against empty input instead of raising ZeroDivisionError.
        if total_count == 0:
            return 0.0
        return correct_count / total_count

    @staticmethod
    def intent_accuracy(pred_list: List, real_list: List) -> float:
        """Get intent accuracy measured by predictions and ground truths.

        Supports both single intent (one label per sample) and multi intent
        (a list of labels per sample, compared as a set).

        Args:
            pred_list (List): predicted intent list
            real_list (List): golden intent list

        Returns:
            float: intent accuracy score; 0.0 when there is nothing to evaluate.
        """
        total_count, correct_count = 0, 0
        for p_intent, r_intent in zip(pred_list, real_list):
            if isinstance(p_intent, list):
                p_intent, r_intent = set(p_intent), set(r_intent)
            if p_intent == r_intent:
                correct_count += 1
            total_count += 1

        # Guard against empty input instead of raising ZeroDivisionError.
        if total_count == 0:
            return 0.0
        return correct_count / total_count

    @staticmethod
    def intent_f1(pred_list: List[List[int]], real_list: List[List[int]], num_intent: int, average='macro') -> float:
        """Get intent F1 measured by predictions and ground truths.

        Only the multi-intent representation is supported; single-intent data can be
        scored by wrapping each label in a list: [[intent1], [intent2], ...].

        Args:
            pred_list (List[List[int]]): predicted multi intent index list.
            real_list (List[List[int]]): golden multi intent index list.
            num_intent (int): total number of intent classes (width of the one-hot vectors).
            average (str): averaging mode forwarded to ``f1_score``; "micro" and "macro" are supported.

        Returns:
            float: intent F1 score
        """
        # f1_score expects (y_true, y_pred), hence golden labels go first.
        return f1_score(Evaluator.__instance2onehot(num_intent, real_list),
                        Evaluator.__instance2onehot(num_intent, pred_list),
                        average=average,
                        zero_division=0)

    @staticmethod
    def __multilabel2one_hot(labels, nums):
        """Turn a list of label indexes into a multi-hot list of length ``nums``.

        If the first element of ``labels`` is itself a list, only that first inner list is used.
        Negative indexes are skipped: ``__token_map`` uses -1 for labels missing from the
        label map, and writing ``res[-1]`` would wrongly switch on the last class.
        """
        res = [0.] * nums
        if len(labels) == 0:
            return res
        if isinstance(labels[0], list):
            labels = labels[0]
        for label in labels:
            if label < 0:
                continue
            res[label] = 1.
        return res

    @staticmethod
    def __instance2onehot(num_intent, data):
        """Convert every sample's intent index list into a multi-hot row and stack them into an array."""
        return np.array([Evaluator.__multilabel2one_hot(intents, num_intent) for intents in data])

    @staticmethod
    def __startOfChunk(prevTag, tag, prevTagType, tagType, chunkStart=False):
        """Return True if a chunk starts at the current token.

        A chunk starts on one of the known (prevTag, tag) transitions, or whenever the
        current token is not 'O'/'.' and its type differs from the previous token's type.
        A truthy ``chunkStart`` passed by the caller is returned unchanged.
        """
        if (prevTag, tag) in Evaluator._CHUNK_START_TRANSITIONS:
            chunkStart = True
        if tag != 'O' and tag != '.' and prevTagType != tagType:
            chunkStart = True
        return chunkStart

    @staticmethod
    def __endOfChunk(prevTag, tag, prevTagType, tagType, chunkEnd=False):
        """Return True if the chunk running up to the previous token ends before the current token.

        A chunk ends on one of the known (prevTag, tag) transitions, or whenever the
        previous token is not 'O'/'.' and its type differs from the current token's type.
        A truthy ``chunkEnd`` passed by the caller is returned unchanged.
        """
        if (prevTag, tag) in Evaluator._CHUNK_END_TRANSITIONS:
            chunkEnd = True
        if prevTag != 'O' and prevTag != '.' and prevTagType != tagType:
            chunkEnd = True
        return chunkEnd

    @staticmethod
    def __splitTagType(tag):
        """Split a slot label such as "B-xxx" into (tag, type); a label without '-' gets an empty type.

        Raises:
            ValueError: if the label contains more than one '-'.
        """
        s = tag.split('-')
        if len(s) > 2 or len(s) == 0:
            raise ValueError('tag format wrong. it must be B-xxx.xxx')
        if len(s) == 1:
            return s[0], ""
        return s[0], s[1]

    @staticmethod
    def computeF1Score(correct_slots: List[List[str]], pred_slots: List[List[str]]) -> float:
        """Compute the chunk-level slot F1 score (modified from conlleval.pl).

        Sentences, and tokens within a sentence, are paired with ``zip``, so extra items
        in a longer input are ignored. Every label is converted with ``str`` before parsing.

        Args:
            correct_slots (List[List[str]]): golden slot string list
            pred_slots (List[List[str]]): predicted slot string list

        Returns:
            float: slot f1 score; 0.0 when no chunk is matched.

        Raises:
            ValueError: if a label contains more than one '-'.
        """
        correct_chunk_cnt = 0.0   # chunks found in both golden and predicted sequences
        found_correct_cnt = 0.0   # chunks in the golden sequences
        found_pred_cnt = 0.0      # chunks in the predicted sequences

        for correct_slot, pred_slot in zip(correct_slots, pred_slots):
            # True while the golden and predicted chunk currently being read still agree.
            in_correct = False
            last_correct_tag, last_correct_type = 'O', ''
            last_pred_tag, last_pred_type = 'O', ''

            for c, p in zip(correct_slot, pred_slot):
                correct_tag, correct_type = Evaluator.__splitTagType(str(c))
                pred_tag, pred_type = Evaluator.__splitTagType(str(p))

                # The boundary predicates are pure, so evaluate each of them once per token.
                correct_end = Evaluator.__endOfChunk(last_correct_tag, correct_tag, last_correct_type, correct_type)
                pred_end = Evaluator.__endOfChunk(last_pred_tag, pred_tag, last_pred_type, pred_type)
                correct_start = Evaluator.__startOfChunk(last_correct_tag, correct_tag, last_correct_type,
                                                         correct_type)
                pred_start = Evaluator.__startOfChunk(last_pred_tag, pred_tag, last_pred_type, pred_type)

                if in_correct:
                    if correct_end and pred_end and last_correct_type == last_pred_type:
                        # Both chunks closed at the same position with the same type: a match.
                        in_correct = False
                        correct_chunk_cnt += 1.0
                    elif correct_end != pred_end or correct_type != pred_type:
                        # The chunks diverged (different boundary or different type).
                        in_correct = False

                if correct_start and pred_start and correct_type == pred_type:
                    in_correct = True

                if correct_start:
                    found_correct_cnt += 1.0
                if pred_start:
                    found_pred_cnt += 1.0

                last_correct_tag, last_correct_type = correct_tag, correct_type
                last_pred_tag, last_pred_type = pred_tag, pred_type

            # A chunk that is still matching when the sentence ends is also correct.
            if in_correct:
                correct_chunk_cnt += 1.0

        precision = correct_chunk_cnt / found_pred_cnt if found_pred_cnt > 0 else 0.0
        recall = correct_chunk_cnt / found_correct_cnt if found_correct_cnt > 0 else 0.0

        if (precision + recall) > 0:
            return (2.0 * precision * recall) / (precision + recall)
        return 0.0

    @staticmethod
    def max_freq_predict(sample):
        """Max frequency prediction: return the most common element of every item in ``sample``.

        Raises:
            IndexError: if an item of ``sample`` is empty.
        """
        return [Counter(items).most_common(1)[0][0] for items in sample]

    @staticmethod
    def __token_map(indexes, token_label_map):
        """Map every label of every sample through ``token_label_map``; unknown labels become -1."""
        return [[token_label_map.get(idx, -1) for idx in index] for index in indexes]

    @staticmethod
    def compute_all_metric(inps: "InputData",
                           output: "OutputData",
                           intent_label_map: dict = None,
                           metric_list: List = None) -> Dict:
        """Auto compute all metric mentioned in 'metric_list'

        Slot metrics are computed only when ``output.slot_ids`` is non-empty and intent
        metrics only when ``output.intent_ids`` is non-empty. The intent F1 metrics are
        computed only for multi-intent output (``output.intent_ids[0]`` is a list) and
        do not depend on "intent_acc" being requested as well.

        Args:
            inps (InputData): input golden slot and intent labels
            output (OutputData): output predicted slot and intent labels
            intent_label_map (dict, Optional): dict like {"intent1": 0, "intent2": 1, ...}, which aims to map
                intent string to index. Required for the intent F1 metrics.
            metric_list (List): support metrics in ["slot_f1", "intent_acc", "intent_f1", "macro_intent_f1",
                "micro_intent_f1", "EMA"]. Defaults to ["slot_f1", "intent_acc", "EMA"].

        Returns:
            Dict: all metric mentioned in 'metric_list', like {'EMA': 0.7, ...}

        Raises:
            ValueError: if an intent F1 metric is requested for multi-intent output without 'intent_label_map'.

        Example:
            if compute slot metric:

                inps.slot = [["slot1", "slot2", ...], ...]; output.slot_ids=[["slot1", "slot2", ...], ...];

            if compute intent metric:

                [Multi Intent] inps.intent = [["intent1", "intent2", ...], ...]; output.intent_ids = [["intent1", "intent2", ...], ...]

                [Single Intent] inps.intent = ["intent1", ...]; [Single Intent] output.intent_ids = ["intent1", ...]
        """
        if not metric_list:
            metric_list = ["slot_f1", "intent_acc", "EMA"]
        res_dict = {}
        use_slot = output.slot_ids is not None and len(output.slot_ids) > 0
        use_intent = output.intent_ids is not None and len(output.intent_ids) > 0

        if use_slot and "slot_f1" in metric_list:
            # Golden labels first, matching computeF1Score(correct_slots, pred_slots).
            res_dict["slot_f1"] = Evaluator.computeF1Score(inps.slot, output.slot_ids)

        if use_intent and "intent_acc" in metric_list:
            res_dict["intent_acc"] = Evaluator.intent_accuracy(output.intent_ids, inps.intent)

        # Metric name -> averaging mode; plain "intent_f1" is macro averaged.
        f1_averages = {"intent_f1": "macro", "macro_intent_f1": "macro", "micro_intent_f1": "micro"}
        requested_f1 = [name for name in f1_averages if name in metric_list]
        if use_intent and requested_f1 and isinstance(output.intent_ids[0], list):
            if intent_label_map is None:
                raise ValueError("'intent_label_map' is required to compute intent F1 metrics")
            # Map the intent labels to indexes once and reuse them for every requested average.
            pred_ids = Evaluator.__token_map(output.intent_ids, intent_label_map)
            real_ids = Evaluator.__token_map(inps.intent, intent_label_map)
            num_intent = len(intent_label_map)
            for name in requested_f1:
                res_dict[name] = Evaluator.intent_f1(pred_ids, real_ids, num_intent, average=f1_averages[name])

        if use_slot and use_intent and "EMA" in metric_list:
            res_dict["EMA"] = Evaluator.exactly_match_accuracy(output.slot_ids, inps.slot, output.intent_ids,
                                                               inps.intent)
        return res_dict