# File size: 7,198 Bytes

import numpy as np
import torch

from evaluate import load as load_metric

from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm

# upper bound handed to `model.generate` as `max_length` in `evaluate`
MAX_TARGET_LENGTH = 128

# load evaluation metrics
# (done once, at import time; the resulting objects are used by `compute_metrics`)
sacrebleu = load_metric('sacrebleu')
rouge = load_metric('rouge')
meteor = load_metric('meteor')
bertscore = load_metric('bertscore')

# use gpu if it's available
# (batches are moved to this device in `evaluate`)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def flatten_list(l):
    """
    Utility function to collapse one level of nesting in a list of lists

    Params:
        l (list of lists): nested list whose sublists should be concatenated
    Returns:
        A new list holding the elements of every sublist, in their original order
    """
    flattened = []
    for sublist in l:
        # extend consumes the sublist element by element, preserving order
        flattened.extend(sublist)
    return flattened

def parse_float(value):
    """
    Utility function to parse a string into a float

    Params:
        value (string): value to be converted to float
    Returns:
        The float representation of the given value, or None if the value could
        not be converted to a float (malformed string, None, or any other type
        that `float()` rejects)
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: string that is not a number (e.g. 'abc', '')
        # TypeError: value of an unsupported type (e.g. None, list)
        return None

def extract_scores(predictions):
    """
    Utility function to extract the scores from the predictions of the model

    The score is expected to be the first whitespace-delimited token of each
    prediction (e.g. '0.5 Feedback: ...').

    Params:
        predictions (list): complete model predictions (strings)
    Returns:
        scores (list): extracted scores from the model's predictions; an entry
        is None when the prediction is empty/blank or its first token is not
        a number
    """
    scores = []
    for pred in predictions:
        # split on any run of whitespace (not only a single space) so that
        # leading whitespace or a tab/newline after the score does not hide it
        tokens = pred.split(maxsplit=1)
        # an empty or whitespace-only prediction yields no tokens at all
        scores.append(parse_float(tokens[0]) if tokens else None)

    return scores

def extract_feedback(predictions):
    """
    Utility function to pull the feedback text out of the model's predictions

    For every prediction the feedback is, in order of preference:
        1. everything after the first ':'
        2. everything after the first space (when there is no ':')
        3. the whole prediction (when there is neither)
    The chosen text is stripped of surrounding whitespace.

    Params:
        predictions (list): complete model predictions
    Returns:
        feedback (list): extracted feedback from the model's predictions
    """
    def _feedback_of(pred):
        # preferred case: text following the first colon
        _, colon, after_colon = pred.partition(':')
        if colon:
            return after_colon
        # fallback: drop the first space-delimited token
        _, space, after_space = pred.partition(' ')
        if space:
            return after_space
        # last resort: keep the prediction untouched
        return pred

    return [_feedback_of(pred).strip() for pred in predictions]

def compute_rmse(predictions, labels):
    """
    Utility function to compute the root mean squared error of the
    score predictions in relation to the golden label scores

    A prediction is considered invalid when it is None or not a finite number
    (NaN / +-inf); invalid predictions are excluded from the error. The rmse
    is capped at 1, and is set to 1 when there is no valid prediction at all.

    Params:
        predictions (list): model score predictions (floats or None)
        labels (list): golden label scores
    Returns:
        (float, int): rmse of valid samples and number of invalid samples
        (number of labels minus number of valid predictions)
    """
    # represent missing predictions as NaN so that a single finite-check
    # filters out both None and non-finite scores
    predicted = np.array(
        [np.nan if p is None else p for p in predictions], dtype=float)
    valid_idx = np.flatnonzero(np.isfinite(predicted))

    label_array = np.asarray(labels, dtype=float)
    num_invalid = label_array.size - valid_idx.size

    # nothing to score: fall back to the maximum (capped) error
    if valid_idx.size == 0:
        return 1.0, num_invalid

    # rmse is computed directly with numpy; the `squared=False` argument of
    # sklearn's mean_squared_error is no longer available in recent releases
    errors = label_array[valid_idx] - predicted[valid_idx]
    rmse = float(np.sqrt(np.mean(np.square(errors))))

    # cap rmse at 1
    return min(rmse, 1.0), num_invalid

def compute_metrics(predictions, labels):
    """
    Compute evaluation metrics from the predictions of the model

    Params:
        predictions (list): complete model predictions (decoded strings)
        labels (list): golden labels as strings of the form
            '<score> Feedback: <feedback text>'
    Returns:
        results (dict): dictionary with the computed evaluation metrics
        ('sacrebleu', 'rouge', 'meteor', 'bert_score', 'rmse')
    """
    # split the model output into its feedback and score parts
    predicted_feedback = extract_feedback(predictions)
    predicted_scores = extract_scores(predictions)

    # split every golden label once around the first 'Feedback:' marker:
    # the text after it is the feedback, the text before it is the score
    label_parts = [x.split('Feedback:', 1) for x in labels]
    reference_feedback = [parts[1].strip() for parts in label_parts]
    reference_scores = [float(parts[0].strip()) for parts in label_parts]

    results = {}

    # text-overlap metrics on the feedback
    # (sacrebleu expects a list of references per prediction)
    results['sacrebleu'] = sacrebleu.compute(
        predictions=predicted_feedback,
        references=[[x] for x in reference_feedback],
    )['score']
    results['rouge'] = rouge.compute(
        predictions=predicted_feedback,
        references=reference_feedback,
    )['rouge2']
    results['meteor'] = meteor.compute(
        predictions=predicted_feedback,
        references=reference_feedback,
    )['meteor']
    bert_output = bertscore.compute(
        predictions=predicted_feedback,
        references=reference_feedback,
        lang='de',
        model_type='bert-base-multilingual-cased',
        rescale_with_baseline=True)

    # rmse of the score predictions; the count of invalid samples is not reported
    rmse = compute_rmse(predicted_scores, reference_scores)[0]

    # bertscore returns one f1 per sample; report their mean as a python float
    results['bert_score'] = np.array(bert_output['f1']).mean().item()
    results['rmse'] = rmse

    return results

def evaluate(model, tokenizer, dataloader):
    """
    Evaluate model on the given dataset

    Params:
        model (PreTrainedModel): seq2seq model
        tokenizer (PreTrainedTokenizer): tokenizer from HuggingFace
        dataloader (torch Dataloader): dataloader of the dataset to be used for
            evaluation; each batch must provide 'input_ids', 'attention_mask'
            and 'labels' tensors
    Returns:
        results (dict): dictionary with the computed evaluation metrics
        predictions (list): list of the decoded predictions of the model
    """
    predictions, labels = [], []
    pad_token_id = getattr(tokenizer, 'pad_token_id', None)

    model.eval()
    # no gradients are needed anywhere during evaluation
    with torch.no_grad():
        # iterate through batches in the dataloader
        for batch in tqdm(dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            # generate tokens from batch
            generated_tokens = model.generate(
                batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_length=MAX_TARGET_LENGTH
            )

            # get golden labels from batch; label positions padded with -100
            # (the loss ignore index commonly used by seq2seq collators) are
            # not valid token ids and cannot be decoded, so map them to the
            # pad token, which skip_special_tokens then drops
            labels_batch = batch['labels']
            if pad_token_id is not None:
                labels_batch = labels_batch.masked_fill(
                    labels_batch == -100, pad_token_id)

            # decode model predictions and golden labels, accumulating them
            # directly into flat lists
            predictions.extend(
                tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))
            labels.extend(
                tokenizer.batch_decode(labels_batch, skip_special_tokens=True))

    # compute metrics based on predictions and golden labels
    results = compute_metrics(predictions, labels)

    return results, predictions