# Hugging Face Space: faori/HTK (Space status when captured: "Runtime error")
# File size: 16,400 bytes; revision 550665c

import numpy as np
from .constants import (
    QUESTION_COLUMN_NAME,
    CONTEXT_COLUMN_NAME,
    ANSWER_COLUMN_NAME,
    ANSWERABLE_COLUMN_NAME,
    ID_COLUMN_NAME
)

def get_sketch_features(
    tokenizer,
    mode,
    data_args
):
    """
    Build the feature-preparation function for the sketch model.

    Args:
        tokenizer: Callable tokenizer applied to batches of text pairs. It must
            expose ``padding_side`` and ``model_max_length``.
        mode (str): Mode of operation ("train", "eval", or "test").
        data_args: Object exposing the attributes ``max_seq_length``,
            ``doc_stride``, ``return_token_type_ids`` and ``pad_to_max_length``.

    Returns:
        tuple: ``(prepare_fn, True)`` where ``prepare_fn`` maps a batch of
        examples to tokenized features. The second element is always ``True``.

    Raises:
        ValueError: If ``mode`` is not one of "train", "eval", or "test".
    """
    pad_on_right = tokenizer.padding_side == "right"
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def tokenize_fn(examples):
        """
        Tokenize a batch of question/context pairs.

        Args:
            examples (dict): Batch of input examples, keyed by column name.

        Returns:
            The tokenizer output, including the overflow-to-sample mapping and
            the offset mapping.
        """
        # The column order follows the tokenizer's padding side: the question
        # comes first when padding on the right, the context otherwise.
        first_column = QUESTION_COLUMN_NAME if pad_on_right else CONTEXT_COLUMN_NAME
        second_column = CONTEXT_COLUMN_NAME if pad_on_right else QUESTION_COLUMN_NAME

        # Strip leading and trailing whitespace from both texts.
        first_texts = [text.strip() for text in examples[first_column]]
        second_texts = [text.strip() for text in examples[second_column]]

        # Only the context side is truncated; the overflow is returned as
        # additional features that overlap by ``doc_stride`` tokens.
        return tokenizer(
            first_texts,
            second_texts,
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_token_type_ids=data_args.return_token_type_ids,
            padding="max_length" if data_args.pad_to_max_length else False,
        )

    def build_labels(examples, sample_mapping):
        """Return one label per feature: 1 when the source example's flag is truthy, else 0."""
        # The value stored under ANSWERABLE_COLUMN_NAME is treated as an
        # "is impossible" flag: truthy -> 1 (not answerable), falsy -> 0.
        flags = examples[ANSWERABLE_COLUMN_NAME]
        return [1 if flags[sample_index] else 0 for sample_index in sample_mapping]

    def build_example_ids(examples, sample_mapping):
        """Return, for each feature, the id of the example it was cut from."""
        ids = examples[ID_COLUMN_NAME]
        return [ids[sample_index] for sample_index in sample_mapping]

    def prepare_train_features(examples):
        """
        Tokenize a batch and attach classification labels.

        Args:
            examples (dict): Batch of input examples.

        Returns:
            Tokenized examples with an added ``labels`` list.
        """
        tokenized_examples = tokenize_fn(examples)
        # One example may yield several features; this maps feature -> example.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        tokenized_examples["labels"] = build_labels(examples, sample_mapping)
        return tokenized_examples

    def prepare_eval_features(examples):
        """
        Tokenize a batch and attach example ids and classification labels.

        Args:
            examples (dict): Batch of input examples.

        Returns:
            Tokenized examples with added ``example_id`` and ``labels`` lists.
        """
        tokenized_examples = tokenize_fn(examples)
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        tokenized_examples["example_id"] = build_example_ids(examples, sample_mapping)
        tokenized_examples["labels"] = build_labels(examples, sample_mapping)
        return tokenized_examples

    def prepare_test_features(examples):
        """
        Tokenize a batch and attach example ids (no labels).

        Args:
            examples (dict): Batch of input examples.

        Returns:
            Tokenized examples with an added ``example_id`` list.
        """
        tokenized_examples = tokenize_fn(examples)
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        tokenized_examples["example_id"] = build_example_ids(examples, sample_mapping)
        return tokenized_examples

    feature_fns = {
        "train": prepare_train_features,
        "eval": prepare_eval_features,
        "test": prepare_test_features,
    }
    if mode not in feature_fns:
        # Fail with a clear message instead of an UnboundLocalError.
        raise ValueError(
            f"mode must be one of 'train', 'eval' or 'test', got {mode!r}"
        )

    return feature_fns[mode], True

def get_intensive_features(
    tokenizer,
    mode,
    data_args
):
    """
    Build the feature-preparation function for the intensive model.

    Args:
        tokenizer: Callable tokenizer applied to batches of text pairs. It must
            expose ``padding_side``, ``model_max_length`` and ``cls_token_id``,
            and its output must provide ``sequence_ids(i)``.
        mode (str): The mode of operation. Must be one of "train", "eval", or "test".
        data_args (DataArguments): Object exposing ``max_seq_length``,
            ``doc_stride``, ``return_token_type_ids``, ``pad_to_max_length``
            and ``intensive_model_type``.

    Returns:
        tuple: ``(prepare_fn, True)``. "train" selects the training preparation;
        both "eval" and "test" select the evaluation preparation. The second
        element is always ``True`` (it is not derived from the beam-based setting).

    Raises:
        ValueError: If the mode is not one of "train", "eval", or "test".
    """
    pad_on_right = tokenizer.padding_side == "right"
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
    beam_based = data_args.intensive_model_type in ["xlnet", "xlm"]
    # Sequence id carried by context tokens: the context is the second text
    # when padding on the right and the first text otherwise.
    context_index = 1 if pad_on_right else 0

    def tokenize_fn(examples):
        """
        Tokenize a batch of question/context pairs.

        Args:
            examples (dict): Batch of input examples, keyed by column name.

        Returns:
            The tokenizer output, including the overflow-to-sample mapping and
            the offset mapping.
        """
        # Only the context side is truncated; the overflow is returned as
        # additional features that overlap by ``doc_stride`` tokens.
        return tokenizer(
            examples[QUESTION_COLUMN_NAME if pad_on_right else CONTEXT_COLUMN_NAME],
            examples[CONTEXT_COLUMN_NAME if pad_on_right else QUESTION_COLUMN_NAME],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_token_type_ids=data_args.return_token_type_ids,
            padding="max_length" if data_args.pad_to_max_length else False,
        )

    def build_p_mask(sequence_ids, cls_index):
        """Return the p_mask: 0.0 for context tokens and the CLS token, 1.0 elsewhere."""
        # enumerate() yields (token position, sequence id): the sequence id is
        # compared with ``context_index`` and the position with ``cls_index``.
        # The CLS token stays unmasked so an empty answer can be predicted.
        return [
            0.0 if sequence_id == context_index or position == cls_index else 1.0
            for position, sequence_id in enumerate(sequence_ids)
        ]

    def prepare_train_features(examples):
        """
        Tokenize a batch and attach answer-span labels.

        Args:
            examples (dict): Batch of input examples.

        Returns:
            Tokenized examples with ``start_positions``, ``end_positions`` and
            ``is_impossibles`` (plus ``cls_index`` and ``p_mask`` when the
            model type is beam-based).
        """
        tokenized_examples = tokenize_fn(examples)
        # One example may yield several features; this maps feature -> example.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # Character offsets of every token, needed to locate the answer span.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []
        tokenized_examples["is_impossibles"] = []

        if beam_based:
            tokenized_examples["cls_index"] = []
            tokenized_examples["p_mask"] = []

        for i, offsets in enumerate(offset_mapping):
            # Impossible answers are labeled with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            sequence_ids = tokenized_examples.sequence_ids(i)

            if beam_based:
                tokenized_examples["cls_index"].append(cls_index)
                tokenized_examples["p_mask"].append(
                    build_p_mask(sequence_ids, cls_index)
                )

            sample_index = sample_mapping[i]
            answers = examples[ANSWER_COLUMN_NAME][sample_index]
            is_impossible = examples[ANSWERABLE_COLUMN_NAME][sample_index]

            # If no answers are given, set the cls_index as answer.
            if is_impossible or len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                tokenized_examples["is_impossibles"].append(1.0)  # unanswerable
                continue

            # Character span of the first answer in the context text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # First token of the context within this feature.
            token_start_index = 0
            while sequence_ids[token_start_index] != context_index:
                token_start_index += 1

            # Last token of the context within this feature.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != context_index:
                token_end_index -= 1

            answer_in_feature = (
                offsets[token_start_index][0] <= start_char
                and offsets[token_end_index][1] >= end_char
            )
            if not answer_in_feature:
                # The answer lies outside this feature's context window, so the
                # feature is labeled with the CLS index.
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                tokenized_examples["is_impossibles"].append(1.0)  # unanswerable in this feature
                continue

            # Move token_start_index and token_end_index to the two ends of
            # the answer.
            # Note: we could go after the last offset if the answer is the
            # last word (edge case).
            while (token_start_index < len(offsets) and
                   offsets[token_start_index][0] <= start_char):
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)

            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)
            tokenized_examples["is_impossibles"].append(0.0)  # answerable

        return tokenized_examples

    def prepare_eval_features(examples):
        """
        Tokenize a batch and attach example ids and context-only offsets.

        Args:
            examples (dict): Batch of input examples.

        Returns:
            Tokenized examples with ``example_id`` and an ``offset_mapping``
            whose non-context entries are ``None`` (plus ``cls_index`` and
            ``p_mask`` when the model type is beam-based).
        """
        tokenized_examples = tokenize_fn(examples)
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        tokenized_examples["example_id"] = []

        if beam_based:
            tokenized_examples["cls_index"] = []
            tokenized_examples["p_mask"] = []

        for i, input_ids in enumerate(tokenized_examples["input_ids"]):
            cls_index = input_ids.index(tokenizer.cls_token_id)
            sequence_ids = tokenized_examples.sequence_ids(i)

            if beam_based:
                tokenized_examples["cls_index"].append(cls_index)
                tokenized_examples["p_mask"].append(
                    build_p_mask(sequence_ids, cls_index)
                )

            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(
                examples[ID_COLUMN_NAME][sample_index]
            )

            # Set to None the offsets that are not part of the context so it
            # is easy to tell whether a token position belongs to the context.
            tokenized_examples["offset_mapping"][i] = [
                (offset if sequence_ids[k] == context_index else None)
                for k, offset in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    feature_fns = {
        "train": prepare_train_features,
        "eval": prepare_eval_features,
        # Test mode reuses the evaluation preparation.
        "test": prepare_eval_features,
    }
    if mode not in feature_fns:
        # Fail with the documented ValueError instead of an UnboundLocalError.
        raise ValueError(
            f"mode must be one of 'train', 'eval' or 'test', got {mode!r}"
        )

    return feature_fns[mode], True