Upload 2 files
Here is the prediction source
- model.py +111 -0
- predict.py +412 -0
model.py
ADDED
@@ -0,0 +1,111 @@
import warnings
from typing import Optional, Tuple

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from transformers import Wav2Vec2Model, Wav2Vec2PreTrainedModel
from transformers.modeling_outputs import CausalLMOutput
from torch import nn


class Wav2Vec2ForCTCnCLS(Wav2Vec2PreTrainedModel):
    """Wav2Vec2 with two heads: a CTC head for transcription and a classification head for emotion."""

    def __init__(self, config, cls_len=2, alpha=0.01):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2Model(config)
        self.dropout = nn.Dropout(config.final_dropout)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)  # CTC head
        self.cls_head = nn.Linear(config.hidden_size, cls_len)           # classification head
        self.init_weights()
        self.alpha = alpha  # weight of the CTC loss in the combined loss

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()

    def _ctc_loss(self, logits, labels, input_values, attention_mask=None):
        loss = None
        if labels is not None:
            # retrieve loss input_lengths from attention_mask
            attention_mask = (
                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
            )
            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))

            # assuming that padded tokens are filled with -100
            # when not being attended to
            labels_mask = labels >= 0
            target_lengths = labels_mask.sum(-1)
            flattened_targets = labels.masked_select(labels_mask)

            log_probs = F.log_softmax(logits, dim=-1).transpose(0, 1)

            with torch.backends.cudnn.flags(enabled=False):
                loss = F.ctc_loss(
                    log_probs,
                    flattened_targets,
                    input_lengths,
                    target_lengths,
                    blank=self.config.pad_token_id,
                    reduction=self.config.ctc_loss_reduction,
                    zero_infinity=self.config.ctc_zero_infinity,
                )

        return loss

    def _cls_loss(self, logits, cls_labels):
        # logits come from mean-pooling hidden_states over dim 1 (the sequence length)
        # and feeding the result into self.cls_head
        loss = None
        if cls_labels is not None:
            loss = F.cross_entropy(logits, cls_labels.to(logits.device))
        return loss

    def forward(
        self,
        input_values,
        attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        labels=None,  # tuple: (ctc_labels, cls_labels), shape=(batch_size, target_length)
        if_ctc=True,
        if_cls=True,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]  # the last layer's hidden states
        hidden_states = self.dropout(hidden_states)

        logits_ctc = self.lm_head(hidden_states)
        logits_cls = self.cls_head(torch.mean(hidden_states, dim=1))

        loss = None
        if labels is not None:
            # fall back to 0.0 so the combined loss is defined even if one head is disabled
            loss_ctc = self._ctc_loss(logits_ctc, labels[0], input_values, attention_mask) if if_ctc else 0.0
            loss_cls = self._cls_loss(logits_cls, labels[1]) if if_cls else 0.0
            loss = loss_cls + self.alpha * loss_ctc

        # if not return_dict:
        #     output = (logits,) + outputs[1:]
        #     return ((loss,) + output) if loss is not None else output

        return CausalLMOutput(
            loss=loss,
            logits=(logits_ctc, logits_cls),
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
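The forward pass returns a CausalLMOutput whose `logits` field is the tuple `(logits_ctc, logits_cls)`, and when a `(ctc_labels, cls_labels)` tuple is passed the loss is `loss_cls + alpha * loss_ctc`. A minimal sketch of calling the model on a dummy batch follows; the checkpoint name `facebook/wav2vec2-base-960h`, the dummy shapes, and the label values are assumptions for illustration, not part of this upload.

# Sketch: forward pass of the dual-head model on a dummy batch (illustrative only).
import torch
from model import Wav2Vec2ForCTCnCLS

# cls_len/alpha are forwarded to __init__ by from_pretrained; the base checkpoint is an assumption.
model = Wav2Vec2ForCTCnCLS.from_pretrained("facebook/wav2vec2-base-960h", cls_len=2, alpha=0.01)

batch = torch.randn(2, 16000)                      # 2 clips of 1 s of 16 kHz audio
ctc_labels = torch.full((2, 10), -100)             # padded transcription labels (-100 = ignored)
ctc_labels[:, :5] = torch.randint(5, 30, (2, 5))   # a few fake token ids per clip
cls_labels = torch.tensor([0, 1])                  # e.g. neutral / angry

with torch.no_grad():
    out = model(batch, labels=(ctc_labels, cls_labels))

logits_ctc, logits_cls = out.logits                # (batch, frames, vocab), (batch, cls_len)
print(out.loss, logits_cls.softmax(dim=-1))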
predict.py
ADDED
@@ -0,0 +1,412 @@
#!/usr/bin/env python3
import logging
import pathlib
import re
import sys
import time
import csv
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Set, Union

import datasets
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from packaging import version
from torch.cuda.amp import GradScaler, autocast

import librosa
from lang_trans import arabic
from datasets import Dataset

import soundfile as sf
from model import Wav2Vec2ForCTCnCLS
from transformers.trainer_utils import get_last_checkpoint

from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    is_apex_available,
    trainer_utils,
)


local_model_path = "local_model"

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


logger = logging.getLogger(__name__)


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        default="local_model",
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    freeze_feature_extractor: Optional[bool] = field(
        default=False, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
    )
    verbose_logging: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to log verbose messages or not."},
    )
    tokenizer: Optional[str] = field(
        default="checkpoint-33000",
        metadata={"help": "Path to pretrained tokenizer"},
    )


def configure_logger(model_args: ModelArguments, training_args: TrainingArguments):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logging_level = logging.WARNING
    if model_args.verbose_logging:
        logging_level = logging.DEBUG
    elif trainer_utils.is_main_process(training_args.local_rank):
        logging_level = logging.INFO
    logger.setLevel(logging_level)


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    dataset_name: str = field(
        default="emotion", metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_split_name: Optional[str] = field(
        default="train",
        metadata={
            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
        },
    )
    validation_split_name: Optional[str] = field(
        default="validation",
        metadata={
            "help": "The name of the validation data set split to use (via the datasets library). Defaults to 'validation'"
        },
    )
    target_text_column: Optional[str] = field(
        default="text",
        metadata={"help": "Column in the dataset that contains label (target text). Defaults to 'text'"},
    )
    speech_file_column: Optional[str] = field(
        default="file",
        metadata={"help": "Column in the dataset that contains speech file path. Defaults to 'file'"},
    )
    target_feature_extractor_sampling_rate: Optional[bool] = field(
        default=False,
        metadata={"help": "Resample loaded audio to target feature extractor's sampling rate or not."},
    )
    max_duration_in_seconds: Optional[float] = field(
        default=None,
        metadata={"help": "Filters out examples longer than specified. Defaults to no filtering."},
    )
    orthography: Optional[str] = field(
        default="librispeech",
        metadata={
            "help": "Orthography used for normalization and tokenization: 'librispeech' (default), 'timit', or 'buckwalter'."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=8,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    output_file: Optional[str] = field(
        default=None,
        metadata={"help": "Output file."},
    )


@dataclass
class Orthography:
    """
    Orthography scheme used for text normalization and tokenization.

    Args:
        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to accept lowercase input and lowercase the output when decoding.
        vocab_file (:obj:`str`, `optional`, defaults to :obj:`None`):
            File containing the vocabulary.
        word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`"|"`):
            The token used for delimiting words; it needs to be in the vocabulary.
        translation_table (:obj:`Dict[str, str]`, `optional`, defaults to :obj:`{}`):
            Table to use with `str.translate()` when preprocessing text (e.g., "-" -> " ").
        words_to_remove (:obj:`Set[str]`, `optional`, defaults to :obj:`set()`):
            Words to remove when preprocessing text (e.g., "sil").
        untransliterator (:obj:`Callable[[str], str]`, `optional`, defaults to :obj:`None`):
            Function that untransliterates text back into native writing system.
        tokenizer (:obj:`str`, `optional`, defaults to :obj:`None`):
            Tokenizer type, e.g., 'jieba' for Chinese.
    """

    do_lower_case: bool = False
    vocab_file: Optional[str] = None
    word_delimiter_token: Optional[str] = "|"
    translation_table: Optional[Dict[str, str]] = field(default_factory=dict)
    words_to_remove: Optional[Set[str]] = field(default_factory=set)
    tokenizer: Optional[str] = None
    untransliterator: Optional[Callable[[str], str]] = None

    @classmethod
    def from_name(cls, name: str):
        if name == "librispeech":
            return cls()
        else:
            raise ValueError(f"Unsupported orthography: '{name}'.")

    def create_processor(self, model_args: ModelArguments) -> Wav2Vec2Processor:
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            local_model_path, cache_dir=model_args.cache_dir
        )
        if self.vocab_file:
            tokenizer = Wav2Vec2CTCTokenizer(
                self.vocab_file,
                cache_dir=model_args.cache_dir,
                do_lower_case=self.do_lower_case,
                word_delimiter_token=self.word_delimiter_token,
            )
        else:
            tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
                local_model_path,
                # self.tokenizer,
                cache_dir=model_args.cache_dir,
                do_lower_case=self.do_lower_case,
                word_delimiter_token=self.word_delimiter_token,
            )
        return Wav2Vec2Processor(feature_extractor, tokenizer)


@dataclass
class TrainingArguments(TrainingArguments):
    # Override defaults of transformers.TrainingArguments for prediction-only runs.
    output_dir: str = field(
        default="output/angry_tmp", metadata={"help": "The output directory for predictions and checkpoints."})
    do_predict: bool = field(
        default=True, metadata={"help": "Whether to run prediction."})
    do_eval: bool = field(
        default=False, metadata={"help": "Whether to run evaluation."})
    overwrite_output_dir: bool = field(
        default=True, metadata={"help": "Overwrite the content of the output directory."})
    per_device_eval_batch_size: int = field(
        default=2, metadata={"help": "Batch size per device for evaluation/prediction."})
    warmup_ratio: float = field(
        default=0.1, metadata={"help": "Linear warmup over warmup_ratio fraction of total steps."}
    )


@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None
    audio_only = False
    duration = 6          # seconds
    sample_rate = 16000   # Hz

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            # max_length=self.max_length,
            max_length=self.duration * self.sample_rate,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        return batch


class CTCTrainer(Trainer):
    def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[str, Union[torch.Tensor, Any]]:
        self.use_amp = False
        self.use_apex = False
        self.deepspeed = False
        self.scaler = GradScaler()
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                kwargs = dict(device=self.args.device)
                if self.deepspeed and inputs[k].dtype != torch.int64:
                    kwargs.update(dict(dtype=self.args.hf_deepspeed_config.dtype()))
                inputs[k] = v.to(**kwargs)

        if self.args.past_index >= 0 and self._past is not None:
            inputs["mems"] = self._past

        return inputs


def create_dataset(audio_path):
    # Wrap a single audio file path in a one-row datasets.Dataset.
    data = {
        "file": [audio_path]
    }
    dataset = Dataset.from_dict(data)
    return dataset


def execute_angry_predict(audio_path):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    target_sr = 16000

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    configure_logger(model_args, training_args)

    orthography = Orthography.from_name(data_args.orthography.lower())
    orthography.tokenizer = model_args.tokenizer
    processor = orthography.create_processor(model_args)

    if data_args.dataset_name == "emotion":
        val_dataset = create_dataset(audio_path)
        cls_label_map = {"neutral": 0, "angry": 1}

    model = Wav2Vec2ForCTCnCLS.from_pretrained(
        local_model_path,
        gradient_checkpointing=True,  # training_args.gradient_checkpointing,
        cls_len=len(cls_label_map),
    )

    def prepare_example(example, audio_only=False):  # TODO(elgeish) make use of multiprocessing?
        example["speech"], example["sampling_rate"] = librosa.load(example[data_args.speech_file_column], sr=target_sr)
        orig_sample_rate = example["sampling_rate"]
        target_sample_rate = target_sr
        if orig_sample_rate != target_sample_rate:
            example["speech"] = librosa.resample(example["speech"], orig_sr=orig_sample_rate, target_sr=target_sample_rate)
        if data_args.max_duration_in_seconds is not None:
            example["duration_in_seconds"] = len(example["speech"]) / example["sampling_rate"]
        return example

    if training_args.do_predict:
        val_dataset = val_dataset.map(prepare_example, fn_kwargs={"audio_only": True})

    def prepare_dataset(batch, audio_only=False):
        # check that all files have the correct sampling rate
        assert (
            len(set(batch["sampling_rate"])) == 1
        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

        batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
        return batch

    if training_args.do_predict:
        val_dataset = val_dataset.map(
            prepare_dataset,
            fn_kwargs={"audio_only": True},
            batch_size=training_args.per_device_eval_batch_size,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
        )

    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

    if model_args.freeze_feature_extractor:
        model.freeze_feature_extractor()

    trainer = CTCTrainer(
        model=model,
        args=training_args,
        eval_dataset=val_dataset,
        data_collator=data_collator,  # pad each batch to duration * sample_rate samples
        tokenizer=processor.feature_extractor,
    )

    if training_args.do_predict:
        logger.info("******* Predict ********")
        data_collator.audio_only = True
        results = {}
        result = ""
        predictions, labels, metrics = trainer.predict(val_dataset, metric_key_prefix="predict")
        logits_ctc, logits_cls = predictions
        pred_ids = np.argmax(logits_cls, axis=-1)
        if pred_ids == 0:
            result = "not angry"
        if pred_ids == 1:
            result = "angry"
        results[audio_path] = result
        print("results", results)


if __name__ == "__main__":
    audio_path = "audio.mp3"
    execute_angry_predict(audio_path)
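Because predict.py hard-codes the audio path in `__main__` and reads every other option through `HfArgumentParser` (i.e., from `sys.argv`), it can also be driven from another script. A minimal sketch follows; the file path `clips/sample.wav` and the overridden flags are assumptions for illustration, and the fine-tuned checkpoint is expected under `local_model/` as in the code above.

# Sketch: driving the prediction entry point programmatically (illustrative only).
import sys

from predict import execute_angry_predict

# HfArgumentParser parses sys.argv, so dataclass defaults can be overridden here.
sys.argv = [
    "predict.py",
    "--output_dir", "output/angry_tmp",
    "--per_device_eval_batch_size", "1",
]

# Prints e.g.: results {'clips/sample.wav': 'angry'}
execute_angry_predict("clips/sample.wav")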