from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from dataclasses import dataclass

from transformers.modeling_outputs import (
    BaseModelOutput,
    ModelOutput,
)

# .............................................


@dataclass
class PosteriorDecoderModelOutput(ModelOutput):
    labels_padding_mask: torch.FloatTensor = None
    posterior_latents: torch.FloatTensor = None
    posterior_means: torch.FloatTensor = None
    posterior_log_variances: torch.FloatTensor = None
    latents_slice: torch.FloatTensor = None
    ids_slice: torch.FloatTensor = None
    waveform: torch.FloatTensor = None


# .............................................................................................


@dataclass
class VitsModelOutput(ModelOutput):
    waveform: torch.FloatTensor = None
    sequence_lengths: torch.FloatTensor = None
    spectrogram: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


# .............................................................................................


@dataclass
class VitsTrainingOutput(ModelOutput):
    waveform: torch.FloatTensor = None
    log_duration: torch.FloatTensor = None
    attn: torch.FloatTensor = None
    ids_slice: torch.FloatTensor = None
    input_padding_mask: torch.FloatTensor = None
    labels_padding_mask: torch.FloatTensor = None
    latents: torch.FloatTensor = None
    prior_latents: torch.FloatTensor = None
    prior_means: torch.FloatTensor = None
    prior_log_variances: torch.FloatTensor = None
    posterior_means: torch.FloatTensor = None
    posterior_log_variances: torch.FloatTensor = None


# .............................................................................................


@dataclass
class VitsTextEncoderOutput(ModelOutput):
    """
    Describes the outputs of the VITS text encoder model, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            The predicted mean values of the prior distribution for the latent text variables.
        prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            The predicted log-variance values of the prior distribution for the latent text variables.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding
            layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    last_hidden_state: torch.FloatTensor = None
    prior_means: torch.FloatTensor = None
    prior_log_variances: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


# .............................................................................................
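

# Minimal usage sketch (illustrative, not part of the original module): shows how
# these ModelOutput dataclasses are typically constructed and accessed. The tensor
# shapes below (batch_size=2, sequence_length=16, hidden_size=192) are assumptions
# chosen for demonstration only, not values mandated by the model.
if __name__ == "__main__":
    batch_size, sequence_length, hidden_size = 2, 16, 192

    encoder_out = VitsTextEncoderOutput(
        last_hidden_state=torch.zeros(batch_size, sequence_length, hidden_size),
        prior_means=torch.zeros(batch_size, sequence_length, hidden_size),
        prior_log_variances=torch.zeros(batch_size, sequence_length, hidden_size),
    )

    # ModelOutput supports attribute, key, and positional access interchangeably;
    # fields left at None are omitted from the key/index views.
    assert encoder_out.prior_means is encoder_out["prior_means"]
    assert encoder_out[0] is encoder_out.last_hidden_state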