# Source: Hugging Face Space "atsushieee/sovits-test" (status: Running)
# File size: 26,254 bytes
import warnings

import numpy as np
import resampy
import torch
import tqdm

import crepe


# Names exported by a star-import of this module: the constants, the
# prediction and embedding entry points, and the step-by-step components.
__all__ = ['CENTS_PER_BIN',
           'MAX_FMAX',
           'PITCH_BINS',
           'SAMPLE_RATE',
           'WINDOW_SIZE',
           'UNVOICED',
           'embed',
           'embed_from_file',
           'embed_from_file_to_file',
           'embed_from_files_to_files',
           'infer',
           'predict',
           'predict_from_file',
           'predict_from_file_to_file',
           'predict_from_files_to_files',
           'preprocess',
           'postprocess',
           'resample']


###############################################################################
# Constants
###############################################################################


# Presumably the width of one pitch bin, in cents (not referenced in the
# visible part of this file -- confirm against the conversion helpers).
CENTS_PER_BIN = 20  # cents
# Default upper frequency bound used by predict() and postprocess().
MAX_FMAX = 2006.  # hz
# Number of pitch bins per frame; used to reshape the network output.
PITCH_BINS = 360
# Sample rate that preprocess()/resample() convert the input audio to.
SAMPLE_RATE = 16000  # hz
# Length of each frame handed to the network; also determines the padding.
WINDOW_SIZE = 1024  # samples
# Presumably the value used to mark unvoiced frames (not referenced in the
# visible part of this file -- confirm against callers).
UNVOICED = np.nan


###############################################################################
# Crepe pitch prediction
###############################################################################


def predict(audio,
            sample_rate,
            hop_length=None,
            fmin=50.,
            fmax=MAX_FMAX,
            model='full',
            decoder=crepe.decode.viterbi,
            return_harmonicity=False,
            return_periodicity=False,
            batch_size=None,
            device='cpu',
            pad=True):
    """Performs pitch estimation

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signal
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence. Emits a
            DeprecationWarning and behaves like return_periodicity.
        return_periodicity (bool)
            Whether to also return the network confidence
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        (Optional) periodicity (torch.tensor
                                [shape=(1, 1 + int(time // hop_length))])
    """
    # Deprecate return_harmonicity
    if return_harmonicity:
        message = (
            'The crepe return_harmonicity argument is deprecated and '
            'will be removed in a future release. Please use '
            'return_periodicity. Rationale: if network confidence measured '
            'harmonics, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        return_periodicity = return_harmonicity

    results = []

    # Postprocessing breaks gradients, so just don't compute them
    with torch.no_grad():

        # Preprocess audio into batches of frames
        generator = preprocess(audio,
                               sample_rate,
                               hop_length,
                               batch_size,
                               device,
                               pad)
        for frames in generator:

            # Infer independent probabilities for each pitch bin
            probabilities = infer(frames, model)

            # shape=(batch, 360, time / hop_length)
            probabilities = probabilities.reshape(
                audio.size(0), -1, PITCH_BINS).transpose(1, 2)

            # Convert probabilities to F0 and periodicity. The deprecated
            # flag was already folded into return_periodicity above, so it
            # is not forwarded: postprocess would otherwise repeat the
            # deprecation warning once per batch.
            result = postprocess(probabilities,
                                 fmin,
                                 fmax,
                                 decoder,
                                 False,
                                 return_periodicity)

            # Place on same device as audio to allow very long inputs
            if isinstance(result, tuple):
                result = (result[0].to(audio.device),
                          result[1].to(audio.device))
            else:
                result = result.to(audio.device)

            results.append(result)

    # Split pitch and periodicity
    if return_periodicity:
        pitch, periodicity = zip(*results)
        return torch.cat(pitch, 1), torch.cat(periodicity, 1)

    # Concatenate
    return torch.cat(results, 1)


def predict_from_file(audio_file,
                      hop_length=None,
                      fmin=50.,
                      fmax=MAX_FMAX,
                      model='full',
                      decoder=crepe.decode.viterbi,
                      return_harmonicity=False,
                      return_periodicity=False,
                      batch_size=None,
                      device='cpu',
                      pad=True):
    """Loads an audio file from disk and estimates its pitch

    Arguments
        audio_file (string)
            The file to perform pitch tracking on
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence
        return_periodicity (bool)
            Whether to also return the network confidence
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        (Optional) periodicity (torch.tensor
                                [shape=(1, 1 + int(time // hop_length))])
    """
    # Read the waveform together with its native sampling rate
    waveform, rate = crepe.load.audio(audio_file)

    # Everything else is delegated to predict() unchanged
    return predict(waveform,
                   rate,
                   hop_length=hop_length,
                   fmin=fmin,
                   fmax=fmax,
                   model=model,
                   decoder=decoder,
                   return_harmonicity=return_harmonicity,
                   return_periodicity=return_periodicity,
                   batch_size=batch_size,
                   device=device,
                   pad=pad)


def predict_from_file_to_file(audio_file,
                              output_pitch_file,
                              output_harmonicity_file=None,
                              output_periodicity_file=None,
                              hop_length=None,
                              fmin=50.,
                              fmax=MAX_FMAX,
                              model='full',
                              decoder=crepe.decode.viterbi,
                              batch_size=None,
                              device='cpu',
                              pad=True):
    """Estimates pitch for a file on disk and saves the result to disk

    Arguments
        audio_file (string)
            The file to perform pitch tracking on
        output_pitch_file (string)
            The file to save predicted pitch
        output_harmonicity_file (string or None) [DEPRECATED]
            The file to save predicted harmonicity. When given, it is used
            in place of output_periodicity_file.
        output_periodicity_file (string or None)
            The file to save predicted periodicity. Periodicity is only
            computed when a file is given.
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio
    """
    # The deprecated argument overrides the periodicity destination
    if output_harmonicity_file is not None:
        warnings.warn(
            'The crepe output_harmonicity_file argument is deprecated and '
            'will be removed in a future release. Please use '
            'output_periodicity_file. Rationale: if network confidence measured '
            'harmonic content, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.',
            DeprecationWarning)
        output_periodicity_file = output_harmonicity_file

    # Periodicity is requested only if there is somewhere to store it
    wants_periodicity = output_periodicity_file is not None

    prediction = predict_from_file(audio_file,
                                   hop_length=hop_length,
                                   fmin=fmin,
                                   fmax=fmax,
                                   model=model,
                                   decoder=decoder,
                                   return_harmonicity=False,
                                   return_periodicity=wants_periodicity,
                                   batch_size=batch_size,
                                   device=device,
                                   pad=pad)

    # Pitch-only result: a single tensor
    if not wants_periodicity:
        torch.save(prediction.detach(), output_pitch_file)
        return

    # Pitch and periodicity arrive as a pair; store each in its own file
    torch.save(prediction[0].detach(), output_pitch_file)
    torch.save(prediction[1].detach(), output_periodicity_file)


def predict_from_files_to_files(audio_files,
                                output_pitch_files,
                                output_harmonicity_files=None,
                                output_periodicity_files=None,
                                hop_length=None,
                                fmin=50.,
                                fmax=MAX_FMAX,
                                model='full',
                                decoder=crepe.decode.viterbi,
                                batch_size=None,
                                device='cpu',
                                pad=True):
    """Estimates pitch for several files on disk, saving each result to disk

    The input and output lists are walked in lockstep; iteration stops at
    the end of the shortest list.

    Arguments
        audio_files (list[string])
            The files to perform pitch tracking on
        output_pitch_files (list[string])
            The files to save predicted pitch
        output_harmonicity_files (list[string] or None) [DEPRECATED]
            The files to save predicted harmonicity. When given, they are
            used in place of output_periodicity_files.
        output_periodicity_files (list[string] or None)
            The files to save predicted periodicity
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio
    """
    # The deprecated argument overrides the periodicity destinations
    if output_harmonicity_files is not None:
        warnings.warn(
            'The crepe output_harmonicity_files argument is deprecated and '
            'will be removed in a future release. Please use '
            'output_periodicity_files. Rationale: if network confidence measured '
            'harmonic content, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.',
            DeprecationWarning)
        output_periodicity_files = output_harmonicity_files

    # Without periodicity outputs, pair every audio file with None
    if output_periodicity_files is None:
        output_periodicity_files = [None] * len(audio_files)

    # Walk inputs and outputs together behind a progress bar
    jobs = zip(audio_files, output_pitch_files, output_periodicity_files)
    for audio_file, pitch_file, periodicity_file in tqdm.tqdm(
            jobs, desc='crepe', dynamic_ncols=True):
        predict_from_file_to_file(audio_file,
                                  pitch_file,
                                  None,
                                  periodicity_file,
                                  hop_length,
                                  fmin,
                                  fmax,
                                  model,
                                  decoder,
                                  batch_size,
                                  device,
                                  pad)

###############################################################################
# Crepe pitch embedding
###############################################################################


def embed(audio,
          sample_rate,
          hop_length=None,
          model='full',
          batch_size=None,
          device='cpu',
          pad=True):
    """Embeds audio to the output of CREPE's fifth maxpool layer

    Gradients are not disabled here, unlike in predict().

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signals
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Returns
        embedding (torch.tensor [shape=(1,
                                        1 + int(time // hop_length), 32, -1)])
    """
    # Batches of normalized frames, produced lazily
    batches = preprocess(audio,
                         sample_rate,
                         hop_length=hop_length,
                         batch_size=batch_size,
                         device=device,
                         pad=pad)

    chunks = []
    for frames in batches:
        # Stop the forward pass at the embedding layer
        features = infer(frames, model, embed=True)

        # shape=(batch, time / hop_length, 32, embedding_size)
        features = features.reshape(audio.size(0), frames.size(0), 32, -1)

        # Keep finished batches on the audio's device so that large inputs
        # do not accumulate on the inference device
        chunks.append(features.to(audio.device))

    # Join the batches along the frame axis
    return torch.cat(chunks, 1)


def embed_from_file(audio_file,
                    hop_length=None,
                    model='full',
                    batch_size=None,
                    device='cpu',
                    pad=True):
    """Loads audio from disk and embeds it with embed()

    Arguments
        audio_file (string)
            The wav file containing the audio to embed
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Returns
        embedding (torch.tensor [shape=(1,
                                        1 + int(time // hop_length), 32, -1)])
    """
    # Read the waveform together with its native sampling rate
    waveform, rate = crepe.load.audio(audio_file)

    # Everything else is delegated to embed() unchanged
    return embed(waveform,
                 rate,
                 hop_length=hop_length,
                 model=model,
                 batch_size=batch_size,
                 device=device,
                 pad=pad)


def embed_from_file_to_file(audio_file,
                            output_file,
                            hop_length=None,
                            model='full',
                            batch_size=None,
                            device='cpu',
                            pad=True):
    """Embeds audio from disk and saves the embedding to disk

    Arguments
        audio_file (string)
            The wav file containing the audio to embed
        output_file (string)
            The file to save the embedding
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio
    """
    # The result only goes to disk, so gradient tracking is switched off
    with torch.no_grad():
        features = embed_from_file(audio_file,
                                   hop_length=hop_length,
                                   model=model,
                                   batch_size=batch_size,
                                   device=device,
                                   pad=pad)

        # Write the detached embedding to the requested path
        torch.save(features.detach(), output_file)


def embed_from_files_to_files(audio_files,
                              output_files,
                              hop_length=None,
                              model='full',
                              batch_size=None,
                              device='cpu',
                              pad=True):
    """Embeds several audio files from disk, saving each embedding to disk

    Inputs and outputs are walked in lockstep; iteration stops at the end
    of the shorter list.

    Arguments
        audio_files (list[string])
            The wav files containing the audio to embed
        output_files (list[string])
            The files to save the embeddings
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio
    """
    # Walk inputs and outputs together behind a progress bar
    jobs = zip(audio_files, output_files)
    for source_path, target_path in tqdm.tqdm(
            jobs, desc='crepe', dynamic_ncols=True):
        embed_from_file_to_file(source_path,
                                target_path,
                                hop_length,
                                model,
                                batch_size,
                                device,
                                pad)


###############################################################################
# Components for step-by-step prediction
###############################################################################


def infer(frames, model='full', embed=False):
    """Forward pass through the model

    The network is cached as attributes on this function (infer.model and
    infer.capacity) and is only reloaded when nothing is cached yet or when
    a different capacity is requested.

    Arguments
        frames (torch.tensor [shape=(time / hop_length, 1024)])
            The network input
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        embed (bool)
            Whether to stop inference at the intermediate embedding layer

    Returns
        logits (torch.tensor [shape=(1 + int(time // hop_length), 360)]) OR
        embedding (torch.tensor [shape=(1 + int(time // hop_length),
                                       embedding_size)])
    """
    # A usable cache needs both the network and the capacity it was built for
    cached = hasattr(infer, 'model') and hasattr(infer, 'capacity')

    # (Re)load on a cold cache or a capacity change. The loader is expected
    # to populate infer.model -- presumably infer.capacity as well.
    if not cached or infer.capacity != model:
        crepe.load.model(frames.device, model)

    # Keep the cached network on the same device as the input
    network = infer.model.to(frames.device)
    infer.model = network

    # Run the forward pass
    return network(frames, embed=embed)


def postprocess(probabilities,
                fmin=0.,
                fmax=MAX_FMAX,
                decoder=crepe.decode.viterbi,
                return_harmonicity=False,
                return_periodicity=False):
    """Convert model output to F0 and periodicity

    The input tensor is left untouched: masking of out-of-range pitch bins
    is applied to a private copy.

    Arguments
        probabilities (torch.tensor [shape=(1, 360, time / hop_length)])
            The probabilities for each pitch bin inferred by the network
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        decoder (function)
            The decoder to use. Called with the masked probabilities and
            expected to return a (bins, pitch) pair.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence. Emits a
            DeprecationWarning and behaves like return_periodicity.
        return_periodicity (bool)
            Whether to also return the network confidence

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        (Optional) periodicity (torch.tensor
                                [shape=(1, 1 + int(time // hop_length))])
    """
    # Sampling is non-differentiable, so remove from graph. detach() alone
    # shares storage with the caller's tensor, so the in-place masking below
    # would overwrite the caller's data; clone to work on a private copy.
    probabilities = probabilities.detach().clone()

    # Convert frequency range to pitch bin range
    minidx = crepe.convert.frequency_to_bins(torch.tensor(fmin))
    maxidx = crepe.convert.frequency_to_bins(torch.tensor(fmax),
                                             torch.ceil)

    # Remove frequencies outside of allowable range
    probabilities[:, :minidx] = -float('inf')
    probabilities[:, maxidx:] = -float('inf')

    # Perform argmax or viterbi sampling
    bins, pitch = decoder(probabilities)

    # Deprecate return_harmonicity
    if return_harmonicity:
        message = (
            'The crepe return_harmonicity argument is deprecated and '
            'will be removed in a future release. Please use '
            'return_periodicity. Rationale: if network confidence measured '
            'harmonics, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        return_periodicity = return_harmonicity

    if not return_periodicity:
        return pitch

    # Compute periodicity from probabilities and decoded pitch bins
    return pitch, periodicity(probabilities, bins)


def preprocess(audio,
               sample_rate,
               hop_length=None,
               batch_size=None,
               device='cpu',
               pad=True):
    """Convert audio to model input

    This is a generator: it yields one batch of normalized frames at a time.

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signals
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int)
            The hop_length in samples. Defaults to 10 ms of audio.
        batch_size (int)
            The number of frames per batch. Defaults to all frames at once.
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Yields
        frames (torch.tensor [shape=(1 + int(time // hop_length), 1024)])
    """
    # Default hop length of 10 ms
    if hop_length is None:
        hop_length = sample_rate // 100

    # Bring the audio to the model's sample rate and rescale the hop to match
    if sample_rate != SAMPLE_RATE:
        audio = resample(audio, sample_rate)
        hop_length = int(hop_length * SAMPLE_RATE / sample_rate)

    # Count frames; with padding, half a window of zeros goes on each side
    if pad:
        total_frames = 1 + int(audio.size(1) // hop_length)
        half_window = WINDOW_SIZE // 2
        audio = torch.nn.functional.pad(audio, (half_window, half_window))
    else:
        total_frames = 1 + int((audio.size(1) - WINDOW_SIZE) // hop_length)

    # Default to running all frames in a single batch
    if batch_size is None:
        batch_size = total_frames

    for first_frame in range(0, total_frames, batch_size):

        # Sample range covering this batch of frames
        start = max(0, first_frame * hop_length)
        end = min(audio.size(1),
                  (first_frame + batch_size - 1) * hop_length + WINDOW_SIZE)

        # Slice overlapping windows out of the chunk
        chunk = audio[:, None, None, start:end]
        frames = torch.nn.functional.unfold(
            chunk,
            kernel_size=(1, WINDOW_SIZE),
            stride=(1, hop_length))

        # One frame per row, placed on the inference device
        frames = frames.transpose(1, 2).reshape(-1, WINDOW_SIZE).to(device)

        # Mean-center each frame
        frames -= frames.mean(dim=1, keepdim=True)

        # Scale each frame by its standard deviation, floored at 1e-10.
        # Note: during silent frames, this produces very large values. But
        # this seems to be what the network expects.
        floor = torch.tensor(1e-10, device=frames.device)
        frames /= torch.max(floor, frames.std(dim=1, keepdim=True))

        yield frames


###############################################################################
# Utilities
###############################################################################


def periodicity(probabilities, bins):
    """Computes the periodicity from the network output and pitch bins

    For every frame, the periodicity is the network output at the pitch bin
    selected for that frame.
    """
    batch = probabilities.size(0)
    frame_count = probabilities.size(2)

    # One row per frame: shape=(batch * time / hop_length, 360)
    per_frame = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS)

    # One selected bin index per row: shape=(batch * time / hop_length, 1)
    selected = bins.reshape(-1, 1).to(torch.int64)

    # Pick each frame's value at its selected bin
    values = per_frame.gather(1, selected)

    # shape=(batch, time / hop_length)
    return values.reshape(batch, frame_count)


def resample(audio, sample_rate):
    """Resample audio from sample_rate to SAMPLE_RATE

    The audio goes through numpy on the CPU and is returned as a new tensor
    on the device the input came from.
    """
    # Remember where the input lives so the output can go back there
    source_device = audio.device

    # Drop the leading dimension and hand the samples to numpy
    samples = audio.detach().cpu().numpy().squeeze(0)

    # We have to use resampy if we want numbers to match Crepe
    resampled = resampy.resample(samples, sample_rate, SAMPLE_RATE)

    # Back to pytorch, restoring the leading dimension
    return torch.tensor(resampled, device=source_device).unsqueeze(0)