help
#1
by projanshakya - opened
This view is limited to 50 files because it contains too many changes. See the raw diff here.
- app.py +31 -38
- distributed.py +173 -0
- encoder/audio.py +0 -117
- encoder/config.py +0 -45
- encoder/data_objects/__init__.py +0 -2
- encoder/data_objects/random_cycler.py +0 -37
- encoder/data_objects/speaker.py +0 -40
- encoder/data_objects/speaker_batch.py +0 -13
- encoder/data_objects/speaker_verification_dataset.py +0 -56
- encoder/data_objects/utterance.py +0 -26
- encoder/inference.py +0 -178
- encoder/model.py +0 -135
- encoder/params_data.py +0 -29
- encoder/params_model.py +0 -11
- encoder/preprocess.py +0 -184
- encoder/train.py +0 -125
- encoder/visualizations.py +0 -179
- encoderCoren.pt +0 -3
- {hifigan → hifi-gan}/LICENSE +0 -0
- hifi-gan/README.md +105 -0
- diagrams/apple.txt → hifi-gan/apple.py +0 -0
- {hifigan → hifi-gan}/env.py +0 -0
- {hifigan → hifi-gan}/inference.py +3 -3
- {hifigan → hifi-gan}/inference_e2e.py +51 -37
- {hifigan → hifi-gan}/meldataset.py +0 -0
- {hifigan → hifi-gan}/models.py +1 -1
- hifi-gan/requirements.txt +7 -0
- {hifigan → hifi-gan}/train.py +4 -4
- hifigan/hifigan_utils.py → hifi-gan/utils.py +0 -0
- hparams.py +0 -1
- kaggle_12000.pt +0 -3
- logger.py +48 -0
- logic.py +41 -79
- loss_function.py +19 -0
- loss_scaler.py +131 -0
- model.py +12 -16
- multiproc.py +23 -0
- requirements.txt +8 -7
- saved_model.pt +0 -3
- speaker/__init__.py +0 -0
- speaker/bana.txt +0 -0
- speaker/data.py +0 -109
- speaker/model.py +0 -191
- speaker/preprocess.py +0 -1
- speaker/saved_model.pt +0 -3
- speaker/saved_model_e175.pt +0 -3
- speaker/saved_models/dog.txt +0 -0
- speaker/saved_models/saved_model_e175.pt +0 -3
- speaker/saved_models/saved_model_e273_LargeBatch.pt +0 -3
- speaker/saved_models/saved_model_e300.pt +0 -3
app.py
CHANGED
@@ -3,70 +3,63 @@ from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from logic import synthesize_voice, plot_data, plot_waveforms
 import base64
-import
-import numpy as np
-from io import BytesIO
-from hifigan.inference_e2e import hifi_gan_inference
+from typing import Dict

 app = FastAPI()

-@app.get("/")
-def read_root():
-    data = {"Voice": "Cloning", "Status": "Success"}
-    return JSONResponse(content=data)
-
+# You need to replace the placeholders above with the actual URLs for the models.
+
+# Allow requests from your Vercel domain
+origins = [
+    "https://host-test-smoky.vercel.app",
+    # Add other allowed origins if needed
+]
+
+# Set up CORS middleware
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=
+    allow_origins=origins,
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )

-    font_type = json['font_select']
-    input_text = json['input_text']
+@app.post("/synthesize", response_model=Dict[str, str])
+async def synthesize(request_data: Dict[str, str]):
+    font_type = request_data['font_select']
+    input_text = request_data['input_text']

-    print("generating mel-spectrogram")
+    # Font selection logic (customize based on your requirements)
+    if font_type == 'Preeti':
+        # Implement Preeti font logic
+        pass
+    elif font_type == 'Unicode':
+        # Implement Unicode font logic
+        pass
+
     # Generate mel-spectrogram using Tacotron2
-    print("mel generation successful")
+    mel_output_data, mel_output_postnet_data, alignments_data = synthesize_voice(input_text, "Shruti_finetuned")

     # Convert mel-spectrogram to base64 for display in HTML
     mel_output_base64 = plot_data([mel_output_data, mel_output_postnet_data, alignments_data])

-    #
-    buffer = BytesIO()
-    np.save(buffer, mel_output_data)
-    input_mel = buffer.getvalue()
-
-    hifigan_checkpoint = "generator_v1"
-
-    # Generate audio using Hifigan
-    audio_data = hifi_gan_inference(input_mel, hifigan_checkpoint)
+    # Save the generated audio file
+    audio_file_path = 'audio_output/mel1_generated_e2e.wav'

-    print("Creating time-domain waveform")
     # Plot the waveform
-    wave_base64 = plot_waveforms(
+    wave_base64 = plot_waveforms(audio_file_path)

     # Encode audio content as Base64
+    with open(audio_file_path, 'rb') as audio_file:
+        audio_base64 = base64.b64encode(audio_file.read()).decode('utf-8')

     # Customize the response based on the information you want to send to the frontend
     response_data = {
         'mel_spectrogram': mel_output_base64,
         'audio_data': audio_base64,
         'waveform': wave_base64,
+        'some_other_data': 'example_value',
     }

-    return JSONResponse(content=response_data)
+    return JSONResponse(content=response_data)
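
A minimal client-side sketch for exercising the new POST /synthesize endpoint above. The base URL, font value, and input text are placeholders for illustration; only the request and response shape comes from the diff.

import base64
import requests

BASE_URL = "http://localhost:8000"  # placeholder; use the deployed Space URL

payload = {"font_select": "Unicode", "input_text": "namaste"}
resp = requests.post(f"{BASE_URL}/synthesize", json=payload, timeout=300)
resp.raise_for_status()

data = resp.json()
# 'audio_data' is base64-encoded audio; decode it for playback.
with open("output.wav", "wb") as f:
    f.write(base64.b64decode(data["audio_data"]))
print(data.keys())  # mel_spectrogram, audio_data, waveform, some_other_data
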
distributed.py
ADDED
@@ -0,0 +1,173 @@
+import torch
+import torch.distributed as dist
+from torch.nn.modules import Module
+from torch.autograd import Variable
+
+def _flatten_dense_tensors(tensors):
+    """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
+    same dense type.
+    Since inputs are dense, the resulting tensor will be a concatenated 1D
+    buffer. Element-wise operation on this buffer will be equivalent to
+    operating individually.
+    Arguments:
+        tensors (Iterable[Tensor]): dense tensors to flatten.
+    Returns:
+        A contiguous 1D buffer containing input tensors.
+    """
+    if len(tensors) == 1:
+        return tensors[0].contiguous().view(-1)
+    flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
+    return flat
+
+def _unflatten_dense_tensors(flat, tensors):
+    """View a flat buffer using the sizes of tensors. Assume that tensors are of
+    same dense type, and that flat is given by _flatten_dense_tensors.
+    Arguments:
+        flat (Tensor): flattened dense tensors to unflatten.
+        tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
+          unflatten flat.
+    Returns:
+        Unflattened dense tensors with sizes same as tensors and values from
+          flat.
+    """
+    outputs = []
+    offset = 0
+    for tensor in tensors:
+        numel = tensor.numel()
+        outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
+        offset += numel
+    return tuple(outputs)
+
+
+'''
+This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py
+launcher included with this example. It assumes that your run is using multiprocess with 1
+GPU/process, that the model is on the correct device, and that torch.set_device has been
+used to set the device.
+
+Parameters are broadcasted to the other processes on initialization of DistributedDataParallel,
+and will be allreduced at the finish of the backward pass.
+'''
+class DistributedDataParallel(Module):
+
+    def __init__(self, module):
+        super(DistributedDataParallel, self).__init__()
+        #fallback for PyTorch 0.3
+        if not hasattr(dist, '_backend'):
+            self.warn_on_half = True
+        else:
+            self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
+
+        self.module = module
+
+        for p in self.module.state_dict().values():
+            if not torch.is_tensor(p):
+                continue
+            dist.broadcast(p, 0)
+
+        def allreduce_params():
+            if(self.needs_reduction):
+                self.needs_reduction = False
+                buckets = {}
+                for param in self.module.parameters():
+                    if param.requires_grad and param.grad is not None:
+                        tp = type(param.data)
+                        if tp not in buckets:
+                            buckets[tp] = []
+                        buckets[tp].append(param)
+                if self.warn_on_half:
+                    if torch.cuda.HalfTensor in buckets:
+                        print("WARNING: gloo dist backend for half parameters may be extremely slow." +
+                              " It is recommended to use the NCCL backend in this case. This currently requires" +
+                              "PyTorch built from top of tree master.")
+                        self.warn_on_half = False
+
+                for tp in buckets:
+                    bucket = buckets[tp]
+                    grads = [param.grad.data for param in bucket]
+                    coalesced = _flatten_dense_tensors(grads)
+                    dist.all_reduce(coalesced)
+                    coalesced /= dist.get_world_size()
+                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
+                        buf.copy_(synced)
+
+        for param in list(self.module.parameters()):
+            def allreduce_hook(*unused):
+                param._execution_engine.queue_callback(allreduce_params)
+            if param.requires_grad:
+                param.register_hook(allreduce_hook)
+
+    def forward(self, *inputs, **kwargs):
+        self.needs_reduction = True
+        return self.module(*inputs, **kwargs)
+
+    '''
+    def _sync_buffers(self):
+        buffers = list(self.module._all_buffers())
+        if len(buffers) > 0:
+            # cross-node buffer sync
+            flat_buffers = _flatten_dense_tensors(buffers)
+            dist.broadcast(flat_buffers, 0)
+            for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
+                buf.copy_(synced)
+    def train(self, mode=True):
+        # Clear NCCL communicator and CUDA event cache of the default group ID,
+        # These cache will be recreated at the later call. This is currently a
+        # work-around for a potential NCCL deadlock.
+        if dist._backend == dist.dist_backend.NCCL:
+            dist._clear_group_cache()
+        super(DistributedDataParallel, self).train(mode)
+        self.module.train(mode)
+    '''
+'''
+Modifies existing model to do gradient allreduce, but doesn't change class
+so you don't need "module"
+'''
+def apply_gradient_allreduce(module):
+    if not hasattr(dist, '_backend'):
+        module.warn_on_half = True
+    else:
+        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
+
+    for p in module.state_dict().values():
+        if not torch.is_tensor(p):
+            continue
+        dist.broadcast(p, 0)
+
+    def allreduce_params():
+        if(module.needs_reduction):
+            module.needs_reduction = False
+            buckets = {}
+            for param in module.parameters():
+                if param.requires_grad and param.grad is not None:
+                    tp = param.data.dtype
+                    if tp not in buckets:
+                        buckets[tp] = []
+                    buckets[tp].append(param)
+            if module.warn_on_half:
+                if torch.cuda.HalfTensor in buckets:
+                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
+                          " It is recommended to use the NCCL backend in this case. This currently requires" +
+                          "PyTorch built from top of tree master.")
+                    module.warn_on_half = False
+
+            for tp in buckets:
+                bucket = buckets[tp]
+                grads = [param.grad.data for param in bucket]
+                coalesced = _flatten_dense_tensors(grads)
+                dist.all_reduce(coalesced)
+                coalesced /= dist.get_world_size()
+                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
+                    buf.copy_(synced)
+
+    for param in list(module.parameters()):
+        def allreduce_hook(*unused):
+            Variable._execution_engine.queue_callback(allreduce_params)
+        if param.requires_grad:
+            param.register_hook(allreduce_hook)
+
+    def set_needs_reduction(self, input, output):
+        self.needs_reduction = True
+
+    module.register_forward_hook(set_needs_reduction)
+    return module
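
A rough usage sketch for the apply_gradient_allreduce helper in the new distributed.py, assuming a one-GPU-per-process launch (for example via multiproc.py). The backend, environment variable, and toy model are illustrative assumptions, not taken from this diff.

import os
import torch
import torch.distributed as dist
from distributed import apply_gradient_allreduce

# Assumes the launcher has set up the rendezvous environment variables.
dist.init_process_group(backend="nccl", init_method="env://")
torch.cuda.set_device(int(os.environ.get("LOCAL_RANK", "0")))

model = torch.nn.Linear(80, 80).cuda()   # stand-in for the actual Tacotron2 model
model = apply_gradient_allreduce(model)  # broadcasts params, registers allreduce hooks

x = torch.randn(16, 80).cuda()
loss = model(x).pow(2).mean()
loss.backward()  # the queued callback averages gradients across all ranks here
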
encoder/audio.py
DELETED
@@ -1,117 +0,0 @@
-from scipy.ndimage.morphology import binary_dilation
-from encoder.params_data import *
-from pathlib import Path
-from typing import Optional, Union
-from warnings import warn
-import numpy as np
-import librosa
-import struct
-
-try:
-    import webrtcvad
-except:
-    warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
-    webrtcvad=None
-
-int16_max = (2 ** 15) - 1
-
-
-def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
-                   source_sr: Optional[int] = None,
-                   normalize: Optional[bool] = True,
-                   trim_silence: Optional[bool] = True):
-    """
-    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
-    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
-
-    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
-    just .wav), either the waveform as a numpy array of floats.
-    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
-    preprocessing. After preprocessing, the waveform's sampling rate will match the data
-    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
-    this argument will be ignored.
-    """
-    # Load the wav from disk if needed
-    if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
-        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
-    else:
-        wav = fpath_or_wav
-
-    # Resample the wav if needed
-    if source_sr is not None and source_sr != sampling_rate:
-        wav = librosa.resample(wav, source_sr, sampling_rate)
-
-    # Apply the preprocessing: normalize volume and shorten long silences
-    if normalize:
-        wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
-    if webrtcvad and trim_silence:
-        wav = trim_long_silences(wav)
-
-    return wav
-
-
-def wav_to_mel_spectrogram(wav):
-    """
-    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
-    Note: this not a log-mel spectrogram.
-    """
-    frames = librosa.feature.melspectrogram(
-        wav,
-        sampling_rate,
-        n_fft=int(sampling_rate * mel_window_length / 1000),
-        hop_length=int(sampling_rate * mel_window_step / 1000),
-        n_mels=mel_n_channels
-    )
-    return frames.astype(np.float32).T
-
-
-def trim_long_silences(wav):
-    """
-    Ensures that segments without voice in the waveform remain no longer than a
-    threshold determined by the VAD parameters in params.py.
-
-    :param wav: the raw waveform as a numpy array of floats
-    :return: the same waveform with silences trimmed away (length <= original wav length)
-    """
-    # Compute the voice detection window size
-    samples_per_window = (vad_window_length * sampling_rate) // 1000
-
-    # Trim the end of the audio to have a multiple of the window size
-    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
-
-    # Convert the float waveform to 16-bit mono PCM
-    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
-
-    # Perform voice activation detection
-    voice_flags = []
-    vad = webrtcvad.Vad(mode=3)
-    for window_start in range(0, len(wav), samples_per_window):
-        window_end = window_start + samples_per_window
-        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
-                                         sample_rate=sampling_rate))
-    voice_flags = np.array(voice_flags)
-
-    # Smooth the voice detection with a moving average
-    def moving_average(array, width):
-        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
-        ret = np.cumsum(array_padded, dtype=float)
-        ret[width:] = ret[width:] - ret[:-width]
-        return ret[width - 1:] / width
-
-    audio_mask = moving_average(voice_flags, vad_moving_average_width)
-    audio_mask = np.round(audio_mask).astype(np.bool)
-
-    # Dilate the voiced regions
-    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
-    audio_mask = np.repeat(audio_mask, samples_per_window)
-
-    return wav[audio_mask == True]
-
-
-def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
-    if increase_only and decrease_only:
-        raise ValueError("Both increase only and decrease only are set")
-    dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
-    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
-        return wav
-    return wav * (10 ** (dBFS_change / 20))
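
For reference, the deleted encoder/audio.py was driven roughly like this (the sample path is hypothetical); after this change the module is no longer available in the Space.

from encoder import audio

# Load, resample to the encoder's rate, normalize volume, and trim long silences.
wav = audio.preprocess_wav("samples/utterance.wav")
# Mel spectrogram shaped (n_frames, mel_n_channels), float32, not log-scaled.
mel = audio.wav_to_mel_spectrogram(wav)
print(mel.shape)
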
encoder/config.py
DELETED
@@ -1,45 +0,0 @@
-librispeech_datasets = {
-    "train": {
-        "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
-        "other": ["LibriSpeech/train-other-500"]
-    },
-    "test": {
-        "clean": ["LibriSpeech/test-clean"],
-        "other": ["LibriSpeech/test-other"]
-    },
-    "dev": {
-        "clean": ["LibriSpeech/dev-clean"],
-        "other": ["LibriSpeech/dev-other"]
-    },
-}
-libritts_datasets = {
-    "train": {
-        "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
-        "other": ["LibriTTS/train-other-500"]
-    },
-    "test": {
-        "clean": ["LibriTTS/test-clean"],
-        "other": ["LibriTTS/test-other"]
-    },
-    "dev": {
-        "clean": ["LibriTTS/dev-clean"],
-        "other": ["LibriTTS/dev-other"]
-    },
-}
-voxceleb_datasets = {
-    "voxceleb1" : {
-        "train": ["VoxCeleb1/wav"],
-        "test": ["VoxCeleb1/test_wav"]
-    },
-    "voxceleb2" : {
-        "train": ["VoxCeleb2/dev/aac"],
-        "test": ["VoxCeleb2/test_wav"]
-    }
-}
-
-other_datasets = [
-    "LJSpeech-1.1",
-    "VCTK-Corpus/wav48",
-]
-
-anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
encoder/data_objects/__init__.py
DELETED
@@ -1,2 +0,0 @@
-from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
-from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
encoder/data_objects/random_cycler.py
DELETED
@@ -1,37 +0,0 @@
-import random
-
-class RandomCycler:
-    """
-    Creates an internal copy of a sequence and allows access to its items in a constrained random
-    order. For a source sequence of n items and one or several consecutive queries of a total
-    of m items, the following guarantees hold (one implies the other):
-        - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
-        - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
-    """
-
-    def __init__(self, source):
-        if len(source) == 0:
-            raise Exception("Can't create RandomCycler from an empty collection")
-        self.all_items = list(source)
-        self.next_items = []
-
-    def sample(self, count: int):
-        shuffle = lambda l: random.sample(l, len(l))
-
-        out = []
-        while count > 0:
-            if count >= len(self.all_items):
-                out.extend(shuffle(list(self.all_items)))
-                count -= len(self.all_items)
-                continue
-            n = min(count, len(self.next_items))
-            out.extend(self.next_items[:n])
-            count -= n
-            self.next_items = self.next_items[n:]
-            if len(self.next_items) == 0:
-                self.next_items = shuffle(list(self.all_items))
-        return out
-
-    def __next__(self):
-        return self.sample(1)[0]
-
encoder/data_objects/speaker.py
DELETED
@@ -1,40 +0,0 @@
-from encoder.data_objects.random_cycler import RandomCycler
-from encoder.data_objects.utterance import Utterance
-from pathlib import Path
-
-# Contains the set of utterances of a single speaker
-class Speaker:
-    def __init__(self, root: Path):
-        self.root = root
-        self.name = root.name
-        self.utterances = None
-        self.utterance_cycler = None
-
-    def _load_utterances(self):
-        with self.root.joinpath("_sources.txt").open("r") as sources_file:
-            sources = [l.split(",") for l in sources_file]
-        sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
-        self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
-        self.utterance_cycler = RandomCycler(self.utterances)
-
-    def random_partial(self, count, n_frames):
-        """
-        Samples a batch of <count> unique partial utterances from the disk in a way that all
-        utterances come up at least once every two cycles and in a random order every time.
-
-        :param count: The number of partial utterances to sample from the set of utterances from
-        that speaker. Utterances are guaranteed not to be repeated if <count> is not larger than
-        the number of utterances available.
-        :param n_frames: The number of frames in the partial utterance.
-        :return: A list of tuples (utterance, frames, range) where utterance is an Utterance,
-        frames are the frames of the partial utterances and range is the range of the partial
-        utterance with regard to the complete utterance.
-        """
-        if self.utterances is None:
-            self._load_utterances()
-
-        utterances = self.utterance_cycler.sample(count)
-
-        a = [(u,) + u.random_partial(n_frames) for u in utterances]
-
-        return a
encoder/data_objects/speaker_batch.py
DELETED
@@ -1,13 +0,0 @@
-import numpy as np
-from typing import List
-from encoder.data_objects.speaker import Speaker
-
-
-class SpeakerBatch:
-    def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
-        self.speakers = speakers
-        self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}
-
-        # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
-        # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
-        self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])
encoder/data_objects/speaker_verification_dataset.py
DELETED
@@ -1,56 +0,0 @@
-from encoder.data_objects.random_cycler import RandomCycler
-from encoder.data_objects.speaker_batch import SpeakerBatch
-from encoder.data_objects.speaker import Speaker
-from encoder.params_data import partials_n_frames
-from torch.utils.data import Dataset, DataLoader
-from pathlib import Path
-
-# TODO: improve with a pool of speakers for data efficiency
-
-class SpeakerVerificationDataset(Dataset):
-    def __init__(self, datasets_root: Path):
-        self.root = datasets_root
-        speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
-        if len(speaker_dirs) == 0:
-            raise Exception("No speakers found. Make sure you are pointing to the directory "
-                            "containing all preprocessed speaker directories.")
-        self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
-        self.speaker_cycler = RandomCycler(self.speakers)
-
-    def __len__(self):
-        return int(1e10)
-
-    def __getitem__(self, index):
-        return next(self.speaker_cycler)
-
-    def get_logs(self):
-        log_string = ""
-        for log_fpath in self.root.glob("*.txt"):
-            with log_fpath.open("r") as log_file:
-                log_string += "".join(log_file.readlines())
-        return log_string
-
-
-class SpeakerVerificationDataLoader(DataLoader):
-    def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
-                 batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
-                 worker_init_fn=None):
-        self.utterances_per_speaker = utterances_per_speaker
-
-        super().__init__(
-            dataset=dataset,
-            batch_size=speakers_per_batch,
-            shuffle=False,
-            sampler=sampler,
-            batch_sampler=batch_sampler,
-            num_workers=num_workers,
-            collate_fn=self.collate,
-            pin_memory=pin_memory,
-            drop_last=False,
-            timeout=timeout,
-            worker_init_fn=worker_init_fn
-        )
-
-    def collate(self, speakers):
-        return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
-
encoder/data_objects/utterance.py
DELETED
@@ -1,26 +0,0 @@
-import numpy as np
-
-
-class Utterance:
-    def __init__(self, frames_fpath, wave_fpath):
-        self.frames_fpath = frames_fpath
-        self.wave_fpath = wave_fpath
-
-    def get_frames(self):
-        return np.load(self.frames_fpath)
-
-    def random_partial(self, n_frames):
-        """
-        Crops the frames into a partial utterance of n_frames
-
-        :param n_frames: The number of frames of the partial utterance
-        :return: the partial utterance frames and a tuple indicating the start and end of the
-        partial utterance in the complete utterance.
-        """
-        frames = self.get_frames()
-        if frames.shape[0] == n_frames:
-            start = 0
-        else:
-            start = np.random.randint(0, frames.shape[0] - n_frames)
-        end = start + n_frames
-        return frames[start:end], (start, end)
encoder/inference.py
DELETED
@@ -1,178 +0,0 @@
-from encoder.params_data import *
-from encoder.model import SpeakerEncoder
-from encoder.audio import preprocess_wav   # We want to expose this function from here
-from matplotlib import cm
-from encoder import audio
-from pathlib import Path
-import numpy as np
-import torch
-
-_model = None # type: SpeakerEncoder
-_device = None # type: torch.device
-
-
-def load_model(weights_fpath: Path, device=None):
-    """
-    Loads the model in memory. If this function is not explicitely called, it will be run on the
-    first call to embed_frames() with the default weights file.
-
-    :param weights_fpath: the path to saved model weights.
-    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
-    model will be loaded and will run on this device. Outputs will however always be on the cpu.
-    If None, will default to your GPU if it"s available, otherwise your CPU.
-    """
-    # TODO: I think the slow loading of the encoder might have something to do with the device it
-    #   was saved on. Worth investigating.
-    global _model, _device
-    if device is None:
-        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    elif isinstance(device, str):
-        _device = torch.device(device)
-    _model = SpeakerEncoder(_device, torch.device("cpu"))
-    checkpoint = torch.load(weights_fpath, _device)
-    _model.load_state_dict(checkpoint["model_state"])
-    _model.eval()
-    print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
-
-
-def is_loaded():
-    return _model is not None
-
-
-def embed_frames_batch(frames_batch):
-    """
-    Computes embeddings for a batch of mel spectrogram.
-
-    :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape
-    (batch_size, n_frames, n_channels)
-    :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
-    """
-    if _model is None:
-        raise Exception("Model was not loaded. Call load_model() before inference.")
-
-    frames = torch.from_numpy(frames_batch).to(_device)
-    embed = _model.forward(frames).detach().cpu().numpy()
-    return embed
-
-
-def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
-                           min_pad_coverage=0.75, overlap=0.5):
-    """
-    Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
-    partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
-    spectrogram slices are returned, so as to make each partial utterance waveform correspond to
-    its spectrogram. This function assumes that the mel spectrogram parameters used are those
-    defined in params_data.py.
-
-    The returned ranges may be indexing further than the length of the waveform. It is
-    recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
-
-    :param n_samples: the number of samples in the waveform
-    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
-    utterance
-    :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
-    enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
-    then the last partial utterance will be considered, as if we padded the audio. Otherwise,
-    it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
-    utterance, this parameter is ignored so that the function always returns at least 1 slice.
-    :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
-    utterances are entirely disjoint.
-    :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
-    respectively the waveform and the mel spectrogram with these slices to obtain the partial
-    utterances.
-    """
-    assert 0 <= overlap < 1
-    assert 0 < min_pad_coverage <= 1
-
-    samples_per_frame = int((sampling_rate * mel_window_step / 1000))
-    n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
-    frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
-
-    # Compute the slices
-    wav_slices, mel_slices = [], []
-    steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
-    for i in range(0, steps, frame_step):
-        mel_range = np.array([i, i + partial_utterance_n_frames])
-        wav_range = mel_range * samples_per_frame
-        mel_slices.append(slice(*mel_range))
-        wav_slices.append(slice(*wav_range))
-
-    # Evaluate whether extra padding is warranted or not
-    last_wav_range = wav_slices[-1]
-    coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
-    if coverage < min_pad_coverage and len(mel_slices) > 1:
-        mel_slices = mel_slices[:-1]
-        wav_slices = wav_slices[:-1]
-
-    return wav_slices, mel_slices
-
-
-def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
-    """
-    Computes an embedding for a single utterance.
-
-    # TODO: handle multiple wavs to benefit from batching on GPU
-    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
-    :param using_partials: if True, then the utterance is split in partial utterances of
-    <partial_utterance_n_frames> frames and the utterance embedding is computed from their
-    normalized average. If False, the utterance is instead computed from feeding the entire
-    spectogram to the network.
-    :param return_partials: if True, the partial embeddings will also be returned along with the
-    wav slices that correspond to the partial embeddings.
-    :param kwargs: additional arguments to compute_partial_splits()
-    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
-    <return_partials> is True, the partial utterances as a numpy array of float32 of shape
-    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
-    returned. If <using_partials> is simultaneously set to False, both these values will be None
-    instead.
-    """
-    # Process the entire utterance if not using partials
-    if not using_partials:
-        frames = audio.wav_to_mel_spectrogram(wav)
-        embed = embed_frames_batch(frames[None, ...])[0]
-        if return_partials:
-            return embed, None, None
-        return embed
-
-    # Compute where to split the utterance into partials and pad if necessary
-    wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
-    max_wave_length = wave_slices[-1].stop
-    if max_wave_length >= len(wav):
-        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
-
-    # Split the utterance into partials
-    frames = audio.wav_to_mel_spectrogram(wav)
-    frames_batch = np.array([frames[s] for s in mel_slices])
-    partial_embeds = embed_frames_batch(frames_batch)
-
-    # Compute the utterance embedding from the partial embeddings
-    raw_embed = np.mean(partial_embeds, axis=0)
-    embed = raw_embed / np.linalg.norm(raw_embed, 2)
-
-    if return_partials:
-        return embed, partial_embeds, wave_slices
-    return embed
-
-
-def embed_speaker(wavs, **kwargs):
-    raise NotImplemented()
-
-
-def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
-    import matplotlib.pyplot as plt
-    if ax is None:
-        ax = plt.gca()
-
-    if shape is None:
-        height = int(np.sqrt(len(embed)))
-        shape = (height, -1)
-    embed = embed.reshape(shape)
-
-    cmap = cm.get_cmap()
-    mappable = ax.imshow(embed, cmap=cmap)
-    cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
-    sm = cm.ScalarMappable(cmap=cmap)
-    sm.set_clim(*color_range)
-
-    ax.set_xticks([]), ax.set_yticks([])
-    ax.set_title(title)
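
For reference, the deleted encoder/inference.py exposed a small functional API that was typically driven as below; the weights and sample paths are hypothetical.

from pathlib import Path
from encoder import inference as encoder

encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
wav = encoder.preprocess_wav("samples/utterance.wav")
embed = encoder.embed_utterance(wav)  # L2-normalized, shape (model_embedding_size,)
print(embed.shape)
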
encoder/model.py
DELETED
@@ -1,135 +0,0 @@
-from encoder.params_model import *
-from encoder.params_data import *
-from scipy.interpolate import interp1d
-from sklearn.metrics import roc_curve
-from torch.nn.utils import clip_grad_norm_
-from scipy.optimize import brentq
-from torch import nn
-import numpy as np
-import torch
-
-
-class SpeakerEncoder(nn.Module):
-    def __init__(self, device, loss_device):
-        super().__init__()
-        self.loss_device = loss_device
-
-        # Network defition
-        self.lstm = nn.LSTM(input_size=mel_n_channels,
-                            hidden_size=model_hidden_size,
-                            num_layers=model_num_layers,
-                            batch_first=True).to(device)
-        self.linear = nn.Linear(in_features=model_hidden_size,
-                                out_features=model_embedding_size).to(device)
-        self.relu = torch.nn.ReLU().to(device)
-
-        # Cosine similarity scaling (with fixed initial parameter values)
-        self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
-        self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
-
-        # Loss
-        self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
-
-    def do_gradient_ops(self):
-        # Gradient scale
-        self.similarity_weight.grad *= 0.01
-        self.similarity_bias.grad *= 0.01
-
-        # Gradient clipping
-        clip_grad_norm_(self.parameters(), 3, norm_type=2)
-
-    def forward(self, utterances, hidden_init=None):
-        """
-        Computes the embeddings of a batch of utterance spectrograms.
-
-        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
-        (batch_size, n_frames, n_channels)
-        :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
-        batch_size, hidden_size). Will default to a tensor of zeros if None.
-        :return: the embeddings as a tensor of shape (batch_size, embedding_size)
-        """
-        # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
-        # and the final cell state.
-        out, (hidden, cell) = self.lstm(utterances, hidden_init)
-
-        # We take only the hidden state of the last layer
-        embeds_raw = self.relu(self.linear(hidden[-1]))
-
-        # L2-normalize it
-        embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)
-
-        return embeds
-
-    def similarity_matrix(self, embeds):
-        """
-        Computes the similarity matrix according the section 2.1 of GE2E.
-
-        :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
-        utterances_per_speaker, embedding_size)
-        :return: the similarity matrix as a tensor of shape (speakers_per_batch,
-        utterances_per_speaker, speakers_per_batch)
-        """
-        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
-
-        # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
-        centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
-        centroids_incl = centroids_incl.clone() / (torch.norm(centroids_incl, dim=2, keepdim=True) + 1e-5)
-
-        # Exclusive centroids (1 per utterance)
-        centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
-        centroids_excl /= (utterances_per_speaker - 1)
-        centroids_excl = centroids_excl.clone() / (torch.norm(centroids_excl, dim=2, keepdim=True) + 1e-5)
-
-        # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
-        # product of these vectors (which is just an element-wise multiplication reduced by a sum).
-        # We vectorize the computation for efficiency.
-        sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
-                                 speakers_per_batch).to(self.loss_device)
-        mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int)
-        for j in range(speakers_per_batch):
-            mask = np.where(mask_matrix[j])[0]
-            sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
-            sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
-
-        ## Even more vectorized version (slower maybe because of transpose)
-        # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
-        #                           ).to(self.loss_device)
-        # eye = np.eye(speakers_per_batch, dtype=np.int)
-        # mask = np.where(1 - eye)
-        # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
-        # mask = np.where(eye)
-        # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
-        # sim_matrix2 = sim_matrix2.transpose(1, 2)
-
-        sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
-        return sim_matrix
-
-    def loss(self, embeds):
-        """
-        Computes the softmax loss according the section 2.1 of GE2E.
-
-        :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
-        utterances_per_speaker, embedding_size)
-        :return: the loss and the EER for this batch of embeddings.
-        """
-        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
-
-        # Loss
-        sim_matrix = self.similarity_matrix(embeds)
-        sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
-                                         speakers_per_batch))
-        ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
-        target = torch.from_numpy(ground_truth).long().to(self.loss_device)
-        loss = self.loss_fn(sim_matrix, target)
-
-        # EER (not backpropagated)
-        with torch.no_grad():
-            inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0]
-            labels = np.array([inv_argmax(i) for i in ground_truth])
-            preds = sim_matrix.detach().cpu().numpy()
-
-            # Snippet from https://yangcha.github.io/EER-ROC/
-            fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
-            eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
-
-        return loss, eer
encoder/params_data.py
DELETED
@@ -1,29 +0,0 @@
-
-## Mel-filterbank
-mel_window_length = 25  # In milliseconds
-mel_window_step = 10    # In milliseconds
-mel_n_channels = 40
-
-
-## Audio
-sampling_rate = 16000
-# Number of spectrogram frames in a partial utterance
-partials_n_frames = 160     # 1600 ms
-# Number of spectrogram frames at inference
-inference_n_frames = 80     # 800 ms
-
-
-## Voice Activation Detection
-# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
-# This sets the granularity of the VAD. Should not need to be changed.
-vad_window_length = 30  # In milliseconds
-# Number of frames to average together when performing the moving average smoothing.
-# The larger this value, the larger the VAD variations must be to not get smoothed out.
-vad_moving_average_width = 8
-# Maximum number of consecutive silent frames a segment can have.
-vad_max_silence_length = 6
-
-
-## Audio volume normalization
-audio_norm_target_dBFS = -30
-
encoder/params_model.py
DELETED
@@ -1,11 +0,0 @@
-
-## Model parameters
-model_hidden_size = 256
-model_embedding_size = 256
-model_num_layers = 3
-
-
-## Training parameters
-learning_rate_init = 1e-4
-speakers_per_batch = 64
-utterances_per_speaker = 10
encoder/preprocess.py
DELETED
@@ -1,184 +0,0 @@
-from datetime import datetime
-from functools import partial
-from multiprocessing import Pool
-from pathlib import Path
-
-import numpy as np
-from tqdm import tqdm
-
-from encoder import audio
-from encoder.config import librispeech_datasets, anglophone_nationalites
-from encoder.params_data import *
-
-
-_AUDIO_EXTENSIONS = ("wav", "flac", "m4a", "mp3")
-
-class DatasetLog:
-    """
-    Registers metadata about the dataset in a text file.
-    """
-    def __init__(self, root, name):
-        self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
-        self.sample_data = dict()
-
-        start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
-        self.write_line("Creating dataset %s on %s" % (name, start_time))
-        self.write_line("-----")
-        self._log_params()
-
-    def _log_params(self):
-        from encoder import params_data
-        self.write_line("Parameter values:")
-        for param_name in (p for p in dir(params_data) if not p.startswith("__")):
-            value = getattr(params_data, param_name)
-            self.write_line("\t%s: %s" % (param_name, value))
-        self.write_line("-----")
-
-    def write_line(self, line):
-        self.text_file.write("%s\n" % line)
-
-    def add_sample(self, **kwargs):
-        for param_name, value in kwargs.items():
-            if not param_name in self.sample_data:
-                self.sample_data[param_name] = []
-            self.sample_data[param_name].append(value)
-
-    def finalize(self):
-        self.write_line("Statistics:")
-        for param_name, values in self.sample_data.items():
-            self.write_line("\t%s:" % param_name)
-            self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
-            self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
-        self.write_line("-----")
-        end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
-        self.write_line("Finished on %s" % end_time)
-        self.text_file.close()
-
-
-def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
-    dataset_root = datasets_root.joinpath(dataset_name)
-    if not dataset_root.exists():
-        print("Couldn\'t find %s, skipping this dataset." % dataset_root)
-        return None, None
-    return dataset_root, DatasetLog(out_dir, dataset_name)
-
-
-def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, skip_existing: bool):
-    # Give a name to the speaker that includes its dataset
-    speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
-
-    # Create an output directory with that name, as well as a txt file containing a
-    # reference to each source file.
-    speaker_out_dir = out_dir.joinpath(speaker_name)
-    speaker_out_dir.mkdir(exist_ok=True)
-    sources_fpath = speaker_out_dir.joinpath("_sources.txt")
-
-    # There's a possibility that the preprocessing was interrupted earlier, check if
-    # there already is a sources file.
-    if sources_fpath.exists():
-        try:
-            with sources_fpath.open("r") as sources_file:
-                existing_fnames = {line.split(",")[0] for line in sources_file}
-        except:
-            existing_fnames = {}
-    else:
-        existing_fnames = {}
-
-    # Gather all audio files for that speaker recursively
-    sources_file = sources_fpath.open("a" if skip_existing else "w")
-    audio_durs = []
-    for extension in _AUDIO_EXTENSIONS:
-        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
-            # Check if the target output file already exists
-            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
-            out_fname = out_fname.replace(".%s" % extension, ".npy")
-            if skip_existing and out_fname in existing_fnames:
-                continue
-
-            # Load and preprocess the waveform
-            wav = audio.preprocess_wav(in_fpath)
-            if len(wav) == 0:
-                continue
-
-            # Create the mel spectrogram, discard those that are too short
-            frames = audio.wav_to_mel_spectrogram(wav)
-            if len(frames) < partials_n_frames:
-                continue
-
-            out_fpath = speaker_out_dir.joinpath(out_fname)
-            np.save(out_fpath, frames)
-            sources_file.write("%s,%s\n" % (out_fname, in_fpath))
-            audio_durs.append(len(wav) / sampling_rate)
-
-    sources_file.close()
-
-    return audio_durs
-
-
-def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger):
-    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
-
-    # Process the utterances for each speaker
-    work_fn = partial(_preprocess_speaker, datasets_root=datasets_root, out_dir=out_dir, skip_existing=skip_existing)
-    with Pool(4) as pool:
-        tasks = pool.imap(work_fn, speaker_dirs)
-        for sample_durs in tqdm(tasks, dataset_name, len(speaker_dirs), unit="speakers"):
-            for sample_dur in sample_durs:
-                logger.add_sample(duration=sample_dur)
-
-    logger.finalize()
-    print("Done preprocessing %s.\n" % dataset_name)
-
-
-def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
-    for dataset_name in librispeech_datasets["train"]["other"]:
-        # Initialize the preprocessing
-        dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
-        if not dataset_root:
-            return
-
-        # Preprocess all speakers
-        speaker_dirs = list(dataset_root.glob("*"))
-        _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)
-
-
-def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
-    # Initialize the preprocessing
-    dataset_name = "VoxCeleb1"
-    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
-    if not dataset_root:
-        return
-
-    # Get the contents of the meta file
-    with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
-        metadata = [line.split("\t") for line in metafile][1:]
-
-    # Select the ID and the nationality, filter out non-anglophone speakers
-    nationalities = {line[0]: line[3] for line in metadata}
-    keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if
-                        nationality.lower() in anglophone_nationalites]
-    print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
-          (len(keep_speaker_ids), len(nationalities)))
-
-    # Get the speaker directories for anglophone speakers only
-    speaker_dirs = dataset_root.joinpath("wav").glob("*")
-    speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
-                    speaker_dir.name in keep_speaker_ids]
-    print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
-          (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))
-
-    # Preprocess all speakers
-    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)
-
-
-def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
-    # Initialize the preprocessing
-    dataset_name = "VoxCeleb2"
-    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
-    if not dataset_root:
-        return
-
-    # Get the speaker directories
-    # Preprocess all speakers
-    speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
-    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)
encoder/train.py
DELETED
@@ -1,125 +0,0 @@
|
|
1 |
-
from pathlib import Path
|
2 |
-
|
3 |
-
import torch
|
4 |
-
|
5 |
-
from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
|
6 |
-
from encoder.model import SpeakerEncoder
|
7 |
-
from encoder.params_model import *
|
8 |
-
from encoder.visualizations import Visualizations
|
9 |
-
from utils.profiler import Profiler
|
10 |
-
|
11 |
-
|
12 |
-
def sync(device: torch.device):
|
13 |
-
# For correct profiling (cuda operations are async)
|
14 |
-
if device.type == "cuda":
|
15 |
-
torch.cuda.synchronize(device)
|
16 |
-
|
17 |
-
|
18 |
-
def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
|
19 |
-
backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
|
20 |
-
no_visdom: bool):
|
21 |
-
# Create a dataset and a dataloader
|
22 |
-
dataset = SpeakerVerificationDataset(clean_data_root)
|
23 |
-
loader = SpeakerVerificationDataLoader(
|
24 |
-
dataset,
|
25 |
-
speakers_per_batch,
|
26 |
-
utterances_per_speaker,
|
27 |
-
num_workers=4,
|
28 |
-
)
|
29 |
-
|
30 |
-
# Setup the device on which to run the forward pass and the loss. These can be different,
|
31 |
-
# because the forward pass is faster on the GPU whereas the loss is often (depending on your
|
32 |
-
# hyperparameters) faster on the CPU.
|
33 |
-
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
34 |
-
# FIXME: currently, the gradient is None if loss_device is cuda
|
35 |
-
loss_device = torch.device("cpu")
|
36 |
-
|
37 |
-
# Create the model and the optimizer
|
38 |
-
model = SpeakerEncoder(device, loss_device)
|
39 |
-
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
|
40 |
-
init_step = 1
|
41 |
-
|
42 |
-
# Configure file path for the model
|
43 |
-
model_dir = models_dir / run_id
|
44 |
-
model_dir.mkdir(exist_ok=True, parents=True)
|
45 |
-
state_fpath = model_dir / "encoder.pt"
|
46 |
-
|
47 |
-
# Load any existing model
|
48 |
-
if not force_restart:
|
49 |
-
if state_fpath.exists():
|
50 |
-
print("Found existing model \"%s\", loading it and resuming training." % run_id)
|
51 |
-
checkpoint = torch.load(state_fpath)
|
52 |
-
init_step = checkpoint["step"]
|
53 |
-
model.load_state_dict(checkpoint["model_state"])
|
54 |
-
optimizer.load_state_dict(checkpoint["optimizer_state"])
|
55 |
-
optimizer.param_groups[0]["lr"] = learning_rate_init
|
56 |
-
else:
|
57 |
-
print("No model \"%s\" found, starting training from scratch." % run_id)
|
58 |
-
else:
|
59 |
-
print("Starting the training from scratch.")
|
60 |
-
model.train()
|
61 |
-
|
62 |
-
# Initialize the visualization environment
|
63 |
-
vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
|
64 |
-
vis.log_dataset(dataset)
|
65 |
-
vis.log_params()
|
66 |
-
device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
|
67 |
-
vis.log_implementation({"Device": device_name})
|
68 |
-
|
69 |
-
# Training loop
|
70 |
-
profiler = Profiler(summarize_every=10, disabled=False)
|
71 |
-
for step, speaker_batch in enumerate(loader, init_step):
|
72 |
-
profiler.tick("Blocking, waiting for batch (threaded)")
|
73 |
-
|
74 |
-
# Forward pass
|
75 |
-
inputs = torch.from_numpy(speaker_batch.data).to(device)
|
76 |
-
sync(device)
|
77 |
-
profiler.tick("Data to %s" % device)
|
78 |
-
embeds = model(inputs)
|
79 |
-
sync(device)
|
80 |
-
profiler.tick("Forward pass")
|
81 |
-
embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
|
82 |
-
loss, eer = model.loss(embeds_loss)
|
83 |
-
sync(loss_device)
|
84 |
-
profiler.tick("Loss")
|
85 |
-
|
86 |
-
# Backward pass
|
87 |
-
model.zero_grad()
|
88 |
-
loss.backward()
|
89 |
-
profiler.tick("Backward pass")
|
90 |
-
model.do_gradient_ops()
|
91 |
-
optimizer.step()
|
92 |
-
profiler.tick("Parameter update")
|
93 |
-
|
94 |
-
# Update visualizations
|
95 |
-
# learning_rate = optimizer.param_groups[0]["lr"]
|
96 |
-
vis.update(loss.item(), eer, step)
|
97 |
-
|
98 |
-
# Draw projections and save them to the backup folder
|
99 |
-
if umap_every != 0 and step % umap_every == 0:
|
100 |
-
print("Drawing and saving projections (step %d)" % step)
|
101 |
-
projection_fpath = model_dir / f"umap_{step:06d}.png"
|
102 |
-
embeds = embeds.detach().cpu().numpy()
|
103 |
-
vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
|
104 |
-
vis.save()
|
105 |
-
|
106 |
-
# Overwrite the latest version of the model
|
107 |
-
if save_every != 0 and step % save_every == 0:
|
108 |
-
print("Saving the model (step %d)" % step)
|
109 |
-
torch.save({
|
110 |
-
"step": step + 1,
|
111 |
-
"model_state": model.state_dict(),
|
112 |
-
"optimizer_state": optimizer.state_dict(),
|
113 |
-
}, state_fpath)
|
114 |
-
|
115 |
-
# Make a backup
|
116 |
-
if backup_every != 0 and step % backup_every == 0:
|
117 |
-
print("Making a backup (step %d)" % step)
|
118 |
-
backup_fpath = model_dir / f"encoder_{step:06d}.bak"
|
119 |
-
torch.save({
|
120 |
-
"step": step + 1,
|
121 |
-
"model_state": model.state_dict(),
|
122 |
-
"optimizer_state": optimizer.state_dict(),
|
123 |
-
}, backup_fpath)
|
124 |
-
|
125 |
-
profiler.tick("Extras (visualizations, saving)")
|
encoder/visualizations.py
DELETED
@@ -1,179 +0,0 @@
|
|
1 |
-
from datetime import datetime
|
2 |
-
from time import perf_counter as timer
|
3 |
-
|
4 |
-
import numpy as np
|
5 |
-
import umap
|
6 |
-
import visdom
|
7 |
-
|
8 |
-
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
|
9 |
-
|
10 |
-
|
11 |
-
colormap = np.array([
|
12 |
-
[76, 255, 0],
|
13 |
-
[0, 127, 70],
|
14 |
-
[255, 0, 0],
|
15 |
-
[255, 217, 38],
|
16 |
-
[0, 135, 255],
|
17 |
-
[165, 0, 165],
|
18 |
-
[255, 167, 255],
|
19 |
-
[0, 255, 255],
|
20 |
-
[255, 96, 38],
|
21 |
-
[142, 76, 0],
|
22 |
-
[33, 0, 127],
|
23 |
-
[0, 0, 0],
|
24 |
-
[183, 183, 183],
|
25 |
-
], dtype=np.float) / 255
|
26 |
-
|
27 |
-
|
28 |
-
class Visualizations:
|
29 |
-
def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False):
|
30 |
-
# Tracking data
|
31 |
-
self.last_update_timestamp = timer()
|
32 |
-
self.update_every = update_every
|
33 |
-
self.step_times = []
|
34 |
-
self.losses = []
|
35 |
-
self.eers = []
|
36 |
-
print("Updating the visualizations every %d steps." % update_every)
|
37 |
-
|
38 |
-
# If visdom is disabled TODO: use a better paradigm for that
|
39 |
-
self.disabled = disabled
|
40 |
-
if self.disabled:
|
41 |
-
return
|
42 |
-
|
43 |
-
# Set the environment name
|
44 |
-
now = str(datetime.now().strftime("%d-%m %Hh%M"))
|
45 |
-
if env_name is None:
|
46 |
-
self.env_name = now
|
47 |
-
else:
|
48 |
-
self.env_name = "%s (%s)" % (env_name, now)
|
49 |
-
|
50 |
-
# Connect to visdom and open the corresponding window in the browser
|
51 |
-
try:
|
52 |
-
self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
|
53 |
-
except ConnectionError:
|
54 |
-
raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
|
55 |
-
"start it.")
|
56 |
-
# webbrowser.open("http://localhost:8097/env/" + self.env_name)
|
57 |
-
|
58 |
-
# Create the windows
|
59 |
-
self.loss_win = None
|
60 |
-
self.eer_win = None
|
61 |
-
# self.lr_win = None
|
62 |
-
self.implementation_win = None
|
63 |
-
self.projection_win = None
|
64 |
-
self.implementation_string = ""
|
65 |
-
|
66 |
-
def log_params(self):
|
67 |
-
if self.disabled:
|
68 |
-
return
|
69 |
-
from encoder import params_data
|
70 |
-
from encoder import params_model
|
71 |
-
param_string = "<b>Model parameters</b>:<br>"
|
72 |
-
for param_name in (p for p in dir(params_model) if not p.startswith("__")):
|
73 |
-
value = getattr(params_model, param_name)
|
74 |
-
param_string += "\t%s: %s<br>" % (param_name, value)
|
75 |
-
param_string += "<b>Data parameters</b>:<br>"
|
76 |
-
for param_name in (p for p in dir(params_data) if not p.startswith("__")):
|
77 |
-
value = getattr(params_data, param_name)
|
78 |
-
param_string += "\t%s: %s<br>" % (param_name, value)
|
79 |
-
self.vis.text(param_string, opts={"title": "Parameters"})
|
80 |
-
|
81 |
-
def log_dataset(self, dataset: SpeakerVerificationDataset):
|
82 |
-
if self.disabled:
|
83 |
-
return
|
84 |
-
dataset_string = ""
|
85 |
-
dataset_string += "<b>Speakers</b>: %s\n" % len(dataset.speakers)
|
86 |
-
dataset_string += "\n" + dataset.get_logs()
|
87 |
-
dataset_string = dataset_string.replace("\n", "<br>")
|
88 |
-
self.vis.text(dataset_string, opts={"title": "Dataset"})
|
89 |
-
|
90 |
-
def log_implementation(self, params):
|
91 |
-
if self.disabled:
|
92 |
-
return
|
93 |
-
implementation_string = ""
|
94 |
-
for param, value in params.items():
|
95 |
-
implementation_string += "<b>%s</b>: %s\n" % (param, value)
|
96 |
-
implementation_string = implementation_string.replace("\n", "<br>")
|
97 |
-
self.implementation_string = implementation_string
|
98 |
-
self.implementation_win = self.vis.text(
|
99 |
-
implementation_string,
|
100 |
-
opts={"title": "Training implementation"}
|
101 |
-
)
|
102 |
-
|
103 |
-
def update(self, loss, eer, step):
|
104 |
-
# Update the tracking data
|
105 |
-
now = timer()
|
106 |
-
self.step_times.append(1000 * (now - self.last_update_timestamp))
|
107 |
-
self.last_update_timestamp = now
|
108 |
-
self.losses.append(loss)
|
109 |
-
self.eers.append(eer)
|
110 |
-
print(".", end="")
|
111 |
-
|
112 |
-
# Update the plots every <update_every> steps
|
113 |
-
if step % self.update_every != 0:
|
114 |
-
return
|
115 |
-
time_string = "Step time: mean: %5dms std: %5dms" % \
|
116 |
-
(int(np.mean(self.step_times)), int(np.std(self.step_times)))
|
117 |
-
print("\nStep %6d Loss: %.4f EER: %.4f %s" %
|
118 |
-
(step, np.mean(self.losses), np.mean(self.eers), time_string))
|
119 |
-
if not self.disabled:
|
120 |
-
self.loss_win = self.vis.line(
|
121 |
-
[np.mean(self.losses)],
|
122 |
-
[step],
|
123 |
-
win=self.loss_win,
|
124 |
-
update="append" if self.loss_win else None,
|
125 |
-
opts=dict(
|
126 |
-
legend=["Avg. loss"],
|
127 |
-
xlabel="Step",
|
128 |
-
ylabel="Loss",
|
129 |
-
title="Loss",
|
130 |
-
)
|
131 |
-
)
|
132 |
-
self.eer_win = self.vis.line(
|
133 |
-
[np.mean(self.eers)],
|
134 |
-
[step],
|
135 |
-
win=self.eer_win,
|
136 |
-
update="append" if self.eer_win else None,
|
137 |
-
opts=dict(
|
138 |
-
legend=["Avg. EER"],
|
139 |
-
xlabel="Step",
|
140 |
-
ylabel="EER",
|
141 |
-
title="Equal error rate"
|
142 |
-
)
|
143 |
-
)
|
144 |
-
if self.implementation_win is not None:
|
145 |
-
self.vis.text(
|
146 |
-
self.implementation_string + ("<b>%s</b>" % time_string),
|
147 |
-
win=self.implementation_win,
|
148 |
-
opts={"title": "Training implementation"},
|
149 |
-
)
|
150 |
-
|
151 |
-
# Reset the tracking
|
152 |
-
self.losses.clear()
|
153 |
-
self.eers.clear()
|
154 |
-
self.step_times.clear()
|
155 |
-
|
156 |
-
def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, max_speakers=10):
|
157 |
-
import matplotlib.pyplot as plt
|
158 |
-
|
159 |
-
max_speakers = min(max_speakers, len(colormap))
|
160 |
-
embeds = embeds[:max_speakers * utterances_per_speaker]
|
161 |
-
|
162 |
-
n_speakers = len(embeds) // utterances_per_speaker
|
163 |
-
ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
|
164 |
-
colors = [colormap[i] for i in ground_truth]
|
165 |
-
|
166 |
-
reducer = umap.UMAP()
|
167 |
-
projected = reducer.fit_transform(embeds)
|
168 |
-
plt.scatter(projected[:, 0], projected[:, 1], c=colors)
|
169 |
-
plt.gca().set_aspect("equal", "datalim")
|
170 |
-
plt.title("UMAP projection (step %d)" % step)
|
171 |
-
if not self.disabled:
|
172 |
-
self.projection_win = self.vis.matplot(plt, win=self.projection_win)
|
173 |
-
if out_fpath is not None:
|
174 |
-
plt.savefig(out_fpath)
|
175 |
-
plt.clf()
|
176 |
-
|
177 |
-
def save(self):
|
178 |
-
if not self.disabled:
|
179 |
-
self.vis.save([self.env_name])
|
encoderCoren.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:39373b86598fa3da9fcddee6142382efe09777e8d37dc9c0561f41f0070f134e
-size 17090379
{hifigan → hifi-gan}/LICENSE RENAMED
File without changes
hifi-gan/README.md ADDED
@@ -0,0 +1,105 @@
+# HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis
+
+### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae
+
+In our [paper](https://arxiv.org/abs/2010.05646),
+we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.<br/>
+We provide our implementation and pretrained models as open source in this repository.
+
+**Abstract :**
+Several recent work on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms.
+Although such methods improve the sampling efficiency and memory usage,
+their sample quality has not yet reached that of autoregressive and flow-based generative models.
+In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis.
+As speech audio consists of sinusoidal signals with various periods,
+we demonstrate that modeling periodic patterns of an audio is crucial for enhancing sample quality.
+A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method
+demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than
+real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen
+speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times
+faster than real-time on CPU with comparable quality to an autoregressive counterpart.
+
+Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples.
+
+
+## Pre-requisites
+1. Python >= 3.6
+2. Clone this repository.
+3. Install python requirements. Please refer [requirements.txt](requirements.txt)
+4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/).
+And move all wav files to `LJSpeech-1.1/wavs`
+
+
+## Training
+```
+python train.py --config config_v1.json
+```
+To train V2 or V3 Generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.<br>
+Checkpoints and copy of the configuration file are saved in `cp_hifigan` directory by default.<br>
+You can change the path by adding `--checkpoint_path` option.
+
+Validation loss during training with V1 generator.<br>
+![validation loss](./validation_loss.png)
+
+## Pretrained Model
+You can also use pretrained models we provide.<br/>
+[Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)<br/>
+Details of each folder are as in follows:
+
+|Folder Name|Generator|Dataset|Fine-Tuned|
+|------|---|---|---|
+|LJ_V1|V1|LJSpeech|No|
+|LJ_V2|V2|LJSpeech|No|
+|LJ_V3|V3|LJSpeech|No|
+|LJ_FT_T2_V1|V1|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
+|LJ_FT_T2_V2|V2|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
+|LJ_FT_T2_V3|V3|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
+|VCTK_V1|V1|VCTK|No|
+|VCTK_V2|V2|VCTK|No|
+|VCTK_V3|V3|VCTK|No|
+|UNIVERSAL_V1|V1|Universal|No|
+
+We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets.
+
+## Fine-Tuning
+1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.<br/>
+The file name of the generated mel-spectrogram should match the audio file and the extension should be `.npy`.<br/>
+Example:
+```
+Audio File : LJ001-0001.wav
+Mel-Spectrogram File : LJ001-0001.npy
+```
+2. Create `ft_dataset` folder and copy the generated mel-spectrogram files into it.<br/>
+3. Run the following command.
+```
+python train.py --fine_tuning True --config config_v1.json
+```
+For other command line options, please refer to the training section.
+
+
+## Inference from wav file
+1. Make `test_files` directory and copy wav files into the directory.
+2. Run the following command.
+```
+python inference.py --checkpoint_file [generator checkpoint file path]
+```
+Generated wav files are saved in `generated_files` by default.<br>
+You can change the path by adding `--output_dir` option.
+
+
+## Inference for end-to-end speech synthesis
+1. Make `test_mel_files` directory and copy generated mel-spectrogram files into the directory.<br>
+You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2),
+[Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth.
+2. Run the following command.
+```
+python inference_e2e.py --checkpoint_file [generator checkpoint file path]
+```
+Generated wav files are saved in `generated_files_from_mel` by default.<br>
+You can change the path by adding `--output_dir` option.
+
+
+## Acknowledgements
+We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips)
+and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this.
+
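The README's fine-tuning step requires each teacher-forced mel-spectrogram to share its basename with the source wav and use the `.npy` extension inside an `ft_dataset` folder. A minimal sketch of that naming rule follows; the random array merely stands in for a real Tacotron2 mel, and the two LJSpeech filenames are the README's own example.

```python
# Sketch of the fine-tuning file-naming rule: LJ001-0001.wav -> LJ001-0001.npy.
# The mel here is random data standing in for a teacher-forced Tacotron2 output.
import os

import numpy as np

ft_dir = "ft_dataset"  # folder the README asks you to create
os.makedirs(ft_dir, exist_ok=True)

for wav_name in ["LJ001-0001.wav", "LJ001-0002.wav"]:
    mel = np.random.randn(80, 200).astype(np.float32)   # (n_mels, frames), illustrative
    base = os.path.splitext(wav_name)[0]
    np.save(os.path.join(ft_dir, base + ".npy"), mel)    # saved as LJ001-0001.npy, ...
```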
diagrams/apple.txt → hifi-gan/apple.py RENAMED
File without changes
{hifigan → hifi-gan}/env.py RENAMED
File without changes
{hifigan → hifi-gan}/inference.py RENAMED
@@ -6,9 +6,9 @@ import argparse
 import json
 import torch
 from scipy.io.wavfile import write
-from
-from
-from
+from env import AttrDict
+from meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
+from models import Generator
 
 h = None
 device = None
{hifigan → hifi-gan}/inference_e2e.py RENAMED
@@ -1,13 +1,15 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
+import glob
 import os
 import numpy as np
+import argparse
 import json
 import torch
 from scipy.io.wavfile import write
-from
-from
-from
+from env import AttrDict
+from meldataset import MAX_WAV_VALUE
+from models import Generator
 
 h = None
 device = None
@@ -21,9 +23,50 @@ def load_checkpoint(filepath, device):
     return checkpoint_dict
 
 
-def
+def scan_checkpoint(cp_dir, prefix):
+    pattern = os.path.join(cp_dir, prefix + '*')
+    cp_list = glob.glob(pattern)
+    if len(cp_list) == 0:
+        return ''
+    return sorted(cp_list)[-1]
+
+
+def inference(a):
+    generator = Generator(h).to(device)
+
+    state_dict_g = load_checkpoint(a.checkpoint_file, device)
+    generator.load_state_dict(state_dict_g['generator'])
+
+    filelist = os.listdir(a.input_mels_dir)
+
+    os.makedirs(a.output_dir, exist_ok=True)
+
+    generator.eval()
+    generator.remove_weight_norm()
+    with torch.no_grad():
+        for i, filname in enumerate(filelist):
+            x = np.load(os.path.join(a.input_mels_dir, filname))
+            x = torch.FloatTensor(x).to(device)
+            y_g_hat = generator(x)
+            audio = y_g_hat.squeeze()
+            audio = audio * MAX_WAV_VALUE
+            audio = audio.cpu().numpy().astype('int16')
+
+            output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + '_generated_e2e.wav')
+            write(output_file, h.sampling_rate, audio)
+            print(output_file)
+
+
+def main():
     print('Initializing Inference Process..')
-
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_mels_dir', default='test_mel_files')
+    parser.add_argument('--output_dir', default='generated_files_from_mel')
+    parser.add_argument('--checkpoint_file', required=True)
+    a = parser.parse_args()
+
+    config_file = os.path.join(os.path.split(a.checkpoint_file)[0], 'config.json')
     with open(config_file) as f:
         data = f.read()
 
@@ -31,10 +74,6 @@ def hifi_gan_inference(input_mel, checkpoint_file):
     json_config = json.loads(data)
     h = AttrDict(json_config)
 
-    # Set MAX_WAV_VALUE if not present
-    if 'MAX_WAV_VALUE' not in h:
-        h.MAX_WAV_VALUE = 32768.0  # Adjust this value based on your requirements
-
     torch.manual_seed(h.seed)
     global device
     if torch.cuda.is_available():
@@ -43,34 +82,9 @@ def hifi_gan_inference(input_mel, checkpoint_file):
     else:
         device = torch.device('cpu')
 
-
 
-    state_dict_g = load_checkpoint(checkpoint_file, device)
-    generator.load_state_dict(state_dict_g['generator'])
 
-
-
-    # Load data from BytesIO
-    buffer = BytesIO(input_mel)
-    x = np.load(buffer)
-
-    x = torch.FloatTensor(x).to(device)
-    y_g_hat = generator(x)
-
-    # Detach tensor before converting to numpy
-    audio = y_g_hat.squeeze().detach().numpy()
-
-    # Set MAX_WAV_VALUE if not present
-    if 'MAX_WAV_VALUE' not in h:
-        h.MAX_WAV_VALUE = 32768.0  # Adjust this value based on your requirements
-
-    audio = audio * h.MAX_WAV_VALUE
-    audio = audio.astype('int16')
-
-    # Save audio to BytesIO
-    output_buffer = BytesIO()
-    write(output_buffer, h.sampling_rate, audio)
-
-    return output_buffer.getvalue()
-
+    inference(a)
+
+
+if __name__ == '__main__':
+    main()
+
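The rewritten `inference_e2e.py` converts the generator's float output to 16-bit PCM by scaling with `MAX_WAV_VALUE` before handing it to `scipy.io.wavfile.write`. The sketch below isolates just that conversion; the sine wave stands in for a HiFi-GAN output, and the 32768.0 constant mirrors `MAX_WAV_VALUE` from `meldataset.py`.

```python
# Sketch of the float -> int16 conversion done before scipy.io.wavfile.write.
import numpy as np
from scipy.io.wavfile import write

MAX_WAV_VALUE = 32768.0
sampling_rate = 22050

t = np.linspace(0, 1.0, sampling_rate, endpoint=False)
audio = 0.5 * np.sin(2 * np.pi * 440.0 * t)          # float waveform in [-1, 1]
audio = (audio * MAX_WAV_VALUE).astype('int16')      # scale to 16-bit PCM
write("tone_generated_e2e.wav", sampling_rate, audio)
```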
{hifigan → hifi-gan}/meldataset.py RENAMED
File without changes
{hifigan → hifi-gan}/models.py RENAMED
@@ -3,7 +3,7 @@ import torch.nn.functional as F
 import torch.nn as nn
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-from
+from utils import init_weights, get_padding
 
 LRELU_SLOPE = 0.1
 
hifi-gan/requirements.txt ADDED
@@ -0,0 +1,7 @@
+torch==1.4.0
+numpy==1.17.4
+librosa==0.7.2
+scipy==1.4.1
+tensorboard==2.0
+soundfile==0.10.3.post1
+matplotlib==3.1.3
{hifigan → hifi-gan}/train.py RENAMED
@@ -12,11 +12,11 @@ from torch.utils.data import DistributedSampler, DataLoader
 import torch.multiprocessing as mp
 from torch.distributed import init_process_group
 from torch.nn.parallel import DistributedDataParallel
-from
-from
-from
+from env import AttrDict, build_env
+from meldataset import MelDataset, mel_spectrogram, get_dataset_filelist
+from models import Generator, MultiPeriodDiscriminator, MultiScaleDiscriminator, feature_loss, generator_loss,\
     discriminator_loss
-from
+from utils import plot_spectrogram, scan_checkpoint, load_checkpoint, save_checkpoint
 
 torch.backends.cudnn.benchmark = True
 
hifigan/hifigan_utils.py → hifi-gan/utils.py RENAMED
File without changes
hparams.py CHANGED
@@ -61,7 +61,6 @@ def create_hparams(hparams_string=None, verbose=False):
         "encoder_kernel_size":5,
         "encoder_n_convolutions":3,
         "encoder_embedding_dim":512,
-        "speaker_embedding_dim":256,
 
         # Decoder parameters
         "n_frames_per_step":1, # currently only 1 is supported
kaggle_12000.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:27d4936bff68d3fe37053ec3110486bdea9f23bf137f07477c28bbd4f36b85ae
-size 338426303
logger.py ADDED
@@ -0,0 +1,48 @@
+import random
+import torch
+from torch.utils.tensorboard import SummaryWriter
+from plotting_utils import plot_alignment_to_numpy, plot_spectrogram_to_numpy
+from plotting_utils import plot_gate_outputs_to_numpy
+
+
+class Tacotron2Logger(SummaryWriter):
+    def __init__(self, logdir):
+        super(Tacotron2Logger, self).__init__(logdir)
+
+    def log_training(self, reduced_loss, grad_norm, learning_rate, duration,
+                     iteration):
+        self.add_scalar("training.loss", reduced_loss, iteration)
+        self.add_scalar("grad.norm", grad_norm, iteration)
+        self.add_scalar("learning.rate", learning_rate, iteration)
+        self.add_scalar("duration", duration, iteration)
+
+    def log_validation(self, reduced_loss, model, y, y_pred, iteration):
+        self.add_scalar("validation.loss", reduced_loss, iteration)
+        _, mel_outputs, gate_outputs, alignments = y_pred
+        mel_targets, gate_targets = y
+
+        # plot distribution of parameters
+        for tag, value in model.named_parameters():
+            tag = tag.replace('.', '/')
+            self.add_histogram(tag, value.data.cpu().numpy(), iteration)
+
+        # plot alignment, mel target and predicted, gate target and predicted
+        idx = random.randint(0, alignments.size(0) - 1)
+        self.add_image(
+            "alignment",
+            plot_alignment_to_numpy(alignments[idx].data.cpu().numpy().T),
+            iteration, dataformats='HWC')
+        self.add_image(
+            "mel_target",
+            plot_spectrogram_to_numpy(mel_targets[idx].data.cpu().numpy()),
+            iteration, dataformats='HWC')
+        self.add_image(
+            "mel_predicted",
+            plot_spectrogram_to_numpy(mel_outputs[idx].data.cpu().numpy()),
+            iteration, dataformats='HWC')
+        self.add_image(
+            "gate",
+            plot_gate_outputs_to_numpy(
+                gate_targets[idx].data.cpu().numpy(),
+                torch.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
+            iteration, dataformats='HWC')
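The added `Tacotron2Logger` is a thin wrapper over TensorBoard's `SummaryWriter`. A minimal sketch of driving its training-side scalars follows; the log directory and the dummy scalar values are illustrative, and it assumes the repo's `plotting_utils` module and a TensorBoard installation are importable.

```python
# Illustrative only: feeds the added Tacotron2Logger a few dummy scalars.
from logger import Tacotron2Logger

tb_logger = Tacotron2Logger("logs/tacotron2_run")  # assumed log directory
for iteration in range(1, 4):
    reduced_loss = 1.0 / iteration   # stand-in training loss
    grad_norm = 0.5                  # stand-in gradient norm
    learning_rate = 1e-3
    duration = 0.8                   # seconds spent on this iteration
    tb_logger.log_training(reduced_loss, grad_norm, learning_rate,
                           duration, iteration)
tb_logger.close()
```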
logic.py CHANGED
@@ -3,28 +3,15 @@ import numpy as np
 import torch
 import base64
 import io
-from io import BytesIO
 import matplotlib.pyplot as plt
 from hparams import create_hparams
 from model import Tacotron2
-from layers import TacotronSTFT
 from train import load_model
 from text import text_to_sequence
-from utils import load_wav_to_torch
 import os
-import
-import librosa
+import subprocess
 import librosa.display
 
-use_cuda = torch.cuda.is_available()
-device = torch.device('cuda' if use_cuda else 'cpu')
-
-hparams = create_hparams()
-hparams.sampling_rate = 22050
-stft = TacotronSTFT(
-    hparams.filter_length, hparams.hop_length, hparams.win_length, hparams.n_mel_channels,
-    hparams.sampling_rate, hparams.mel_fmin, hparams.mel_fmax).to(device)
-
 # Function to plot data
 def plot_data(data, figsize=(16, 4), titles=['Mel Spectrogram (Original)', 'Mel Spectrogram (Postnet)', 'Alignment'],
               xlabel=['Time Steps', 'Time Steps', 'Decoder Time Steps'],
@@ -55,84 +42,59 @@ def plot_data(data, figsize=(16, 4), titles=['Mel Spectrogram (Original)', 'Mel
     return img_base64
 
 #Function to plot timedomain waveform
-def plot_waveforms(
-    # Load
-
-    y, sr = librosa.load(buffer, sr=None)
-
-    # Create waveform plot
-    plt.figure(figsize=(10, 4))
-    librosa.display.waveshow(y, sr=sr)
-    plt.xlabel("Time (s)")
-    plt.ylabel("Amplitude")
-    plt.title("Waveform")
-
-    # Save the plot to a BytesIO object
-    wave_buffer = BytesIO()
-    plt.savefig(wave_buffer, format="png")
-    wave_buffer.seek(0)
-    plt.close()
-
-    # Encode the plot as base64
-    wave_base64 = base64.b64encode(wave_buffer.read()).decode('utf-8')
-
-    return wave_base64
+def plot_waveforms(audio_file, sr=22050):
+    # Load audio waveform
+    y, sr = librosa.load(audio_file, sr=sr)
+
+    # Create time vector
+    time = librosa.times_like(y, sr=sr)
+
+    # Plot the waveform
+    plt.figure(figsize=(16, 4))
+    librosa.display.waveshow(y, sr=sr)
+    plt.title('Time vs Amplitude')
+    plt.xlabel('Time (s)')
+    plt.ylabel('Amplitude')
+
+    plt.tight_layout()
+    # plt.savefig('static/waveform.png')
+    img_buffer = io.BytesIO()
+    plt.savefig(img_buffer, format='png', bbox_inches='tight', pad_inches=0)
+    plt.close()
+
+    img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
+
+    return img_base64
 
-#
-
-
-
-
-
-model = SpeakerEncoder(device, loss_device)
-speaker_dict = torch.load(speaker_model_path, map_location='cpu')
-model.load_state_dict(speaker_dict)
-
-
-
-
-
-
-def extract_speech_embedding(audio_path: str):
-    audio, sampling_rate = load_wav_to_torch(audio_path)
-    if sampling_rate != stft.sampling_rate:
-        raise ValueError("{} SR doesn't match target {} SR".format(sampling_rate, stft.sampling_rate))
-
-    audio_norm = audio / 32768.0
-    audio_norm = audio_norm.unsqueeze(0)
-    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False).to(device)
-    melspec = stft.mel_spectrogram(audio_norm).transpose(1,2).float()
-
-    if melspec.shape[1] <= 128:
-        mel_slice = mel
-    else:
-        slice_start = random.randint(0,melspec.shape[1]-128)
-        mel_slice = melspec[:,slice_start:slice_start+128]
-    speaker_embedding = speaker_model(mel_slice)
-    return speaker_embedding
-
 def synthesize_voice(text_input, checkpoint_path):
-    # Load Tacotron2 model
-
-
-    model.load_state_dict(checkpoint['state_dict'])
-    model = model.to(device).eval().float()
+    # Load Tacotron2 model
+    hparams = create_hparams()
+    hparams.sampling_rate = 22050
+
+    # Load model from checkpoint
+    model = load_model(hparams)
+    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
+    model = model.cuda().eval().half()
 
     # Nepali text
-    speaker_audio_path='speaker_audio/ariana.wav'
     sequence = np.array(text_to_sequence(text_input, ['transliteration_cleaners']))[None, :]
-    sequence = torch.autograd.Variable(torch.from_numpy(sequence)).
-
-
+    sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
+
     # Melspectrogram and Alignment graph
-    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence
+    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
     mel_output_data = mel_outputs.data.cpu().numpy()[0]
     mel_output_postnet_data = mel_outputs_postnet.data.cpu().numpy()[0]
     alignments_data = alignments.data.cpu().numpy()[0].T
 
-
+    np.save('mel_files/mel1'+'.npy', mel_output_data)
+
+    input_mels_dir = 'mel_files/'
+    output_dir = 'audio_output/'
+    run_hifigan_inference(input_mels_dir, output_dir)
+
+    return mel_output_data, mel_output_postnet_data, alignments_data
+
+
+def run_hifigan_inference(input_mels_dir, output_dir):
+    script_path = os.path.join(os.path.dirname("hifigan/"), "inference_e2e.py") # Assuming both scripts are in the same directory
+    subprocess.run(["python", script_path, "--checkpoint_file", "generator_v1", "--input_mels_dir", input_mels_dir, "--output_dir", output_dir])
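The new `synthesize_voice` hands the mel off to the vocoder by writing it as `.npy` and launching `inference_e2e.py` in a child process. Below is a minimal, self-contained sketch of that hand-off; the script path points at `hifi-gan/inference_e2e.py` (the diff itself joins the path under `hifigan/`), and the `generator_v1` checkpoint is taken from the diff but must exist locally for the child process to succeed.

```python
# Sketch of the mel -> HiFi-GAN hand-off pattern used in logic.py.
import os
import subprocess

import numpy as np


def vocode_mel(mel, mels_dir="mel_files", wavs_dir="audio_output",
               checkpoint="generator_v1", script="hifi-gan/inference_e2e.py"):
    os.makedirs(mels_dir, exist_ok=True)
    np.save(os.path.join(mels_dir, "mel1.npy"), mel)
    # inference_e2e.py scans --input_mels_dir and writes *_generated_e2e.wav files.
    cmd = ["python", script,
           "--checkpoint_file", checkpoint,
           "--input_mels_dir", mels_dir,
           "--output_dir", wavs_dir]
    return subprocess.run(cmd, check=False).returncode
```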
loss_function.py ADDED
@@ -0,0 +1,19 @@
+from torch import nn
+
+
+class Tacotron2Loss(nn.Module):
+    def __init__(self):
+        super(Tacotron2Loss, self).__init__()
+
+    def forward(self, model_output, targets):
+        mel_target, gate_target = targets[0], targets[1]
+        mel_target.requires_grad = False
+        gate_target.requires_grad = False
+        gate_target = gate_target.view(-1, 1)
+
+        mel_out, mel_out_postnet, gate_out, _ = model_output
+        gate_out = gate_out.view(-1, 1)
+        mel_loss = nn.MSELoss()(mel_out, mel_target) + \
+            nn.MSELoss()(mel_out_postnet, mel_target)
+        gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
+        return mel_loss + gate_loss
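The added `Tacotron2Loss` sums an MSE term over the pre- and post-net mel outputs with a BCE-with-logits term over the stop gate. A quick sanity check with random tensors is sketched below; the shapes (batch 2, 80 mel channels, 100 frames) are illustrative only, not taken from the project's hparams.

```python
# Minimal check of the added Tacotron2Loss with random tensors.
import torch
from loss_function import Tacotron2Loss

criterion = Tacotron2Loss()
mel_target = torch.randn(2, 80, 100)
gate_target = torch.ones(2, 100)
mel_out = torch.randn(2, 80, 100)
mel_out_postnet = torch.randn(2, 80, 100)
gate_out = torch.randn(2, 100)
alignments = torch.randn(2, 100, 40)   # unused by the loss

loss = criterion((mel_out, mel_out_postnet, gate_out, alignments),
                 (mel_target, gate_target))
print(float(loss))
```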
loss_scaler.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
class LossScaler:
|
4 |
+
|
5 |
+
def __init__(self, scale=1):
|
6 |
+
self.cur_scale = scale
|
7 |
+
|
8 |
+
# `params` is a list / generator of torch.Variable
|
9 |
+
def has_overflow(self, params):
|
10 |
+
return False
|
11 |
+
|
12 |
+
# `x` is a torch.Tensor
|
13 |
+
def _has_inf_or_nan(x):
|
14 |
+
return False
|
15 |
+
|
16 |
+
# `overflow` is boolean indicating whether we overflowed in gradient
|
17 |
+
def update_scale(self, overflow):
|
18 |
+
pass
|
19 |
+
|
20 |
+
@property
|
21 |
+
def loss_scale(self):
|
22 |
+
return self.cur_scale
|
23 |
+
|
24 |
+
def scale_gradient(self, module, grad_in, grad_out):
|
25 |
+
return tuple(self.loss_scale * g for g in grad_in)
|
26 |
+
|
27 |
+
def backward(self, loss):
|
28 |
+
scaled_loss = loss*self.loss_scale
|
29 |
+
scaled_loss.backward()
|
30 |
+
|
31 |
+
class DynamicLossScaler:
|
32 |
+
|
33 |
+
def __init__(self,
|
34 |
+
init_scale=2**32,
|
35 |
+
scale_factor=2.,
|
36 |
+
scale_window=1000):
|
37 |
+
self.cur_scale = init_scale
|
38 |
+
self.cur_iter = 0
|
39 |
+
self.last_overflow_iter = -1
|
40 |
+
self.scale_factor = scale_factor
|
41 |
+
self.scale_window = scale_window
|
42 |
+
|
43 |
+
# `params` is a list / generator of torch.Variable
|
44 |
+
def has_overflow(self, params):
|
45 |
+
# return False
|
46 |
+
for p in params:
|
47 |
+
if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data):
|
48 |
+
return True
|
49 |
+
|
50 |
+
return False
|
51 |
+
|
52 |
+
# `x` is a torch.Tensor
|
53 |
+
def _has_inf_or_nan(x):
|
54 |
+
cpu_sum = float(x.float().sum())
|
55 |
+
if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
|
56 |
+
return True
|
57 |
+
return False
|
58 |
+
|
59 |
+
# `overflow` is boolean indicating whether we overflowed in gradient
|
60 |
+
def update_scale(self, overflow):
|
61 |
+
if overflow:
|
62 |
+
#self.cur_scale /= self.scale_factor
|
63 |
+
self.cur_scale = max(self.cur_scale/self.scale_factor, 1)
|
64 |
+
self.last_overflow_iter = self.cur_iter
|
65 |
+
else:
|
66 |
+
if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
|
67 |
+
self.cur_scale *= self.scale_factor
|
68 |
+
# self.cur_scale = 1
|
69 |
+
self.cur_iter += 1
|
70 |
+
|
71 |
+
@property
|
72 |
+
def loss_scale(self):
|
73 |
+
return self.cur_scale
|
74 |
+
|
75 |
+
def scale_gradient(self, module, grad_in, grad_out):
|
76 |
+
return tuple(self.loss_scale * g for g in grad_in)
|
77 |
+
|
78 |
+
def backward(self, loss):
|
79 |
+
scaled_loss = loss*self.loss_scale
|
80 |
+
scaled_loss.backward()
|
81 |
+
|
82 |
+
##############################################################
|
83 |
+
# Example usage below here -- assuming it's in a separate file
|
84 |
+
##############################################################
|
85 |
+
if __name__ == "__main__":
|
86 |
+
import torch
|
87 |
+
from torch.autograd import Variable
|
88 |
+
from dynamic_loss_scaler import DynamicLossScaler
|
89 |
+
|
90 |
+
# N is batch size; D_in is input dimension;
|
91 |
+
# H is hidden dimension; D_out is output dimension.
|
92 |
+
N, D_in, H, D_out = 64, 1000, 100, 10
|
93 |
+
|
94 |
+
# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
|
95 |
+
x = Variable(torch.randn(N, D_in), requires_grad=False)
|
96 |
+
y = Variable(torch.randn(N, D_out), requires_grad=False)
|
97 |
+
|
98 |
+
w1 = Variable(torch.randn(D_in, H), requires_grad=True)
|
99 |
+
w2 = Variable(torch.randn(H, D_out), requires_grad=True)
|
100 |
+
parameters = [w1, w2]
|
101 |
+
|
102 |
+
learning_rate = 1e-6
|
103 |
+
optimizer = torch.optim.SGD(parameters, lr=learning_rate)
|
104 |
+
loss_scaler = DynamicLossScaler()
|
105 |
+
|
106 |
+
for t in range(500):
|
107 |
+
y_pred = x.mm(w1).clamp(min=0).mm(w2)
|
108 |
+
loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
|
109 |
+
print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
|
110 |
+
print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
|
111 |
+
print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))
|
112 |
+
|
113 |
+
# Run backprop
|
114 |
+
optimizer.zero_grad()
|
115 |
+
loss.backward()
|
116 |
+
|
117 |
+
# Check for overflow
|
118 |
+
has_overflow = DynamicLossScaler.has_overflow(parameters)
|
119 |
+
|
120 |
+
# If no overflow, unscale grad and update as usual
|
121 |
+
if not has_overflow:
|
122 |
+
for param in parameters:
|
123 |
+
param.grad.data.mul_(1. / loss_scaler.loss_scale)
|
124 |
+
optimizer.step()
|
125 |
+
# Otherwise, don't do anything -- ie, skip iteration
|
126 |
+
else:
|
127 |
+
print('OVERFLOW!')
|
128 |
+
|
129 |
+
# Update loss scale for next iteration
|
130 |
+
loss_scaler.update_scale(has_overflow)
|
131 |
+
|
model.py CHANGED
@@ -147,8 +147,13 @@ class Postnet(nn.Module):
 
 
 class Encoder(nn.Module):
+    """Encoder module:
+        - Three 1-d convolution banks
+        - Bidirectional LSTM
+    """
     def __init__(self, hparams):
         super(Encoder, self).__init__()
+
         convolutions = []
         for _ in range(hparams.encoder_n_convolutions):
             conv_layer = nn.Sequential(
@@ -165,15 +170,13 @@
                             int(hparams.encoder_embedding_dim / 2), 1,
                             batch_first=True, bidirectional=True)
 
-    def forward(self, x, input_lengths
-        # Modify the input x to concatenate the speaker embedding
-        x = torch.cat((x, speaker_embedding.unsqueeze(1).expand(-1, x.size(1), -1)), dim=-1)
-
+    def forward(self, x, input_lengths):
         for conv in self.convolutions:
             x = F.dropout(F.relu(conv(x)), 0.5, self.training)
 
         x = x.transpose(1, 2)
 
+        # pytorch tensor are not reversible, hence the conversion
        input_lengths = input_lengths.cpu().numpy()
         x = nn.utils.rnn.pack_padded_sequence(
             x, input_lengths, batch_first=True)
@@ -186,10 +189,7 @@
 
         return outputs
 
-    def inference(self, x
-        # Modify the input x to concatenate the speaker embedding
-        x = torch.cat((x, speaker_embedding.unsqueeze(1).expand(-1, x.size(1), -1)), dim=-1)
-
+    def inference(self, x):
         for conv in self.convolutions:
             x = F.dropout(F.relu(conv(x)), 0.5, self.training)
 
@@ -496,14 +496,13 @@
 
         return outputs
 
-    def forward(self, inputs
+    def forward(self, inputs):
         text_inputs, text_lengths, mels, max_len, output_lengths = inputs
         text_lengths, output_lengths = text_lengths.data, output_lengths.data
 
         embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
 
-
-        encoder_outputs = self.encoder(embedded_inputs, text_lengths, speaker_embedding)
+        encoder_outputs = self.encoder(embedded_inputs, text_lengths)
 
         mel_outputs, gate_outputs, alignments = self.decoder(
             encoder_outputs, mels, memory_lengths=text_lengths)
@@ -515,11 +514,9 @@
             [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
             output_lengths)
 
-    def inference(self, inputs
+    def inference(self, inputs):
         embedded_inputs = self.embedding(inputs).transpose(1, 2)
-
-        encoder_outputs = self.encoder.inference(embedded_inputs, speaker_embedding)
-
+        encoder_outputs = self.encoder.inference(embedded_inputs)
         mel_outputs, gate_outputs, alignments = self.decoder.inference(
             encoder_outputs)
 
@@ -530,4 +527,3 @@
             [mel_outputs, mel_outputs_postnet, gate_outputs, alignments])
 
         return outputs
-
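The rewritten `Encoder.forward` drops the speaker-embedding concatenation and keeps the standard pack/pad round trip around the bidirectional LSTM, which is why the lengths must be moved to CPU first. A standalone sketch of that round trip, with made-up sizes, is below.

```python
# Standalone sketch of the pack -> LSTM -> pad round trip the encoder relies on.
# Sizes (batch=3, T=10, feature=8, hidden=4) are illustrative only.
import torch
import torch.nn as nn

lstm = nn.LSTM(8, 4, 1, batch_first=True, bidirectional=True)
x = torch.randn(3, 10, 8)                  # (batch, time, features)
input_lengths = torch.tensor([10, 7, 4])   # true lengths, sorted descending

packed = nn.utils.rnn.pack_padded_sequence(x, input_lengths.cpu().numpy(),
                                           batch_first=True)
outputs, _ = lstm(packed)
outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
print(outputs.shape)   # (3, 10, 8): last dim is 2 * hidden for the bi-LSTM
```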
multiproc.py ADDED
@@ -0,0 +1,23 @@
+import time
+import torch
+import sys
+import subprocess
+
+argslist = list(sys.argv)[1:]
+num_gpus = torch.cuda.device_count()
+argslist.append('--n_gpus={}'.format(num_gpus))
+workers = []
+job_id = time.strftime("%Y_%m_%d-%H%M%S")
+argslist.append("--group_name=group_{}".format(job_id))
+
+for i in range(num_gpus):
+    argslist.append('--rank={}'.format(i))
+    stdout = None if i == 0 else open("logs/{}_GPU_{}.log".format(job_id, i),
+                                      "w")
+    print(argslist)
+    p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout)
+    workers.append(p)
+    argslist = argslist[:-1]
+
+for p in workers:
+    p.wait()
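The added `multiproc.py` launches one copy of the training command per visible GPU, appending `--n_gpus`, a shared `--group_name`, and a per-process `--rank`, then waits on all workers. The sketch below reproduces that fan-out with a harmless stand-in command so it can be run anywhere; the worker count and the inline script are illustrative.

```python
# Same per-process fan-out pattern, with a stand-in command instead of train.py.
import subprocess
import sys

num_workers = 2   # stands in for torch.cuda.device_count()
base_args = ["-c", "import sys; print('worker', sys.argv[1])"]

workers = []
for rank in range(num_workers):
    cmd = [sys.executable] + base_args + ["--rank={}".format(rank)]
    workers.append(subprocess.Popen(cmd))   # one child process per "GPU"

for p in workers:
    p.wait()
```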
requirements.txt CHANGED
@@ -1,8 +1,11 @@
-
+flask
+flask_cors
+typing
+fastapi
 gunicorn
-torch==1.12.1
-torchaudio==0.12.1
-torchvision==0.13.1
+torch==1.12.1
+torchaudio==0.12.1
+torchvision==0.13.1
 matplotlib==3.5.3
 numpy==1.18.5
 inflect
@@ -11,6 +14,4 @@ scipy==1.7.3
 tensorboard==2.11.2
 Unidecode
 pillow
-uvicorn
-httpx==0.19.0
---extra-index-url https://download.pytorch.org/whl/cu113
+uvicorn
saved_model.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6ccc0abcd0fb77104be73e6675454a06e7797bf1d4a1177181c32b648e9d75a9
-size 5697243
speaker/__init__.py DELETED
File without changes
speaker/bana.txt DELETED
File without changes
speaker/data.py
DELETED
@@ -1,109 +0,0 @@
|
|
1 |
-
import torch
|
2 |
-
import torchaudio.datasets as datasets
|
3 |
-
import torchaudio.transforms as transforms
|
4 |
-
from collections import defaultdict
|
5 |
-
import random
|
6 |
-
import layers
|
7 |
-
|
8 |
-
import warnings
|
9 |
-
|
10 |
-
class SpeakerMelLoader(torch.utils.data.Dataset):
|
11 |
-
"""
|
12 |
-
computes mel-spectrograms from audio file and pulls the speaker ID from the
|
13 |
-
dataset
|
14 |
-
"""
|
15 |
-
|
16 |
-
def __init__(self, dataset, format='speaker', speaker_utterances=4, mel_length = 128, mel_type = 'Tacotron'):
|
17 |
-
self.dataset = dataset
|
18 |
-
self.set_format(format)
|
19 |
-
self.speaker_utterances = speaker_utterances
|
20 |
-
self.mel_length = mel_length
|
21 |
-
self.mel_type = mel_type
|
22 |
-
self.mel_generators = dict()
|
23 |
-
|
24 |
-
def set_format(self,format):
|
25 |
-
self.format = format
|
26 |
-
|
27 |
-
if format == 'speaker':
|
28 |
-
self.create_speaker_index()
|
29 |
-
|
30 |
-
def create_speaker_index(self):
|
31 |
-
vals = [x.split('-',1) for x in self.dataset._walker]
|
32 |
-
speaker_map = defaultdict(list)
|
33 |
-
|
34 |
-
for i,v in enumerate(vals):
|
35 |
-
speaker_map[v[0]].append(i)
|
36 |
-
|
37 |
-
self.speaker_map = speaker_map
|
38 |
-
self.speaker_keys = list(speaker_map.keys())
|
39 |
-
|
40 |
-
def apply_mel_gen(self, waveform, sampling_rate, channels=80):
|
41 |
-
if (sampling_rate, channels) not in self.mel_generators:
|
42 |
-
if self.mel_type == 'MFCC':
|
43 |
-
mel_gen = transforms.MFCC(sample_rate=sampling_rate, n_mfcc=channels)
|
44 |
-
elif self.mel_type == 'Mel':
|
45 |
-
mel_gen = transforms.MelSpectrogram(sample_rate=sampling_rate, n_mels=channels)
|
46 |
-
elif self.mel_type == 'Tacotron':
|
47 |
-
mel_gen = layers.TacotronSTFT(sampling_rate=sampling_rate,n_mel_channels=channels)
|
48 |
-
else:
|
49 |
-
raise NotImplementedError('Unsupported mel_type in MelSpeakerLoader: '+self.mel_type)
|
50 |
-
self.mel_generators[(sampling_rate,channels)] = mel_gen
|
51 |
-
else:
|
52 |
-
mel_gen = self.mel_generators[(sampling_rate, channels)]
|
53 |
-
|
54 |
-
if self.mel_type == 'Tacotron':
|
55 |
-
#Replicating from Tacotron2 data loader
|
56 |
-
max_wav_value=32768.0
|
57 |
-
#skip normalization from Tacotron2, LibriSpeech data looks pre-normalized (all vals between 0-1)
|
58 |
-
audio_norm = waveform #/ max_wav_value
|
59 |
-
audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
|
60 |
-
melspec = mel_gen.mel_spectrogram(audio_norm)
|
61 |
-
else:
|
62 |
-
audio = waveform.unsqueeze(0)
|
63 |
-
audio = torch.autograd.Variable(audio, requires_grad=False)
|
64 |
-
melspec = mel_gen(audio)
|
65 |
-
|
66 |
-
return melspec
|
67 |
-
|
68 |
-
def get_mel(self, waveform, sampling_rate, channels=80):
|
69 |
-
# We previously identified that these warnings were ok.
|
70 |
-
with warnings.catch_warnings():
|
71 |
-
warnings.filterwarnings('ignore', message=r'At least one mel filterbank has all zero values.*', module=r'torchaudio.*')
|
72 |
-
melspec = self.apply_mel_gen(waveform, sampling_rate, channels)
|
73 |
-
# melspec is (1,1,channels, time) by default
|
74 |
-
# return (time, channels)
|
75 |
-
melspec = torch.squeeze(melspec).T
|
76 |
-
return melspec
|
77 |
-
|
78 |
-
def __getitem__(self, index):
|
79 |
-
if self.format == 'utterance':
|
80 |
-
(waveform, sample_rate, _, speaker_id, _, _) = self.dataset[index]
|
81 |
-
mel = self.get_mel(waveform, sample_rate)
|
82 |
-
return (speaker_id, mel)
|
83 |
-
elif self.format == 'speaker':
|
84 |
-
speaker_id = self.speaker_keys[index]
|
85 |
-
utter_indexes = random.sample(self.speaker_map[speaker_id], self.speaker_utterances)
|
86 |
-
mels = []
|
87 |
-
for i in utter_indexes:
|
88 |
-
(waveform, sample_rate, _, speaker_id, _, _) = self.dataset[i]
|
89 |
-
mel = self.get_mel(waveform, sample_rate)
|
90 |
-
if mel.shape[0] < self.mel_length:
|
91 |
-
#Zero pad mel on the right to mel_length
|
92 |
-
#pad_tuple is (dn start, dn end, dn-1 start, dn-1 end, ... , d1 start, d1 end)
|
93 |
-
pad_tuple = (0,0,0,self.mel_length-mel.shape[0])
|
94 |
-
mel=torch.nn.functional.pad(mel,pad_tuple)
|
95 |
-
mel_frame = 0
|
96 |
-
else:
|
97 |
-
mel_frame = random.randint(0,mel.shape[0]-self.mel_length)
|
98 |
-
mels.append(mel[mel_frame:mel_frame+self.mel_length,:])
|
99 |
-
return (speaker_id, torch.stack(mels,0))
|
100 |
-
else:
|
101 |
-
raise NotImplementedError()
|
102 |
-
|
103 |
-
def __len__(self):
|
104 |
-
if self.format == 'utterance':
|
105 |
-
return len(self.dataset)
|
106 |
-
elif self.format == 'speaker':
|
107 |
-
return len(self.speaker_keys)
|
108 |
-
else:
|
109 |
-
raise NotImplementedError()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
speaker/model.py DELETED
@@ -1,191 +0,0 @@
-from torch import nn
-import numpy as np
-import torch
-from torch.nn.utils import clip_grad_norm_
-
-class SpeakerEncoder(nn.Module):
-    """Learn a speaker representation from speech utterances of arbitrary length."""
-    def __init__(self, device, loss_device):
-        super().__init__()
-        self.loss_device = loss_device
-
-        # LSTM block consisting of 3 layers
-        # takes 80-channel log-mel spectrograms as input, projected to 256 dimensions
-        self.lstm = nn.LSTM(
-            input_size=80,
-            hidden_size=256,
-            num_layers=3,
-            batch_first=True,
-            dropout=0,
-            bidirectional=False
-        ).to(device)
-
-        self.linear = nn.Linear(in_features=256, out_features=256).to(device)
-        self.relu = nn.ReLU().to(device)
-        # epsilon term for numerical stability (i.e. division by 0)
-        self.epsilon = 1e-5
-
-        # Cosine similarity weights
-        self.sim_weight = nn.Parameter(torch.tensor([5.])).to(loss_device)
-        self.sim_bias = nn.Parameter(torch.tensor([-1.])).to(loss_device)
-
-    def forward(self, utterances, h_init=None, c_init=None):
-        # implements section 2.1 of https://arxiv.org/pdf/1806.04558.pdf
-        if h_init is None or c_init is None:
-            out, (hidden, cell) = self.lstm(utterances)
-        else:
-            out, (hidden, cell) = self.lstm(utterances, (h_init, c_init))
-
-        # compute the speaker embedding from the hidden state of the final layer
-        final_hidden = hidden[-1]
-        speaker_embedding = self.relu(self.linear(final_hidden))
-
-        # L2-normalize the speaker embedding
-        speaker_embedding = speaker_embedding / (torch.norm(speaker_embedding, dim=1, keepdim=True) + self.epsilon)
-        return speaker_embedding
-
-    def gradient_clipping(self):
-        self.sim_weight.grad *= 0.01
-        self.sim_bias.grad *= 0.01
-
-        # PyTorch clips gradients if the norm is greater than max_norm
-        clip_grad_norm_(self.parameters(), max_norm=3, norm_type=2)
-
-    def similarity_matrix(self, embeds, debug=False):
-        # calculate s_ji,k from section 2.1 of the GE2E paper
-        # output matrix is the cosine similarity between each utterance and the centroid of each speaker
-        # embeds input size: (speakers, utterances, embedding size)
-
-        # Speaker centroids
-        # Equal to the average of the speaker's utterance embeddings
-        # Used for negative examples (utterance compared against a false speaker)
-        # Equation 1 in the paper
-        # size: (speakers, 1, embedding size)
-        speaker_centroid = torch.mean(embeds, dim=1, keepdim=True)
-
-        # Utterance-exclusive centroids
-        # Equal to the average of the speaker's utterance embeddings, excluding the ith utterance
-        # Used for positive examples (utterance compared against its true speaker; the centroid excludes the utterance)
-        # Equation 8 in the paper
-        # size: (speakers, utterances, embedding size)
-        num_utterance = embeds.shape[1]
-        utter_ex_centroid = (torch.sum(embeds, dim=1, keepdim=True) - embeds) / (num_utterance - 1)
-
-        if debug:
-            print("e", embeds.shape)
-            print(embeds)
-            print("sc", speaker_centroid.shape)
-            print(speaker_centroid)
-            print("uc", utter_ex_centroid.shape)
-            print(utter_ex_centroid)
-
-        # Create positive and negative masks
-        num_speaker = embeds.shape[0]
-        i = torch.eye(num_speaker, dtype=torch.int)
-        pos_mask = torch.where(i)
-        neg_mask = torch.where(1 - i)
-
-        if debug:
-            print("pm", len(pos_mask), len(pos_mask[0]))
-            print(pos_mask)
-            print("nm", len(neg_mask), len(neg_mask[0]))
-            print(neg_mask)
-
-        # Compile the similarity matrix
-        # size: (speakers, utterances, speakers)
-        # initial size is (speakers, speakers, utterances) for easier vectorization
-        sim_matrix = torch.zeros(num_speaker, num_speaker, num_utterance).to(self.loss_device)
-        sim_matrix[pos_mask] = nn.functional.cosine_similarity(embeds, utter_ex_centroid, dim=2)
-        sim_matrix[neg_mask] = nn.functional.cosine_similarity(embeds[neg_mask[0]], speaker_centroid[neg_mask[1]], dim=2)
-        if debug:
-            print("sm", sim_matrix.shape)
-            print("pos vals", sim_matrix[pos_mask])
-            print("neg vals", sim_matrix[neg_mask])
-            print(sim_matrix)
-
-        sim_matrix = sim_matrix.permute(0, 2, 1)
-
-        if debug:
-            print("sm", sim_matrix.shape)
-            print(sim_matrix)
-            print("cos sim weight", self.sim_weight)
-            print("cos sim bias", self.sim_bias)
-
-        # Apply weight / bias
-        sim_matrix = sim_matrix * self.sim_weight + self.sim_bias
-        return sim_matrix
-
-    def softmax_loss(self, embeds):
-        """
-        computes the softmax loss as defined by eq. 6 in the GE2E paper
-        :param embeds: shape (speakers, utterances, embedding size)
-        :return: computed softmax loss
-        """
-        # per the GE2E paper, the softmax loss of eq. 6 performs slightly better
-        # on Text-Independent Speaker Verification tasks.
-        # ref section 2.1 of the GE2E paper
-        speaker_count = embeds.shape[0]
-
-        # (speaker, utterance, speaker)
-        similarities = self.similarity_matrix(embeds)
-
-        # eq. 6
-        loss_matrix = -similarities[torch.arange(0, speaker_count), :, torch.arange(0, speaker_count)] + \
-                      torch.log(torch.sum(torch.exp(similarities), dim=2))
-
-        # eq. 10
-        return torch.sum(loss_matrix)
-
-    def contrast_loss(self, embeds):
-        """
-        computes the contrast loss as defined by eq. 7 in the GE2E paper
-        :param embeds: shape (speakers, utterances, embedding size)
-        :return: computed contrast loss
-        """
-        # per the GE2E paper, the contrast loss of eq. 7 performs slightly better
-        # on Text-Dependent Speaker Verification tasks.
-        # ref section 2.1 of the GE2E paper
-        speaker_count, utterance_count = embeds.shape[0:2]
-
-        # (speaker, utterance, speaker)
-        similarities = self.similarity_matrix(embeds)
-
-        # Janky indexing to enforce k != j
-        mask = torch.ones(similarities.shape, dtype=torch.bool)
-        mask[torch.arange(speaker_count), :, torch.arange(speaker_count)] = False
-        closest_neighbors, _ = torch.max(similarities[mask].reshape(speaker_count, utterance_count, speaker_count - 1), dim=2)
-
-        # Positive influence of matching embeddings
-        matching_embedding = similarities[torch.arange(0, speaker_count), :, torch.arange(0, speaker_count)]
-
-        # eq. 7
-        loss_matrix = 1 - torch.sigmoid(matching_embedding) + torch.sigmoid(closest_neighbors)
-
-        # eq. 10
-        return torch.sum(loss_matrix)
-
-    def accuracy(self, embeds):
-        """
-        computes argmax accuracy
-        :param embeds: shape (speakers, utterances, embedding size)
-        :return: accuracy
-        """
-        num_speaker, num_utter = embeds.shape[:2]
-
-        similarities = self.similarity_matrix(embeds)
-        preds = torch.argmax(similarities, dim=2)
-        preds_one_hot = torch.nn.functional.one_hot(preds, num_classes=num_speaker)
-
-        actual = torch.arange(num_speaker).unsqueeze(1).repeat(1, num_utter)
-        actual_one_hot = torch.nn.functional.one_hot(actual, num_classes=num_speaker)
-
-        return torch.sum(preds_one_hot * actual_one_hot) / (num_speaker * num_utter)
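To make the deleted encoder easier to follow, here is a minimal, hypothetical training-step sketch against the SpeakerEncoder class above, as it existed at speaker/model.py before this change. The batch shapes, optimizer, and learning rate are assumptions for illustration only, not values from this repository.

import torch
from speaker.model import SpeakerEncoder  # module path prior to this deletion

# Assumed toy dimensions: 4 speakers x 5 utterances of 160 mel frames (80 bins each)
speakers, utterances, frames, n_mels = 4, 5, 160, 80
device = loss_device = torch.device("cpu")

model = SpeakerEncoder(device, loss_device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

# Flatten speakers x utterances into the LSTM batch dimension
mels = torch.randn(speakers * utterances, frames, n_mels)
embeds = model(mels)                            # (speakers * utterances, 256)
embeds = embeds.view(speakers, utterances, -1)  # (speakers, utterances, 256)

loss = model.softmax_loss(embeds)               # GE2E softmax loss (eq. 6 / eq. 10)
accuracy = model.accuracy(embeds)               # argmax accuracy over the similarity matrix

optimizer.zero_grad()
loss.backward()
model.gradient_clipping()                       # scales sim weight/bias grads, clips the global norm
optimizer.step()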
speaker/preprocess.py DELETED
@@ -1 +0,0 @@
-# Reference https://github.com/CorentinJ/Real-Time-Voice-Cloning/blob/0713f860a3dd41afb56e83cff84dbdf589d5e11a/encoder/preprocess.py#L16

speaker/saved_model.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6ccc0abcd0fb77104be73e6675454a06e7797bf1d4a1177181c32b648e9d75a9
-size 5697243

speaker/saved_model_e175.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:52ba80266b9f45fc3d825942aae40858eeaaa73994ba86e9ed017a533dc13323
-size 5861083

speaker/saved_models/dog.txt DELETED
File without changes

speaker/saved_models/saved_model_e175.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:52ba80266b9f45fc3d825942aae40858eeaaa73994ba86e9ed017a533dc13323
-size 5861083

speaker/saved_models/saved_model_e273_LargeBatch.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fbaaaa28a7d58b1316f322e1f33a5a68c00046b7b89a823ae7d987a632b8c7d6
-size 5861083

speaker/saved_models/saved_model_e300.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d9be127fb61b6d2306ff877ab2184f187450953a5555a6751b3616b5ed84e78a
-size 5698805
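The .pt entries above are Git LFS pointers to saved encoder checkpoints, so their contents are not visible in this diff. Assuming they hold a plain state_dict for the deleted SpeakerEncoder (this is an assumption; the serialization format is not shown here), loading one could look roughly like the sketch below.

import torch
from speaker.model import SpeakerEncoder  # module path prior to this deletion

device = loss_device = torch.device("cpu")
model = SpeakerEncoder(device, loss_device)

# Hypothetical path; the checkpoint format (bare state_dict vs. wrapped dict
# vs. fully pickled module) is an assumption and may need adjusting.
checkpoint = torch.load("speaker/saved_models/saved_model_e300.pt", map_location=device)
if isinstance(checkpoint, dict):
    model.load_state_dict(checkpoint, strict=False)
else:
    model = checkpoint  # a fully pickled module was saved
model.eval()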