This view is limited to 50 files because it contains too many changes. See the raw diff here.
Files changed (50)
  1. app.py +31 -38
  2. distributed.py +173 -0
  3. encoder/audio.py +0 -117
  4. encoder/config.py +0 -45
  5. encoder/data_objects/__init__.py +0 -2
  6. encoder/data_objects/random_cycler.py +0 -37
  7. encoder/data_objects/speaker.py +0 -40
  8. encoder/data_objects/speaker_batch.py +0 -13
  9. encoder/data_objects/speaker_verification_dataset.py +0 -56
  10. encoder/data_objects/utterance.py +0 -26
  11. encoder/inference.py +0 -178
  12. encoder/model.py +0 -135
  13. encoder/params_data.py +0 -29
  14. encoder/params_model.py +0 -11
  15. encoder/preprocess.py +0 -184
  16. encoder/train.py +0 -125
  17. encoder/visualizations.py +0 -179
  18. encoderCoren.pt +0 -3
  19. {hifigan → hifi-gan}/LICENSE +0 -0
  20. hifi-gan/README.md +105 -0
  21. diagrams/apple.txt → hifi-gan/apple.py +0 -0
  22. {hifigan → hifi-gan}/env.py +0 -0
  23. {hifigan → hifi-gan}/inference.py +3 -3
  24. {hifigan → hifi-gan}/inference_e2e.py +51 -37
  25. {hifigan → hifi-gan}/meldataset.py +0 -0
  26. {hifigan → hifi-gan}/models.py +1 -1
  27. hifi-gan/requirements.txt +7 -0
  28. {hifigan → hifi-gan}/train.py +4 -4
  29. hifigan/hifigan_utils.py → hifi-gan/utils.py +0 -0
  30. hparams.py +0 -1
  31. kaggle_12000.pt +0 -3
  32. logger.py +48 -0
  33. logic.py +41 -79
  34. loss_function.py +19 -0
  35. loss_scaler.py +131 -0
  36. model.py +12 -16
  37. multiproc.py +23 -0
  38. requirements.txt +8 -7
  39. saved_model.pt +0 -3
  40. speaker/__init__.py +0 -0
  41. speaker/bana.txt +0 -0
  42. speaker/data.py +0 -109
  43. speaker/model.py +0 -191
  44. speaker/preprocess.py +0 -1
  45. speaker/saved_model.pt +0 -3
  46. speaker/saved_model_e175.pt +0 -3
  47. speaker/saved_models/dog.txt +0 -0
  48. speaker/saved_models/saved_model_e175.pt +0 -3
  49. speaker/saved_models/saved_model_e273_LargeBatch.pt +0 -3
  50. speaker/saved_models/saved_model_e300.pt +0 -3
app.py CHANGED
@@ -3,70 +3,63 @@ from fastapi.responses import JSONResponse
  from fastapi.middleware.cors import CORSMiddleware
  from logic import synthesize_voice, plot_data, plot_waveforms
  import base64
- import sys
- import numpy as np
- from io import BytesIO
- from hifigan.inference_e2e import hifi_gan_inference
+ from typing import Dict

  app = FastAPI()

- @app.get("/")
- def read_root():
-     data = {"Voice": "Cloning", "Status": "Success"}
-     return JSONResponse(content=data)

+ # You need to replace the placeholders above with the actual URLs for the models.
+
+ # Allow requests from your Vercel domain
+ origins = [
+     "https://host-test-smoky.vercel.app",
+     # Add other allowed origins if needed
+ ]
+
+ # Set up CORS middleware
  app.add_middleware(
      CORSMiddleware,
-     allow_origins=["*"],
+     allow_origins=origins,
      allow_credentials=True,
      allow_methods=["*"],
      allow_headers=["*"],
  )

- hugging_face_api_url = "https://huggingface.co/spaces/lord-reso/host/synthesize"
+ @app.post("/synthesize", response_model=Dict[str, str])
+ async def synthesize(request_data: Dict[str, str]):
+     font_type = request_data['font_select']
+     input_text = request_data['input_text']

- @app.post("/synthesize")
- async def synthesize(request: Request):
-     print("call successful")
-
-     json = await request.json()
-     print(json)
-
-     font_type = json['font_select']
-     input_text = json['input_text']
+     # Font selection logic (customize based on your requirements)
+     if font_type == 'Preeti':
+         # Implement Preeti font logic
+         pass
+     elif font_type == 'Unicode':
+         # Implement Unicode font logic
+         pass

-     print("generating mel-spectrogram")
      # Generate mel-spectrogram using Tacotron2
-     # mel_output_data, mel_output_postnet_data, alignments_data = synthesize_voice(input_text, "Shruti_finetuned.pt")
-     mel_output_data, mel_output_postnet_data, alignments_data = synthesize_voice(input_text, "kaggle_12000.pt")
-     print("mel generation successful")
-
+     mel_output_data, mel_output_postnet_data, alignments_data = synthesize_voice(input_text, "Shruti_finetuned")
+
      # Convert mel-spectrogram to base64 for display in HTML
      mel_output_base64 = plot_data([mel_output_data, mel_output_postnet_data, alignments_data])

-     # Audio Synthesis begins
-     print("Starting audio synthesis")
-     buffer = BytesIO()
-     np.save(buffer, mel_output_data)
-     input_mel = buffer.getvalue()
-
-     hifigan_checkpoint = "generator_v1"
-
-     # Generate audio using Hifigan
-     audio_data = hifi_gan_inference(input_mel, hifigan_checkpoint)
+     # Save the generated audio file
+     audio_file_path = 'audio_output/mel1_generated_e2e.wav'

-     print("Creating time-domain waveform")
      # Plot the waveform
-     wave_base64 = plot_waveforms(audio_data)
+     wave_base64 = plot_waveforms(audio_file_path)

      # Encode audio content as Base64
-     audio_base64 = base64.b64encode(audio_data).decode('utf-8')
+     with open(audio_file_path, 'rb') as audio_file:
+         audio_base64 = base64.b64encode(audio_file.read()).decode('utf-8')

      # Customize the response based on the information you want to send to the frontend
      response_data = {
          'mel_spectrogram': mel_output_base64,
          'audio_data': audio_base64,
          'waveform': wave_base64,
+         'some_other_data': 'example_value',
      }

-     return JSONResponse(content=response_data)
+     return JSONResponse(content=response_data)
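
Reviewer note: a minimal client sketch for exercising the reworked /synthesize endpoint is below. It assumes a locally running instance of this FastAPI app (the base URL and output file name are placeholders) and uses only the request fields and base64-encoded response keys visible in the diff above.

import base64
import requests

BASE_URL = "http://localhost:8000"  # placeholder; substitute the actual deployment URL

payload = {"font_select": "Unicode", "input_text": "namaste"}
resp = requests.post(f"{BASE_URL}/synthesize", json=payload, timeout=300)
resp.raise_for_status()

data = resp.json()
print(sorted(data))  # ['audio_data', 'mel_spectrogram', 'some_other_data', 'waveform']

# The audio comes back base64-encoded; decode it to a playable file.
with open("synthesized.wav", "wb") as f:
    f.write(base64.b64decode(data["audio_data"]))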
distributed.py ADDED
@@ -0,0 +1,173 @@
+ import torch
+ import torch.distributed as dist
+ from torch.nn.modules import Module
+ from torch.autograd import Variable
+
+ def _flatten_dense_tensors(tensors):
+     """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
+     same dense type.
+     Since inputs are dense, the resulting tensor will be a concatenated 1D
+     buffer. Element-wise operation on this buffer will be equivalent to
+     operating individually.
+     Arguments:
+         tensors (Iterable[Tensor]): dense tensors to flatten.
+     Returns:
+         A contiguous 1D buffer containing input tensors.
+     """
+     if len(tensors) == 1:
+         return tensors[0].contiguous().view(-1)
+     flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
+     return flat
+
+ def _unflatten_dense_tensors(flat, tensors):
+     """View a flat buffer using the sizes of tensors. Assume that tensors are of
+     same dense type, and that flat is given by _flatten_dense_tensors.
+     Arguments:
+         flat (Tensor): flattened dense tensors to unflatten.
+         tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
+             unflatten flat.
+     Returns:
+         Unflattened dense tensors with sizes same as tensors and values from
+         flat.
+     """
+     outputs = []
+     offset = 0
+     for tensor in tensors:
+         numel = tensor.numel()
+         outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
+         offset += numel
+     return tuple(outputs)
+
+
+ '''
+ This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py
+ launcher included with this example. It assumes that your run is using multiprocess with 1
+ GPU/process, that the model is on the correct device, and that torch.set_device has been
+ used to set the device.
+
+ Parameters are broadcasted to the other processes on initialization of DistributedDataParallel,
+ and will be allreduced at the finish of the backward pass.
+ '''
+ class DistributedDataParallel(Module):
+
+     def __init__(self, module):
+         super(DistributedDataParallel, self).__init__()
+         #fallback for PyTorch 0.3
+         if not hasattr(dist, '_backend'):
+             self.warn_on_half = True
+         else:
+             self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
+
+         self.module = module
+
+         for p in self.module.state_dict().values():
+             if not torch.is_tensor(p):
+                 continue
+             dist.broadcast(p, 0)
+
+         def allreduce_params():
+             if(self.needs_reduction):
+                 self.needs_reduction = False
+                 buckets = {}
+                 for param in self.module.parameters():
+                     if param.requires_grad and param.grad is not None:
+                         tp = type(param.data)
+                         if tp not in buckets:
+                             buckets[tp] = []
+                         buckets[tp].append(param)
+                 if self.warn_on_half:
+                     if torch.cuda.HalfTensor in buckets:
+                         print("WARNING: gloo dist backend for half parameters may be extremely slow." +
+                               " It is recommended to use the NCCL backend in this case. This currently requires" +
+                               "PyTorch built from top of tree master.")
+                         self.warn_on_half = False
+
+                 for tp in buckets:
+                     bucket = buckets[tp]
+                     grads = [param.grad.data for param in bucket]
+                     coalesced = _flatten_dense_tensors(grads)
+                     dist.all_reduce(coalesced)
+                     coalesced /= dist.get_world_size()
+                     for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
+                         buf.copy_(synced)
+
+         for param in list(self.module.parameters()):
+             def allreduce_hook(*unused):
+                 param._execution_engine.queue_callback(allreduce_params)
+             if param.requires_grad:
+                 param.register_hook(allreduce_hook)
+
+     def forward(self, *inputs, **kwargs):
+         self.needs_reduction = True
+         return self.module(*inputs, **kwargs)
+
+     '''
+     def _sync_buffers(self):
+         buffers = list(self.module._all_buffers())
+         if len(buffers) > 0:
+             # cross-node buffer sync
+             flat_buffers = _flatten_dense_tensors(buffers)
+             dist.broadcast(flat_buffers, 0)
+             for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
+                 buf.copy_(synced)
+     def train(self, mode=True):
+         # Clear NCCL communicator and CUDA event cache of the default group ID,
+         # These cache will be recreated at the later call. This is currently a
+         # work-around for a potential NCCL deadlock.
+         if dist._backend == dist.dist_backend.NCCL:
+             dist._clear_group_cache()
+         super(DistributedDataParallel, self).train(mode)
+         self.module.train(mode)
+     '''
+ '''
+ Modifies existing model to do gradient allreduce, but doesn't change class
+ so you don't need "module"
+ '''
+ def apply_gradient_allreduce(module):
+     if not hasattr(dist, '_backend'):
+         module.warn_on_half = True
+     else:
+         module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
+
+     for p in module.state_dict().values():
+         if not torch.is_tensor(p):
+             continue
+         dist.broadcast(p, 0)
+
+     def allreduce_params():
+         if(module.needs_reduction):
+             module.needs_reduction = False
+             buckets = {}
+             for param in module.parameters():
+                 if param.requires_grad and param.grad is not None:
+                     tp = param.data.dtype
+                     if tp not in buckets:
+                         buckets[tp] = []
+                     buckets[tp].append(param)
+             if module.warn_on_half:
+                 if torch.cuda.HalfTensor in buckets:
+                     print("WARNING: gloo dist backend for half parameters may be extremely slow." +
+                           " It is recommended to use the NCCL backend in this case. This currently requires" +
+                           "PyTorch built from top of tree master.")
+                     module.warn_on_half = False
+
+             for tp in buckets:
+                 bucket = buckets[tp]
+                 grads = [param.grad.data for param in bucket]
+                 coalesced = _flatten_dense_tensors(grads)
+                 dist.all_reduce(coalesced)
+                 coalesced /= dist.get_world_size()
+                 for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
+                     buf.copy_(synced)
+
+     for param in list(module.parameters()):
+         def allreduce_hook(*unused):
+             Variable._execution_engine.queue_callback(allreduce_params)
+         if param.requires_grad:
+             param.register_hook(allreduce_hook)
+
+     def set_needs_reduction(self, input, output):
+         self.needs_reduction = True
+
+     module.register_forward_hook(set_needs_reduction)
+     return module
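
A rough usage sketch for the helpers above, assuming the one-process-per-GPU launch described in the module docstring (for example via the multiproc.py launcher added in this PR). The tiny linear model, the LOCAL_RANK environment variable, and the random training data are placeholders, not part of this PR.

import os
import torch
import torch.distributed as dist
from distributed import apply_gradient_allreduce

local_rank = int(os.environ.get("LOCAL_RANK", "0"))  # assumption: set by the launcher
torch.cuda.set_device(local_rank)
dist.init_process_group(backend="nccl", init_method="env://")  # assumption: env:// rendezvous

model = torch.nn.Linear(80, 80).cuda()     # stand-in for the real acoustic model
model = apply_gradient_allreduce(model)    # broadcasts params, registers all-reduce hooks
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

for _ in range(10):
    batch = torch.randn(16, 80, device="cuda")  # placeholder batch
    optimizer.zero_grad()
    loss = model(batch).pow(2).mean()
    loss.backward()   # the registered hooks queue allreduce_params() here
    optimizer.step()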
encoder/audio.py DELETED
@@ -1,117 +0,0 @@
1
- from scipy.ndimage.morphology import binary_dilation
2
- from encoder.params_data import *
3
- from pathlib import Path
4
- from typing import Optional, Union
5
- from warnings import warn
6
- import numpy as np
7
- import librosa
8
- import struct
9
-
10
- try:
11
- import webrtcvad
12
- except:
13
- warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
14
- webrtcvad=None
15
-
16
- int16_max = (2 ** 15) - 1
17
-
18
-
19
- def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
20
- source_sr: Optional[int] = None,
21
- normalize: Optional[bool] = True,
22
- trim_silence: Optional[bool] = True):
23
- """
24
- Applies the preprocessing operations used in training the Speaker Encoder to a waveform
25
- either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
26
-
27
- :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
28
- just .wav), either the waveform as a numpy array of floats.
29
- :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
30
- preprocessing. After preprocessing, the waveform's sampling rate will match the data
31
- hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
32
- this argument will be ignored.
33
- """
34
- # Load the wav from disk if needed
35
- if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
36
- wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
37
- else:
38
- wav = fpath_or_wav
39
-
40
- # Resample the wav if needed
41
- if source_sr is not None and source_sr != sampling_rate:
42
- wav = librosa.resample(wav, source_sr, sampling_rate)
43
-
44
- # Apply the preprocessing: normalize volume and shorten long silences
45
- if normalize:
46
- wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
47
- if webrtcvad and trim_silence:
48
- wav = trim_long_silences(wav)
49
-
50
- return wav
51
-
52
-
53
- def wav_to_mel_spectrogram(wav):
54
- """
55
- Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
56
- Note: this not a log-mel spectrogram.
57
- """
58
- frames = librosa.feature.melspectrogram(
59
- wav,
60
- sampling_rate,
61
- n_fft=int(sampling_rate * mel_window_length / 1000),
62
- hop_length=int(sampling_rate * mel_window_step / 1000),
63
- n_mels=mel_n_channels
64
- )
65
- return frames.astype(np.float32).T
66
-
67
-
68
- def trim_long_silences(wav):
69
- """
70
- Ensures that segments without voice in the waveform remain no longer than a
71
- threshold determined by the VAD parameters in params.py.
72
-
73
- :param wav: the raw waveform as a numpy array of floats
74
- :return: the same waveform with silences trimmed away (length <= original wav length)
75
- """
76
- # Compute the voice detection window size
77
- samples_per_window = (vad_window_length * sampling_rate) // 1000
78
-
79
- # Trim the end of the audio to have a multiple of the window size
80
- wav = wav[:len(wav) - (len(wav) % samples_per_window)]
81
-
82
- # Convert the float waveform to 16-bit mono PCM
83
- pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
84
-
85
- # Perform voice activation detection
86
- voice_flags = []
87
- vad = webrtcvad.Vad(mode=3)
88
- for window_start in range(0, len(wav), samples_per_window):
89
- window_end = window_start + samples_per_window
90
- voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
91
- sample_rate=sampling_rate))
92
- voice_flags = np.array(voice_flags)
93
-
94
- # Smooth the voice detection with a moving average
95
- def moving_average(array, width):
96
- array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
97
- ret = np.cumsum(array_padded, dtype=float)
98
- ret[width:] = ret[width:] - ret[:-width]
99
- return ret[width - 1:] / width
100
-
101
- audio_mask = moving_average(voice_flags, vad_moving_average_width)
102
- audio_mask = np.round(audio_mask).astype(np.bool)
103
-
104
- # Dilate the voiced regions
105
- audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
106
- audio_mask = np.repeat(audio_mask, samples_per_window)
107
-
108
- return wav[audio_mask == True]
109
-
110
-
111
- def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
112
- if increase_only and decrease_only:
113
- raise ValueError("Both increase only and decrease only are set")
114
- dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
115
- if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
116
- return wav
117
- return wav * (10 ** (dBFS_change / 20))
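
For reference while reviewing the deletion: a sketch of the mel-spectrogram settings the removed module used, reproduced with the parameters from encoder/params_data.py (also deleted below); the input file name is a placeholder.

import librosa
import numpy as np

sampling_rate = 16000      # values from the deleted encoder/params_data.py
mel_window_length = 25     # ms
mel_window_step = 10       # ms
mel_n_channels = 40

wav, _ = librosa.load("some_utterance.wav", sr=sampling_rate)  # placeholder path
frames = librosa.feature.melspectrogram(
    y=wav,
    sr=sampling_rate,
    n_fft=int(sampling_rate * mel_window_length / 1000),     # 400-sample window
    hop_length=int(sampling_rate * mel_window_step / 1000),  # 160-sample hop
    n_mels=mel_n_channels,
).astype(np.float32).T
print(frames.shape)  # (n_frames, 40), frames-first as wav_to_mel_spectrogram returned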
encoder/config.py DELETED
@@ -1,45 +0,0 @@
1
- librispeech_datasets = {
2
- "train": {
3
- "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
4
- "other": ["LibriSpeech/train-other-500"]
5
- },
6
- "test": {
7
- "clean": ["LibriSpeech/test-clean"],
8
- "other": ["LibriSpeech/test-other"]
9
- },
10
- "dev": {
11
- "clean": ["LibriSpeech/dev-clean"],
12
- "other": ["LibriSpeech/dev-other"]
13
- },
14
- }
15
- libritts_datasets = {
16
- "train": {
17
- "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
18
- "other": ["LibriTTS/train-other-500"]
19
- },
20
- "test": {
21
- "clean": ["LibriTTS/test-clean"],
22
- "other": ["LibriTTS/test-other"]
23
- },
24
- "dev": {
25
- "clean": ["LibriTTS/dev-clean"],
26
- "other": ["LibriTTS/dev-other"]
27
- },
28
- }
29
- voxceleb_datasets = {
30
- "voxceleb1" : {
31
- "train": ["VoxCeleb1/wav"],
32
- "test": ["VoxCeleb1/test_wav"]
33
- },
34
- "voxceleb2" : {
35
- "train": ["VoxCeleb2/dev/aac"],
36
- "test": ["VoxCeleb2/test_wav"]
37
- }
38
- }
39
-
40
- other_datasets = [
41
- "LJSpeech-1.1",
42
- "VCTK-Corpus/wav48",
43
- ]
44
-
45
- anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
encoder/data_objects/__init__.py DELETED
@@ -1,2 +0,0 @@
1
- from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
2
- from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
encoder/data_objects/random_cycler.py DELETED
@@ -1,37 +0,0 @@
1
- import random
2
-
3
- class RandomCycler:
4
- """
5
- Creates an internal copy of a sequence and allows access to its items in a constrained random
6
- order. For a source sequence of n items and one or several consecutive queries of a total
7
- of m items, the following guarantees hold (one implies the other):
8
- - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
9
- - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
10
- """
11
-
12
- def __init__(self, source):
13
- if len(source) == 0:
14
- raise Exception("Can't create RandomCycler from an empty collection")
15
- self.all_items = list(source)
16
- self.next_items = []
17
-
18
- def sample(self, count: int):
19
- shuffle = lambda l: random.sample(l, len(l))
20
-
21
- out = []
22
- while count > 0:
23
- if count >= len(self.all_items):
24
- out.extend(shuffle(list(self.all_items)))
25
- count -= len(self.all_items)
26
- continue
27
- n = min(count, len(self.next_items))
28
- out.extend(self.next_items[:n])
29
- count -= n
30
- self.next_items = self.next_items[n:]
31
- if len(self.next_items) == 0:
32
- self.next_items = shuffle(list(self.all_items))
33
- return out
34
-
35
- def __next__(self):
36
- return self.sample(1)[0]
37
-
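
A quick illustration of the sampling guarantee stated in the removed docstring (each of n items appears between m // n and ((m - 1) // n) + 1 times over m draws); the import path is the one being deleted here.

from collections import Counter
from encoder.data_objects.random_cycler import RandomCycler

cycler = RandomCycler("abcd")  # n = 4 items
draws = cycler.sample(10)      # m = 10 consecutive draws
print(Counter(draws))          # each item shows up 2 or 3 times (10 // 4 and 9 // 4 + 1)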
encoder/data_objects/speaker.py DELETED
@@ -1,40 +0,0 @@
1
- from encoder.data_objects.random_cycler import RandomCycler
2
- from encoder.data_objects.utterance import Utterance
3
- from pathlib import Path
4
-
5
- # Contains the set of utterances of a single speaker
6
- class Speaker:
7
- def __init__(self, root: Path):
8
- self.root = root
9
- self.name = root.name
10
- self.utterances = None
11
- self.utterance_cycler = None
12
-
13
- def _load_utterances(self):
14
- with self.root.joinpath("_sources.txt").open("r") as sources_file:
15
- sources = [l.split(",") for l in sources_file]
16
- sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
17
- self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
18
- self.utterance_cycler = RandomCycler(self.utterances)
19
-
20
- def random_partial(self, count, n_frames):
21
- """
22
- Samples a batch of <count> unique partial utterances from the disk in a way that all
23
- utterances come up at least once every two cycles and in a random order every time.
24
-
25
- :param count: The number of partial utterances to sample from the set of utterances from
26
- that speaker. Utterances are guaranteed not to be repeated if <count> is not larger than
27
- the number of utterances available.
28
- :param n_frames: The number of frames in the partial utterance.
29
- :return: A list of tuples (utterance, frames, range) where utterance is an Utterance,
30
- frames are the frames of the partial utterances and range is the range of the partial
31
- utterance with regard to the complete utterance.
32
- """
33
- if self.utterances is None:
34
- self._load_utterances()
35
-
36
- utterances = self.utterance_cycler.sample(count)
37
-
38
- a = [(u,) + u.random_partial(n_frames) for u in utterances]
39
-
40
- return a
encoder/data_objects/speaker_batch.py DELETED
@@ -1,13 +0,0 @@
1
- import numpy as np
2
- from typing import List
3
- from encoder.data_objects.speaker import Speaker
4
-
5
-
6
- class SpeakerBatch:
7
- def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
8
- self.speakers = speakers
9
- self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}
10
-
11
- # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
12
- # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
13
- self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])
encoder/data_objects/speaker_verification_dataset.py DELETED
@@ -1,56 +0,0 @@
1
- from encoder.data_objects.random_cycler import RandomCycler
2
- from encoder.data_objects.speaker_batch import SpeakerBatch
3
- from encoder.data_objects.speaker import Speaker
4
- from encoder.params_data import partials_n_frames
5
- from torch.utils.data import Dataset, DataLoader
6
- from pathlib import Path
7
-
8
- # TODO: improve with a pool of speakers for data efficiency
9
-
10
- class SpeakerVerificationDataset(Dataset):
11
- def __init__(self, datasets_root: Path):
12
- self.root = datasets_root
13
- speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
14
- if len(speaker_dirs) == 0:
15
- raise Exception("No speakers found. Make sure you are pointing to the directory "
16
- "containing all preprocessed speaker directories.")
17
- self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
18
- self.speaker_cycler = RandomCycler(self.speakers)
19
-
20
- def __len__(self):
21
- return int(1e10)
22
-
23
- def __getitem__(self, index):
24
- return next(self.speaker_cycler)
25
-
26
- def get_logs(self):
27
- log_string = ""
28
- for log_fpath in self.root.glob("*.txt"):
29
- with log_fpath.open("r") as log_file:
30
- log_string += "".join(log_file.readlines())
31
- return log_string
32
-
33
-
34
- class SpeakerVerificationDataLoader(DataLoader):
35
- def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
36
- batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
37
- worker_init_fn=None):
38
- self.utterances_per_speaker = utterances_per_speaker
39
-
40
- super().__init__(
41
- dataset=dataset,
42
- batch_size=speakers_per_batch,
43
- shuffle=False,
44
- sampler=sampler,
45
- batch_sampler=batch_sampler,
46
- num_workers=num_workers,
47
- collate_fn=self.collate,
48
- pin_memory=pin_memory,
49
- drop_last=False,
50
- timeout=timeout,
51
- worker_init_fn=worker_init_fn
52
- )
53
-
54
- def collate(self, speakers):
55
- return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
56
-
encoder/data_objects/utterance.py DELETED
@@ -1,26 +0,0 @@
1
- import numpy as np
2
-
3
-
4
- class Utterance:
5
- def __init__(self, frames_fpath, wave_fpath):
6
- self.frames_fpath = frames_fpath
7
- self.wave_fpath = wave_fpath
8
-
9
- def get_frames(self):
10
- return np.load(self.frames_fpath)
11
-
12
- def random_partial(self, n_frames):
13
- """
14
- Crops the frames into a partial utterance of n_frames
15
-
16
- :param n_frames: The number of frames of the partial utterance
17
- :return: the partial utterance frames and a tuple indicating the start and end of the
18
- partial utterance in the complete utterance.
19
- """
20
- frames = self.get_frames()
21
- if frames.shape[0] == n_frames:
22
- start = 0
23
- else:
24
- start = np.random.randint(0, frames.shape[0] - n_frames)
25
- end = start + n_frames
26
- return frames[start:end], (start, end)
encoder/inference.py DELETED
@@ -1,178 +0,0 @@
1
- from encoder.params_data import *
2
- from encoder.model import SpeakerEncoder
3
- from encoder.audio import preprocess_wav # We want to expose this function from here
4
- from matplotlib import cm
5
- from encoder import audio
6
- from pathlib import Path
7
- import numpy as np
8
- import torch
9
-
10
- _model = None # type: SpeakerEncoder
11
- _device = None # type: torch.device
12
-
13
-
14
- def load_model(weights_fpath: Path, device=None):
15
- """
16
- Loads the model in memory. If this function is not explicitely called, it will be run on the
17
- first call to embed_frames() with the default weights file.
18
-
19
- :param weights_fpath: the path to saved model weights.
20
- :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
21
- model will be loaded and will run on this device. Outputs will however always be on the cpu.
22
- If None, will default to your GPU if it"s available, otherwise your CPU.
23
- """
24
- # TODO: I think the slow loading of the encoder might have something to do with the device it
25
- # was saved on. Worth investigating.
26
- global _model, _device
27
- if device is None:
28
- _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
29
- elif isinstance(device, str):
30
- _device = torch.device(device)
31
- _model = SpeakerEncoder(_device, torch.device("cpu"))
32
- checkpoint = torch.load(weights_fpath, _device)
33
- _model.load_state_dict(checkpoint["model_state"])
34
- _model.eval()
35
- print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
36
-
37
-
38
- def is_loaded():
39
- return _model is not None
40
-
41
-
42
- def embed_frames_batch(frames_batch):
43
- """
44
- Computes embeddings for a batch of mel spectrogram.
45
-
46
- :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape
47
- (batch_size, n_frames, n_channels)
48
- :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
49
- """
50
- if _model is None:
51
- raise Exception("Model was not loaded. Call load_model() before inference.")
52
-
53
- frames = torch.from_numpy(frames_batch).to(_device)
54
- embed = _model.forward(frames).detach().cpu().numpy()
55
- return embed
56
-
57
-
58
- def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
59
- min_pad_coverage=0.75, overlap=0.5):
60
- """
61
- Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
62
- partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
63
- spectrogram slices are returned, so as to make each partial utterance waveform correspond to
64
- its spectrogram. This function assumes that the mel spectrogram parameters used are those
65
- defined in params_data.py.
66
-
67
- The returned ranges may be indexing further than the length of the waveform. It is
68
- recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
69
-
70
- :param n_samples: the number of samples in the waveform
71
- :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
72
- utterance
73
- :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
74
- enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
75
- then the last partial utterance will be considered, as if we padded the audio. Otherwise,
76
- it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
77
- utterance, this parameter is ignored so that the function always returns at least 1 slice.
78
- :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
79
- utterances are entirely disjoint.
80
- :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
81
- respectively the waveform and the mel spectrogram with these slices to obtain the partial
82
- utterances.
83
- """
84
- assert 0 <= overlap < 1
85
- assert 0 < min_pad_coverage <= 1
86
-
87
- samples_per_frame = int((sampling_rate * mel_window_step / 1000))
88
- n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
89
- frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
90
-
91
- # Compute the slices
92
- wav_slices, mel_slices = [], []
93
- steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
94
- for i in range(0, steps, frame_step):
95
- mel_range = np.array([i, i + partial_utterance_n_frames])
96
- wav_range = mel_range * samples_per_frame
97
- mel_slices.append(slice(*mel_range))
98
- wav_slices.append(slice(*wav_range))
99
-
100
- # Evaluate whether extra padding is warranted or not
101
- last_wav_range = wav_slices[-1]
102
- coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
103
- if coverage < min_pad_coverage and len(mel_slices) > 1:
104
- mel_slices = mel_slices[:-1]
105
- wav_slices = wav_slices[:-1]
106
-
107
- return wav_slices, mel_slices
108
-
109
-
110
- def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
111
- """
112
- Computes an embedding for a single utterance.
113
-
114
- # TODO: handle multiple wavs to benefit from batching on GPU
115
- :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
116
- :param using_partials: if True, then the utterance is split in partial utterances of
117
- <partial_utterance_n_frames> frames and the utterance embedding is computed from their
118
- normalized average. If False, the utterance is instead computed from feeding the entire
119
- spectogram to the network.
120
- :param return_partials: if True, the partial embeddings will also be returned along with the
121
- wav slices that correspond to the partial embeddings.
122
- :param kwargs: additional arguments to compute_partial_splits()
123
- :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
124
- <return_partials> is True, the partial utterances as a numpy array of float32 of shape
125
- (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
126
- returned. If <using_partials> is simultaneously set to False, both these values will be None
127
- instead.
128
- """
129
- # Process the entire utterance if not using partials
130
- if not using_partials:
131
- frames = audio.wav_to_mel_spectrogram(wav)
132
- embed = embed_frames_batch(frames[None, ...])[0]
133
- if return_partials:
134
- return embed, None, None
135
- return embed
136
-
137
- # Compute where to split the utterance into partials and pad if necessary
138
- wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
139
- max_wave_length = wave_slices[-1].stop
140
- if max_wave_length >= len(wav):
141
- wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
142
-
143
- # Split the utterance into partials
144
- frames = audio.wav_to_mel_spectrogram(wav)
145
- frames_batch = np.array([frames[s] for s in mel_slices])
146
- partial_embeds = embed_frames_batch(frames_batch)
147
-
148
- # Compute the utterance embedding from the partial embeddings
149
- raw_embed = np.mean(partial_embeds, axis=0)
150
- embed = raw_embed / np.linalg.norm(raw_embed, 2)
151
-
152
- if return_partials:
153
- return embed, partial_embeds, wave_slices
154
- return embed
155
-
156
-
157
- def embed_speaker(wavs, **kwargs):
158
- raise NotImplemented()
159
-
160
-
161
- def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
162
- import matplotlib.pyplot as plt
163
- if ax is None:
164
- ax = plt.gca()
165
-
166
- if shape is None:
167
- height = int(np.sqrt(len(embed)))
168
- shape = (height, -1)
169
- embed = embed.reshape(shape)
170
-
171
- cmap = cm.get_cmap()
172
- mappable = ax.imshow(embed, cmap=cmap)
173
- cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
174
- sm = cm.ScalarMappable(cmap=cmap)
175
- sm.set_clim(*color_range)
176
-
177
- ax.set_xticks([]), ax.set_yticks([])
178
- ax.set_title(title)
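
A worked example of the slicing arithmetic in the removed compute_partial_slices(), using the deleted defaults (16 kHz audio, 10 ms hop, 160-frame partials, 50% overlap) on a hypothetical 3-second utterance.

import numpy as np

sampling_rate = 16000
mel_window_step = 10          # ms
partials_n_frames = 160
overlap = 0.5

samples_per_frame = int(sampling_rate * mel_window_step / 1000)        # 160 samples per frame
frame_step = max(int(np.round(partials_n_frames * (1 - overlap))), 1)  # 80 frames

n_samples = 3 * sampling_rate                                          # 3 s of audio
n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))           # 301 frames

steps = max(1, n_frames - partials_n_frames + frame_step + 1)          # 222
starts = list(range(0, steps, frame_step))                             # [0, 80, 160]
mel_slices = [(s, s + partials_n_frames) for s in starts]
print(mel_slices)  # [(0, 160), (80, 240), (160, 320)]; the last slice is padded rather than
                   # dropped because its waveform coverage (0.875) clears the 0.75 default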
encoder/model.py DELETED
@@ -1,135 +0,0 @@
1
- from encoder.params_model import *
2
- from encoder.params_data import *
3
- from scipy.interpolate import interp1d
4
- from sklearn.metrics import roc_curve
5
- from torch.nn.utils import clip_grad_norm_
6
- from scipy.optimize import brentq
7
- from torch import nn
8
- import numpy as np
9
- import torch
10
-
11
-
12
- class SpeakerEncoder(nn.Module):
13
- def __init__(self, device, loss_device):
14
- super().__init__()
15
- self.loss_device = loss_device
16
-
17
- # Network defition
18
- self.lstm = nn.LSTM(input_size=mel_n_channels,
19
- hidden_size=model_hidden_size,
20
- num_layers=model_num_layers,
21
- batch_first=True).to(device)
22
- self.linear = nn.Linear(in_features=model_hidden_size,
23
- out_features=model_embedding_size).to(device)
24
- self.relu = torch.nn.ReLU().to(device)
25
-
26
- # Cosine similarity scaling (with fixed initial parameter values)
27
- self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
28
- self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
29
-
30
- # Loss
31
- self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
32
-
33
- def do_gradient_ops(self):
34
- # Gradient scale
35
- self.similarity_weight.grad *= 0.01
36
- self.similarity_bias.grad *= 0.01
37
-
38
- # Gradient clipping
39
- clip_grad_norm_(self.parameters(), 3, norm_type=2)
40
-
41
- def forward(self, utterances, hidden_init=None):
42
- """
43
- Computes the embeddings of a batch of utterance spectrograms.
44
-
45
- :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
46
- (batch_size, n_frames, n_channels)
47
- :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
48
- batch_size, hidden_size). Will default to a tensor of zeros if None.
49
- :return: the embeddings as a tensor of shape (batch_size, embedding_size)
50
- """
51
- # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
52
- # and the final cell state.
53
- out, (hidden, cell) = self.lstm(utterances, hidden_init)
54
-
55
- # We take only the hidden state of the last layer
56
- embeds_raw = self.relu(self.linear(hidden[-1]))
57
-
58
- # L2-normalize it
59
- embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)
60
-
61
- return embeds
62
-
63
- def similarity_matrix(self, embeds):
64
- """
65
- Computes the similarity matrix according the section 2.1 of GE2E.
66
-
67
- :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
68
- utterances_per_speaker, embedding_size)
69
- :return: the similarity matrix as a tensor of shape (speakers_per_batch,
70
- utterances_per_speaker, speakers_per_batch)
71
- """
72
- speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
73
-
74
- # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
75
- centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
76
- centroids_incl = centroids_incl.clone() / (torch.norm(centroids_incl, dim=2, keepdim=True) + 1e-5)
77
-
78
- # Exclusive centroids (1 per utterance)
79
- centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
80
- centroids_excl /= (utterances_per_speaker - 1)
81
- centroids_excl = centroids_excl.clone() / (torch.norm(centroids_excl, dim=2, keepdim=True) + 1e-5)
82
-
83
- # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
84
- # product of these vectors (which is just an element-wise multiplication reduced by a sum).
85
- # We vectorize the computation for efficiency.
86
- sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
87
- speakers_per_batch).to(self.loss_device)
88
- mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int)
89
- for j in range(speakers_per_batch):
90
- mask = np.where(mask_matrix[j])[0]
91
- sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
92
- sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
93
-
94
- ## Even more vectorized version (slower maybe because of transpose)
95
- # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
96
- # ).to(self.loss_device)
97
- # eye = np.eye(speakers_per_batch, dtype=np.int)
98
- # mask = np.where(1 - eye)
99
- # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
100
- # mask = np.where(eye)
101
- # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
102
- # sim_matrix2 = sim_matrix2.transpose(1, 2)
103
-
104
- sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
105
- return sim_matrix
106
-
107
- def loss(self, embeds):
108
- """
109
- Computes the softmax loss according the section 2.1 of GE2E.
110
-
111
- :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
112
- utterances_per_speaker, embedding_size)
113
- :return: the loss and the EER for this batch of embeddings.
114
- """
115
- speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
116
-
117
- # Loss
118
- sim_matrix = self.similarity_matrix(embeds)
119
- sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
120
- speakers_per_batch))
121
- ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
122
- target = torch.from_numpy(ground_truth).long().to(self.loss_device)
123
- loss = self.loss_fn(sim_matrix, target)
124
-
125
- # EER (not backpropagated)
126
- with torch.no_grad():
127
- inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0]
128
- labels = np.array([inv_argmax(i) for i in ground_truth])
129
- preds = sim_matrix.detach().cpu().numpy()
130
-
131
- # Snippet from https://yangcha.github.io/EER-ROC/
132
- fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
133
- eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
134
-
135
- return loss, eer
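
A shape check for the GE2E loss in the removed SpeakerEncoder, using the defaults from the deleted encoder/params_model.py (64 speakers with 10 utterances each, 256-dimensional embeddings): the (64, 10, 64) similarity matrix is flattened to (640, 64) before the softmax loss, with one target speaker index per utterance.

import numpy as np

speakers_per_batch = 64
utterances_per_speaker = 10
model_embedding_size = 256

embeds = np.random.rand(speakers_per_batch, utterances_per_speaker, model_embedding_size)
sim_matrix = np.zeros((speakers_per_batch, utterances_per_speaker, speakers_per_batch))
flat = sim_matrix.reshape(speakers_per_batch * utterances_per_speaker, speakers_per_batch)
ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)

print(embeds.shape, flat.shape, ground_truth.shape)  # (64, 10, 256) (640, 64) (640,)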
encoder/params_data.py DELETED
@@ -1,29 +0,0 @@
1
-
2
- ## Mel-filterbank
3
- mel_window_length = 25 # In milliseconds
4
- mel_window_step = 10 # In milliseconds
5
- mel_n_channels = 40
6
-
7
-
8
- ## Audio
9
- sampling_rate = 16000
10
- # Number of spectrogram frames in a partial utterance
11
- partials_n_frames = 160 # 1600 ms
12
- # Number of spectrogram frames at inference
13
- inference_n_frames = 80 # 800 ms
14
-
15
-
16
- ## Voice Activation Detection
17
- # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
18
- # This sets the granularity of the VAD. Should not need to be changed.
19
- vad_window_length = 30 # In milliseconds
20
- # Number of frames to average together when performing the moving average smoothing.
21
- # The larger this value, the larger the VAD variations must be to not get smoothed out.
22
- vad_moving_average_width = 8
23
- # Maximum number of consecutive silent frames a segment can have.
24
- vad_max_silence_length = 6
25
-
26
-
27
- ## Audio volume normalization
28
- audio_norm_target_dBFS = -30
29
-
encoder/params_model.py DELETED
@@ -1,11 +0,0 @@
1
-
2
- ## Model parameters
3
- model_hidden_size = 256
4
- model_embedding_size = 256
5
- model_num_layers = 3
6
-
7
-
8
- ## Training parameters
9
- learning_rate_init = 1e-4
10
- speakers_per_batch = 64
11
- utterances_per_speaker = 10
encoder/preprocess.py DELETED
@@ -1,184 +0,0 @@
1
- from datetime import datetime
2
- from functools import partial
3
- from multiprocessing import Pool
4
- from pathlib import Path
5
-
6
- import numpy as np
7
- from tqdm import tqdm
8
-
9
- from encoder import audio
10
- from encoder.config import librispeech_datasets, anglophone_nationalites
11
- from encoder.params_data import *
12
-
13
-
14
- _AUDIO_EXTENSIONS = ("wav", "flac", "m4a", "mp3")
15
-
16
- class DatasetLog:
17
- """
18
- Registers metadata about the dataset in a text file.
19
- """
20
- def __init__(self, root, name):
21
- self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
22
- self.sample_data = dict()
23
-
24
- start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
25
- self.write_line("Creating dataset %s on %s" % (name, start_time))
26
- self.write_line("-----")
27
- self._log_params()
28
-
29
- def _log_params(self):
30
- from encoder import params_data
31
- self.write_line("Parameter values:")
32
- for param_name in (p for p in dir(params_data) if not p.startswith("__")):
33
- value = getattr(params_data, param_name)
34
- self.write_line("\t%s: %s" % (param_name, value))
35
- self.write_line("-----")
36
-
37
- def write_line(self, line):
38
- self.text_file.write("%s\n" % line)
39
-
40
- def add_sample(self, **kwargs):
41
- for param_name, value in kwargs.items():
42
- if not param_name in self.sample_data:
43
- self.sample_data[param_name] = []
44
- self.sample_data[param_name].append(value)
45
-
46
- def finalize(self):
47
- self.write_line("Statistics:")
48
- for param_name, values in self.sample_data.items():
49
- self.write_line("\t%s:" % param_name)
50
- self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
51
- self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
52
- self.write_line("-----")
53
- end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
54
- self.write_line("Finished on %s" % end_time)
55
- self.text_file.close()
56
-
57
-
58
- def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
59
- dataset_root = datasets_root.joinpath(dataset_name)
60
- if not dataset_root.exists():
61
- print("Couldn\'t find %s, skipping this dataset." % dataset_root)
62
- return None, None
63
- return dataset_root, DatasetLog(out_dir, dataset_name)
64
-
65
-
66
- def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, skip_existing: bool):
67
- # Give a name to the speaker that includes its dataset
68
- speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
69
-
70
- # Create an output directory with that name, as well as a txt file containing a
71
- # reference to each source file.
72
- speaker_out_dir = out_dir.joinpath(speaker_name)
73
- speaker_out_dir.mkdir(exist_ok=True)
74
- sources_fpath = speaker_out_dir.joinpath("_sources.txt")
75
-
76
- # There's a possibility that the preprocessing was interrupted earlier, check if
77
- # there already is a sources file.
78
- if sources_fpath.exists():
79
- try:
80
- with sources_fpath.open("r") as sources_file:
81
- existing_fnames = {line.split(",")[0] for line in sources_file}
82
- except:
83
- existing_fnames = {}
84
- else:
85
- existing_fnames = {}
86
-
87
- # Gather all audio files for that speaker recursively
88
- sources_file = sources_fpath.open("a" if skip_existing else "w")
89
- audio_durs = []
90
- for extension in _AUDIO_EXTENSIONS:
91
- for in_fpath in speaker_dir.glob("**/*.%s" % extension):
92
- # Check if the target output file already exists
93
- out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
94
- out_fname = out_fname.replace(".%s" % extension, ".npy")
95
- if skip_existing and out_fname in existing_fnames:
96
- continue
97
-
98
- # Load and preprocess the waveform
99
- wav = audio.preprocess_wav(in_fpath)
100
- if len(wav) == 0:
101
- continue
102
-
103
- # Create the mel spectrogram, discard those that are too short
104
- frames = audio.wav_to_mel_spectrogram(wav)
105
- if len(frames) < partials_n_frames:
106
- continue
107
-
108
- out_fpath = speaker_out_dir.joinpath(out_fname)
109
- np.save(out_fpath, frames)
110
- sources_file.write("%s,%s\n" % (out_fname, in_fpath))
111
- audio_durs.append(len(wav) / sampling_rate)
112
-
113
- sources_file.close()
114
-
115
- return audio_durs
116
-
117
-
118
- def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger):
119
- print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
120
-
121
- # Process the utterances for each speaker
122
- work_fn = partial(_preprocess_speaker, datasets_root=datasets_root, out_dir=out_dir, skip_existing=skip_existing)
123
- with Pool(4) as pool:
124
- tasks = pool.imap(work_fn, speaker_dirs)
125
- for sample_durs in tqdm(tasks, dataset_name, len(speaker_dirs), unit="speakers"):
126
- for sample_dur in sample_durs:
127
- logger.add_sample(duration=sample_dur)
128
-
129
- logger.finalize()
130
- print("Done preprocessing %s.\n" % dataset_name)
131
-
132
-
133
- def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
134
- for dataset_name in librispeech_datasets["train"]["other"]:
135
- # Initialize the preprocessing
136
- dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
137
- if not dataset_root:
138
- return
139
-
140
- # Preprocess all speakers
141
- speaker_dirs = list(dataset_root.glob("*"))
142
- _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)
143
-
144
-
145
- def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
146
- # Initialize the preprocessing
147
- dataset_name = "VoxCeleb1"
148
- dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
149
- if not dataset_root:
150
- return
151
-
152
- # Get the contents of the meta file
153
- with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
154
- metadata = [line.split("\t") for line in metafile][1:]
155
-
156
- # Select the ID and the nationality, filter out non-anglophone speakers
157
- nationalities = {line[0]: line[3] for line in metadata}
158
- keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if
159
- nationality.lower() in anglophone_nationalites]
160
- print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
161
- (len(keep_speaker_ids), len(nationalities)))
162
-
163
- # Get the speaker directories for anglophone speakers only
164
- speaker_dirs = dataset_root.joinpath("wav").glob("*")
165
- speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
166
- speaker_dir.name in keep_speaker_ids]
167
- print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
168
- (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))
169
-
170
- # Preprocess all speakers
171
- _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)
172
-
173
-
174
- def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
175
- # Initialize the preprocessing
176
- dataset_name = "VoxCeleb2"
177
- dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
178
- if not dataset_root:
179
- return
180
-
181
- # Get the speaker directories
182
- # Preprocess all speakers
183
- speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
184
- _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)
encoder/train.py DELETED
@@ -1,125 +0,0 @@
1
- from pathlib import Path
2
-
3
- import torch
4
-
5
- from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
6
- from encoder.model import SpeakerEncoder
7
- from encoder.params_model import *
8
- from encoder.visualizations import Visualizations
9
- from utils.profiler import Profiler
10
-
11
-
12
- def sync(device: torch.device):
13
- # For correct profiling (cuda operations are async)
14
- if device.type == "cuda":
15
- torch.cuda.synchronize(device)
16
-
17
-
18
- def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
19
- backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
20
- no_visdom: bool):
21
- # Create a dataset and a dataloader
22
- dataset = SpeakerVerificationDataset(clean_data_root)
23
- loader = SpeakerVerificationDataLoader(
24
- dataset,
25
- speakers_per_batch,
26
- utterances_per_speaker,
27
- num_workers=4,
28
- )
29
-
30
- # Setup the device on which to run the forward pass and the loss. These can be different,
31
- # because the forward pass is faster on the GPU whereas the loss is often (depending on your
32
- # hyperparameters) faster on the CPU.
33
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
34
- # FIXME: currently, the gradient is None if loss_device is cuda
35
- loss_device = torch.device("cpu")
36
-
37
- # Create the model and the optimizer
38
- model = SpeakerEncoder(device, loss_device)
39
- optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
40
- init_step = 1
41
-
42
- # Configure file path for the model
43
- model_dir = models_dir / run_id
44
- model_dir.mkdir(exist_ok=True, parents=True)
45
- state_fpath = model_dir / "encoder.pt"
46
-
47
- # Load any existing model
48
- if not force_restart:
49
- if state_fpath.exists():
50
- print("Found existing model \"%s\", loading it and resuming training." % run_id)
51
- checkpoint = torch.load(state_fpath)
52
- init_step = checkpoint["step"]
53
- model.load_state_dict(checkpoint["model_state"])
54
- optimizer.load_state_dict(checkpoint["optimizer_state"])
55
- optimizer.param_groups[0]["lr"] = learning_rate_init
56
- else:
57
- print("No model \"%s\" found, starting training from scratch." % run_id)
58
- else:
59
- print("Starting the training from scratch.")
60
- model.train()
61
-
62
- # Initialize the visualization environment
63
- vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
64
- vis.log_dataset(dataset)
65
- vis.log_params()
66
- device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
67
- vis.log_implementation({"Device": device_name})
68
-
69
- # Training loop
70
- profiler = Profiler(summarize_every=10, disabled=False)
71
- for step, speaker_batch in enumerate(loader, init_step):
72
- profiler.tick("Blocking, waiting for batch (threaded)")
73
-
74
- # Forward pass
75
- inputs = torch.from_numpy(speaker_batch.data).to(device)
76
- sync(device)
77
- profiler.tick("Data to %s" % device)
78
- embeds = model(inputs)
79
- sync(device)
80
- profiler.tick("Forward pass")
81
- embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
82
- loss, eer = model.loss(embeds_loss)
83
- sync(loss_device)
84
- profiler.tick("Loss")
85
-
86
- # Backward pass
87
- model.zero_grad()
88
- loss.backward()
89
- profiler.tick("Backward pass")
90
- model.do_gradient_ops()
91
- optimizer.step()
92
- profiler.tick("Parameter update")
93
-
94
- # Update visualizations
95
- # learning_rate = optimizer.param_groups[0]["lr"]
96
- vis.update(loss.item(), eer, step)
97
-
98
- # Draw projections and save them to the backup folder
99
- if umap_every != 0 and step % umap_every == 0:
100
- print("Drawing and saving projections (step %d)" % step)
101
- projection_fpath = model_dir / f"umap_{step:06d}.png"
102
- embeds = embeds.detach().cpu().numpy()
103
- vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
104
- vis.save()
105
-
106
- # Overwrite the latest version of the model
107
- if save_every != 0 and step % save_every == 0:
108
- print("Saving the model (step %d)" % step)
109
- torch.save({
110
- "step": step + 1,
111
- "model_state": model.state_dict(),
112
- "optimizer_state": optimizer.state_dict(),
113
- }, state_fpath)
114
-
115
- # Make a backup
116
- if backup_every != 0 and step % backup_every == 0:
117
- print("Making a backup (step %d)" % step)
118
- backup_fpath = model_dir / f"encoder_{step:06d}.bak"
119
- torch.save({
120
- "step": step + 1,
121
- "model_state": model.state_dict(),
122
- "optimizer_state": optimizer.state_dict(),
123
- }, backup_fpath)
124
-
125
- profiler.tick("Extras (visualizations, saving)")
encoder/visualizations.py DELETED
@@ -1,179 +0,0 @@
1
- from datetime import datetime
2
- from time import perf_counter as timer
3
-
4
- import numpy as np
5
- import umap
6
- import visdom
7
-
8
- from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
9
-
10
-
11
- colormap = np.array([
12
- [76, 255, 0],
13
- [0, 127, 70],
14
- [255, 0, 0],
15
- [255, 217, 38],
16
- [0, 135, 255],
17
- [165, 0, 165],
18
- [255, 167, 255],
19
- [0, 255, 255],
20
- [255, 96, 38],
21
- [142, 76, 0],
22
- [33, 0, 127],
23
- [0, 0, 0],
24
- [183, 183, 183],
25
- ], dtype=np.float) / 255
26
-
27
-
28
- class Visualizations:
29
- def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False):
30
- # Tracking data
31
- self.last_update_timestamp = timer()
32
- self.update_every = update_every
33
- self.step_times = []
34
- self.losses = []
35
- self.eers = []
36
- print("Updating the visualizations every %d steps." % update_every)
37
-
38
- # If visdom is disabled TODO: use a better paradigm for that
39
- self.disabled = disabled
40
- if self.disabled:
41
- return
42
-
43
- # Set the environment name
44
- now = str(datetime.now().strftime("%d-%m %Hh%M"))
45
- if env_name is None:
46
- self.env_name = now
47
- else:
48
- self.env_name = "%s (%s)" % (env_name, now)
49
-
50
- # Connect to visdom and open the corresponding window in the browser
51
- try:
52
- self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
53
- except ConnectionError:
54
- raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
55
- "start it.")
56
- # webbrowser.open("http://localhost:8097/env/" + self.env_name)
57
-
58
- # Create the windows
59
- self.loss_win = None
60
- self.eer_win = None
61
- # self.lr_win = None
62
- self.implementation_win = None
63
- self.projection_win = None
64
- self.implementation_string = ""
65
-
66
- def log_params(self):
67
- if self.disabled:
68
- return
69
- from encoder import params_data
70
- from encoder import params_model
71
- param_string = "<b>Model parameters</b>:<br>"
72
- for param_name in (p for p in dir(params_model) if not p.startswith("__")):
73
- value = getattr(params_model, param_name)
74
- param_string += "\t%s: %s<br>" % (param_name, value)
75
- param_string += "<b>Data parameters</b>:<br>"
76
- for param_name in (p for p in dir(params_data) if not p.startswith("__")):
77
- value = getattr(params_data, param_name)
78
- param_string += "\t%s: %s<br>" % (param_name, value)
79
- self.vis.text(param_string, opts={"title": "Parameters"})
80
-
81
- def log_dataset(self, dataset: SpeakerVerificationDataset):
82
- if self.disabled:
83
- return
84
- dataset_string = ""
85
- dataset_string += "<b>Speakers</b>: %s\n" % len(dataset.speakers)
86
- dataset_string += "\n" + dataset.get_logs()
87
- dataset_string = dataset_string.replace("\n", "<br>")
88
- self.vis.text(dataset_string, opts={"title": "Dataset"})
89
-
90
- def log_implementation(self, params):
91
- if self.disabled:
92
- return
93
- implementation_string = ""
94
- for param, value in params.items():
95
- implementation_string += "<b>%s</b>: %s\n" % (param, value)
96
- implementation_string = implementation_string.replace("\n", "<br>")
97
- self.implementation_string = implementation_string
98
- self.implementation_win = self.vis.text(
99
- implementation_string,
100
- opts={"title": "Training implementation"}
101
- )
102
-
103
- def update(self, loss, eer, step):
104
- # Update the tracking data
105
- now = timer()
106
- self.step_times.append(1000 * (now - self.last_update_timestamp))
107
- self.last_update_timestamp = now
108
- self.losses.append(loss)
109
- self.eers.append(eer)
110
- print(".", end="")
111
-
112
- # Update the plots every <update_every> steps
113
- if step % self.update_every != 0:
114
- return
115
- time_string = "Step time: mean: %5dms std: %5dms" % \
116
- (int(np.mean(self.step_times)), int(np.std(self.step_times)))
117
- print("\nStep %6d Loss: %.4f EER: %.4f %s" %
118
- (step, np.mean(self.losses), np.mean(self.eers), time_string))
119
- if not self.disabled:
120
- self.loss_win = self.vis.line(
121
- [np.mean(self.losses)],
122
- [step],
123
- win=self.loss_win,
124
- update="append" if self.loss_win else None,
125
- opts=dict(
126
- legend=["Avg. loss"],
127
- xlabel="Step",
128
- ylabel="Loss",
129
- title="Loss",
130
- )
131
- )
132
- self.eer_win = self.vis.line(
133
- [np.mean(self.eers)],
134
- [step],
135
- win=self.eer_win,
136
- update="append" if self.eer_win else None,
137
- opts=dict(
138
- legend=["Avg. EER"],
139
- xlabel="Step",
140
- ylabel="EER",
141
- title="Equal error rate"
142
- )
143
- )
144
- if self.implementation_win is not None:
145
- self.vis.text(
146
- self.implementation_string + ("<b>%s</b>" % time_string),
147
- win=self.implementation_win,
148
- opts={"title": "Training implementation"},
149
- )
150
-
151
- # Reset the tracking
152
- self.losses.clear()
153
- self.eers.clear()
154
- self.step_times.clear()
155
-
156
- def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, max_speakers=10):
157
- import matplotlib.pyplot as plt
158
-
159
- max_speakers = min(max_speakers, len(colormap))
160
- embeds = embeds[:max_speakers * utterances_per_speaker]
161
-
162
- n_speakers = len(embeds) // utterances_per_speaker
163
- ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
164
- colors = [colormap[i] for i in ground_truth]
165
-
166
- reducer = umap.UMAP()
167
- projected = reducer.fit_transform(embeds)
168
- plt.scatter(projected[:, 0], projected[:, 1], c=colors)
169
- plt.gca().set_aspect("equal", "datalim")
170
- plt.title("UMAP projection (step %d)" % step)
171
- if not self.disabled:
172
- self.projection_win = self.vis.matplot(plt, win=self.projection_win)
173
- if out_fpath is not None:
174
- plt.savefig(out_fpath)
175
- plt.clf()
176
-
177
- def save(self):
178
- if not self.disabled:
179
- self.vis.save([self.env_name])
encoderCoren.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:39373b86598fa3da9fcddee6142382efe09777e8d37dc9c0561f41f0070f134e
3
- size 17090379
 
 
 
 
{hifigan β†’ hifi-gan}/LICENSE RENAMED
File without changes
hifi-gan/README.md ADDED
@@ -0,0 +1,105 @@
1
+ # HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis
2
+
3
+ ### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae
4
+
5
+ In our [paper](https://arxiv.org/abs/2010.05646),
6
+ we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.<br/>
7
+ We provide our implementation and pretrained models as open source in this repository.
8
+
9
+ **Abstract :**
10
+ Several recent works on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms.
11
+ Although such methods improve the sampling efficiency and memory usage,
12
+ their sample quality has not yet reached that of autoregressive and flow-based generative models.
13
+ In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis.
14
+ As speech audio consists of sinusoidal signals with various periods,
15
+ we demonstrate that modeling the periodic patterns of audio is crucial for enhancing sample quality.
16
+ A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method
17
+ demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than
18
+ real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen
19
+ speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times
20
+ faster than real-time on CPU with comparable quality to an autoregressive counterpart.
21
+
22
+ Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples.
23
+
24
+
25
+ ## Pre-requisites
26
+ 1. Python >= 3.6
27
+ 2. Clone this repository.
28
+ 3. Install python requirements. Please refer [requirements.txt](requirements.txt)
29
+ 4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/).
30
+ Then move all wav files to `LJSpeech-1.1/wavs`
31
+
32
+
33
+ ## Training
34
+ ```
35
+ python train.py --config config_v1.json
36
+ ```
37
+ To train V2 or V3 Generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.<br>
38
+ Checkpoints and a copy of the configuration file are saved in the `cp_hifigan` directory by default.<br>
39
+ You can change the path by adding `--checkpoint_path` option.
40
+
41
+ Validation loss during training with V1 generator.<br>
42
+ ![validation loss](./validation_loss.png)
43
+
44
+ ## Pretrained Model
45
+ You can also use pretrained models we provide.<br/>
46
+ [Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)<br/>
47
+ Details of each folder are as follows:
48
+
49
+ |Folder Name|Generator|Dataset|Fine-Tuned|
50
+ |------|---|---|---|
51
+ |LJ_V1|V1|LJSpeech|No|
52
+ |LJ_V2|V2|LJSpeech|No|
53
+ |LJ_V3|V3|LJSpeech|No|
54
+ |LJ_FT_T2_V1|V1|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
55
+ |LJ_FT_T2_V2|V2|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
56
+ |LJ_FT_T2_V3|V3|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
57
+ |VCTK_V1|V1|VCTK|No|
58
+ |VCTK_V2|V2|VCTK|No|
59
+ |VCTK_V3|V3|VCTK|No|
60
+ |UNIVERSAL_V1|V1|Universal|No|
61
+
62
+ We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets.
63
+
64
+ ## Fine-Tuning
65
+ 1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.<br/>
66
+ The file name of the generated mel-spectrogram should match the audio file and the extension should be `.npy`.<br/>
67
+ Example:
68
+ ```
69
+ Audio File : LJ001-0001.wav
70
+ Mel-Spectrogram File : LJ001-0001.npy
71
+ ```
72
+ 2. Create a `ft_dataset` folder and copy the generated mel-spectrogram files into it.<br/>
73
+ 3. Run the following command.
74
+ ```
75
+ python train.py --fine_tuning True --config config_v1.json
76
+ ```
77
+ For other command line options, please refer to the training section.
78
+
79
+
80
+ ## Inference from wav file
81
+ 1. Make a `test_files` directory and copy wav files into the directory.
82
+ 2. Run the following command.
83
+ ```
84
+ python inference.py --checkpoint_file [generator checkpoint file path]
85
+ ```
86
+ Generated wav files are saved in `generated_files` by default.<br>
87
+ You can change the path by adding `--output_dir` option.
88
+
89
+
90
+ ## Inference for end-to-end speech synthesis
91
+ 1. Make a `test_mel_files` directory and copy the generated mel-spectrogram files into the directory.<br>
92
+ You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2),
93
+ [Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth.
94
+ 2. Run the following command.
95
+ ```
96
+ python inference_e2e.py --checkpoint_file [generator checkpoint file path]
97
+ ```
98
+ Generated wav files are saved in `generated_files_from_mel` by default.<br>
99
+ You can change the path by adding `--output_dir` option.
100
+
101
+
102
+ ## Acknowledgements
103
+ We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips)
104
+ and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this.
105
+
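The Fine-Tuning section of this README requires teacher-forced mel-spectrograms saved as `.npy` files whose base names match the source wavs, collected in `ft_dataset`. A minimal sketch of that naming convention, assuming `mel` is a NumPy array already produced by Tacotron2 (the teacher-forced extraction itself is not shown in this diff):

```python
import os
import numpy as np

def save_finetune_mel(wav_path, mel, out_dir="ft_dataset"):
    # LJ001-0001.wav -> ft_dataset/LJ001-0001.npy, as the README requires.
    os.makedirs(out_dir, exist_ok=True)
    base = os.path.splitext(os.path.basename(wav_path))[0]
    np.save(os.path.join(out_dir, base + ".npy"), mel)

# Hypothetical usage:
# save_finetune_mel("LJSpeech-1.1/wavs/LJ001-0001.wav", mel)
```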
diagrams/apple.txt β†’ hifi-gan/apple.py RENAMED
File without changes
{hifigan β†’ hifi-gan}/env.py RENAMED
File without changes
{hifigan β†’ hifi-gan}/inference.py RENAMED
@@ -6,9 +6,9 @@ import argparse
6
  import json
7
  import torch
8
  from scipy.io.wavfile import write
9
- from hifigan.env import AttrDict
10
- from hifigan.meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
11
- from hifigan.models import Generator
12
 
13
  h = None
14
  device = None
 
6
  import json
7
  import torch
8
  from scipy.io.wavfile import write
9
+ from env import AttrDict
10
+ from meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
11
+ from models import Generator
12
 
13
  h = None
14
  device = None
{hifigan β†’ hifi-gan}/inference_e2e.py RENAMED
@@ -1,13 +1,15 @@
1
  from __future__ import absolute_import, division, print_function, unicode_literals
2
 
 
3
  import os
4
  import numpy as np
 
5
  import json
6
  import torch
7
  from scipy.io.wavfile import write
8
- from hifigan.env import AttrDict
9
- from hifigan.models import Generator
10
- from io import BytesIO
11
 
12
  h = None
13
  device = None
@@ -21,9 +23,50 @@ def load_checkpoint(filepath, device):
21
  return checkpoint_dict
22
 
23
 
24
- def hifi_gan_inference(input_mel, checkpoint_file):
25
  print('Initializing Inference Process..')
26
- config_file = os.path.join(os.path.split(checkpoint_file)[0], 'config.json')
 
 
 
 
 
 
 
27
  with open(config_file) as f:
28
  data = f.read()
29
 
@@ -31,10 +74,6 @@ def hifi_gan_inference(input_mel, checkpoint_file):
31
  json_config = json.loads(data)
32
  h = AttrDict(json_config)
33
 
34
- # Set MAX_WAV_VALUE if not present
35
- if 'MAX_WAV_VALUE' not in h:
36
- h.MAX_WAV_VALUE = 32768.0 # Adjust this value based on your requirements
37
-
38
  torch.manual_seed(h.seed)
39
  global device
40
  if torch.cuda.is_available():
@@ -43,34 +82,9 @@ def hifi_gan_inference(input_mel, checkpoint_file):
43
  else:
44
  device = torch.device('cpu')
45
 
46
- generator = Generator(h).to(device)
47
 
48
- state_dict_g = load_checkpoint(checkpoint_file, device)
49
- generator.load_state_dict(state_dict_g['generator'])
50
 
51
- generator.eval()
52
- generator.remove_weight_norm()
53
 
54
- # Load data from BytesIO
55
- buffer = BytesIO(input_mel)
56
- x = np.load(buffer)
57
-
58
- x = torch.FloatTensor(x).to(device)
59
- y_g_hat = generator(x)
60
-
61
- # Detach tensor before converting to numpy
62
- audio = y_g_hat.squeeze().detach().numpy()
63
-
64
- # Set MAX_WAV_VALUE if not present
65
- if 'MAX_WAV_VALUE' not in h:
66
- h.MAX_WAV_VALUE = 32768.0 # Adjust this value based on your requirements
67
-
68
- audio = audio * h.MAX_WAV_VALUE
69
- audio = audio.astype('int16')
70
-
71
- # Save audio to BytesIO
72
- output_buffer = BytesIO()
73
- write(output_buffer, h.sampling_rate, audio)
74
-
75
- return output_buffer.getvalue()
76
-
 
1
  from __future__ import absolute_import, division, print_function, unicode_literals
2
 
3
+ import glob
4
  import os
5
  import numpy as np
6
+ import argparse
7
  import json
8
  import torch
9
  from scipy.io.wavfile import write
10
+ from env import AttrDict
11
+ from meldataset import MAX_WAV_VALUE
12
+ from models import Generator
13
 
14
  h = None
15
  device = None
 
23
  return checkpoint_dict
24
 
25
 
26
+ def scan_checkpoint(cp_dir, prefix):
27
+ pattern = os.path.join(cp_dir, prefix + '*')
28
+ cp_list = glob.glob(pattern)
29
+ if len(cp_list) == 0:
30
+ return ''
31
+ return sorted(cp_list)[-1]
32
+
33
+
34
+ def inference(a):
35
+ generator = Generator(h).to(device)
36
+
37
+ state_dict_g = load_checkpoint(a.checkpoint_file, device)
38
+ generator.load_state_dict(state_dict_g['generator'])
39
+
40
+ filelist = os.listdir(a.input_mels_dir)
41
+
42
+ os.makedirs(a.output_dir, exist_ok=True)
43
+
44
+ generator.eval()
45
+ generator.remove_weight_norm()
46
+ with torch.no_grad():
47
+ for i, filname in enumerate(filelist):
48
+ x = np.load(os.path.join(a.input_mels_dir, filname))
49
+ x = torch.FloatTensor(x).to(device)
50
+ y_g_hat = generator(x)
51
+ audio = y_g_hat.squeeze()
52
+ audio = audio * MAX_WAV_VALUE
53
+ audio = audio.cpu().numpy().astype('int16')
54
+
55
+ output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + '_generated_e2e.wav')
56
+ write(output_file, h.sampling_rate, audio)
57
+ print(output_file)
58
+
59
+
60
+ def main():
61
  print('Initializing Inference Process..')
62
+
63
+ parser = argparse.ArgumentParser()
64
+ parser.add_argument('--input_mels_dir', default='test_mel_files')
65
+ parser.add_argument('--output_dir', default='generated_files_from_mel')
66
+ parser.add_argument('--checkpoint_file', required=True)
67
+ a = parser.parse_args()
68
+
69
+ config_file = os.path.join(os.path.split(a.checkpoint_file)[0], 'config.json')
70
  with open(config_file) as f:
71
  data = f.read()
72
 
 
74
  json_config = json.loads(data)
75
  h = AttrDict(json_config)
76
 
 
 
 
 
77
  torch.manual_seed(h.seed)
78
  global device
79
  if torch.cuda.is_available():
 
82
  else:
83
  device = torch.device('cpu')
84
 
85
+ inference(a)
86
 
 
 
87
 
88
+ if __name__ == '__main__':
89
+ main()
90
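As a quick smoke test for the rewritten `inference_e2e.py`, a dummy mel can be dropped into the default `test_mel_files` directory before running the CLI. The 80-mel-bin, batch-of-one shape below is an assumption based on the usual HiFi-GAN configs, not something fixed by this diff:

```python
import os
import numpy as np

os.makedirs("test_mel_files", exist_ok=True)
# Shape (1, n_mels, frames): a batch of one for the Conv1d-based Generator.
dummy_mel = np.random.randn(1, 80, 200).astype(np.float32)
np.save("test_mel_files/dummy.npy", dummy_mel)

# Then, per the README:
#   python inference_e2e.py --checkpoint_file <generator checkpoint path>
```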
{hifigan β†’ hifi-gan}/meldataset.py RENAMED
File without changes
{hifigan β†’ hifi-gan}/models.py RENAMED
@@ -3,7 +3,7 @@ import torch.nn.functional as F
3
  import torch.nn as nn
4
  from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
5
  from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
6
- from hifigan.hifigan_utils import init_weights, get_padding
7
 
8
  LRELU_SLOPE = 0.1
9
 
 
3
  import torch.nn as nn
4
  from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
5
  from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
6
+ from utils import init_weights, get_padding
7
 
8
  LRELU_SLOPE = 0.1
9
 
hifi-gan/requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ torch==1.4.0
2
+ numpy==1.17.4
3
+ librosa==0.7.2
4
+ scipy==1.4.1
5
+ tensorboard==2.0
6
+ soundfile==0.10.3.post1
7
+ matplotlib==3.1.3
{hifigan β†’ hifi-gan}/train.py RENAMED
@@ -12,11 +12,11 @@ from torch.utils.data import DistributedSampler, DataLoader
12
  import torch.multiprocessing as mp
13
  from torch.distributed import init_process_group
14
  from torch.nn.parallel import DistributedDataParallel
15
- from hifigan.env import AttrDict, build_env
16
- from hifigan.meldataset import MelDataset, mel_spectrogram, get_dataset_filelist
17
- from hifigan.models import Generator, MultiPeriodDiscriminator, MultiScaleDiscriminator, feature_loss, generator_loss,\
18
  discriminator_loss
19
- from hifigan.hifigan_utils import plot_spectrogram, scan_checkpoint, load_checkpoint, save_checkpoint
20
 
21
  torch.backends.cudnn.benchmark = True
22
 
 
12
  import torch.multiprocessing as mp
13
  from torch.distributed import init_process_group
14
  from torch.nn.parallel import DistributedDataParallel
15
+ from env import AttrDict, build_env
16
+ from meldataset import MelDataset, mel_spectrogram, get_dataset_filelist
17
+ from models import Generator, MultiPeriodDiscriminator, MultiScaleDiscriminator, feature_loss, generator_loss,\
18
  discriminator_loss
19
+ from utils import plot_spectrogram, scan_checkpoint, load_checkpoint, save_checkpoint
20
 
21
  torch.backends.cudnn.benchmark = True
22
 
hifigan/hifigan_utils.py β†’ hifi-gan/utils.py RENAMED
File without changes
hparams.py CHANGED
@@ -61,7 +61,6 @@ def create_hparams(hparams_string=None, verbose=False):
61
  "encoder_kernel_size":5,
62
  "encoder_n_convolutions":3,
63
  "encoder_embedding_dim":512,
64
- "speaker_embedding_dim":256,
65
 
66
  # Decoder parameters
67
  "n_frames_per_step":1, # currently only 1 is supported
 
61
  "encoder_kernel_size":5,
62
  "encoder_n_convolutions":3,
63
  "encoder_embedding_dim":512,
 
64
 
65
  # Decoder parameters
66
  "n_frames_per_step":1, # currently only 1 is supported
kaggle_12000.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:27d4936bff68d3fe37053ec3110486bdea9f23bf137f07477c28bbd4f36b85ae
3
- size 338426303
 
 
 
 
logger.py ADDED
@@ -0,0 +1,48 @@
1
+ import random
2
+ import torch
3
+ from torch.utils.tensorboard import SummaryWriter
4
+ from plotting_utils import plot_alignment_to_numpy, plot_spectrogram_to_numpy
5
+ from plotting_utils import plot_gate_outputs_to_numpy
6
+
7
+
8
+ class Tacotron2Logger(SummaryWriter):
9
+ def __init__(self, logdir):
10
+ super(Tacotron2Logger, self).__init__(logdir)
11
+
12
+ def log_training(self, reduced_loss, grad_norm, learning_rate, duration,
13
+ iteration):
14
+ self.add_scalar("training.loss", reduced_loss, iteration)
15
+ self.add_scalar("grad.norm", grad_norm, iteration)
16
+ self.add_scalar("learning.rate", learning_rate, iteration)
17
+ self.add_scalar("duration", duration, iteration)
18
+
19
+ def log_validation(self, reduced_loss, model, y, y_pred, iteration):
20
+ self.add_scalar("validation.loss", reduced_loss, iteration)
21
+ _, mel_outputs, gate_outputs, alignments = y_pred
22
+ mel_targets, gate_targets = y
23
+
24
+ # plot distribution of parameters
25
+ for tag, value in model.named_parameters():
26
+ tag = tag.replace('.', '/')
27
+ self.add_histogram(tag, value.data.cpu().numpy(), iteration)
28
+
29
+ # plot alignment, mel target and predicted, gate target and predicted
30
+ idx = random.randint(0, alignments.size(0) - 1)
31
+ self.add_image(
32
+ "alignment",
33
+ plot_alignment_to_numpy(alignments[idx].data.cpu().numpy().T),
34
+ iteration, dataformats='HWC')
35
+ self.add_image(
36
+ "mel_target",
37
+ plot_spectrogram_to_numpy(mel_targets[idx].data.cpu().numpy()),
38
+ iteration, dataformats='HWC')
39
+ self.add_image(
40
+ "mel_predicted",
41
+ plot_spectrogram_to_numpy(mel_outputs[idx].data.cpu().numpy()),
42
+ iteration, dataformats='HWC')
43
+ self.add_image(
44
+ "gate",
45
+ plot_gate_outputs_to_numpy(
46
+ gate_targets[idx].data.cpu().numpy(),
47
+ torch.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
48
+ iteration, dataformats='HWC')
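A minimal sketch of how the added `Tacotron2Logger` is typically driven from a training loop; the log directory and scalar values below are placeholders, not taken from this diff:

```python
from logger import Tacotron2Logger

logger = Tacotron2Logger("outdir/logdir")  # any TensorBoard log directory
logger.log_training(reduced_loss=1.23, grad_norm=0.5,
                    learning_rate=1e-3, duration=0.8, iteration=100)
# log_validation(reduced_loss, model, y, y_pred, iteration) additionally plots
# alignments, target/predicted mels and gate outputs as images.
logger.close()
```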
logic.py CHANGED
@@ -3,28 +3,15 @@ import numpy as np
3
  import torch
4
  import base64
5
  import io
6
- from io import BytesIO
7
  import matplotlib.pyplot as plt
8
  from hparams import create_hparams
9
  from model import Tacotron2
10
- from layers import TacotronSTFT
11
  from train import load_model
12
  from text import text_to_sequence
13
- from utils import load_wav_to_torch
14
  import os
15
- import random
16
- import librosa
17
  import librosa.display
18
 
19
- use_cuda = torch.cuda.is_available()
20
- device = torch.device('cuda' if use_cuda else 'cpu')
21
-
22
- hparams = create_hparams()
23
- hparams.sampling_rate = 22050
24
- stft = TacotronSTFT(
25
- hparams.filter_length, hparams.hop_length, hparams.win_length, hparams.n_mel_channels,
26
- hparams.sampling_rate, hparams.mel_fmin, hparams.mel_fmax).to(device)
27
-
28
  # Function to plot data
29
  def plot_data(data, figsize=(16, 4), titles=['Mel Spectrogram (Original)', 'Mel Spectrogram (Postnet)', 'Alignment'],
30
  xlabel=['Time Steps', 'Time Steps', 'Decoder Time Steps'],
@@ -55,84 +42,59 @@ def plot_data(data, figsize=(16, 4), titles=['Mel Spectrogram (Original)', 'Mel
55
  return img_base64
56
 
57
  #Function to plot timedomain waveform
58
- def plot_waveforms(audio_data):
59
- # Load the audio from BytesIO
60
- buffer = BytesIO(audio_data)
61
- y, sr = librosa.load(buffer, sr=None)
62
-
63
- # Create waveform plot
64
- plt.figure(figsize=(10, 4))
65
- librosa.display.waveshow(y, sr=sr)
66
- plt.xlabel("Time (s)")
67
- plt.ylabel("Amplitude")
68
- plt.title("Waveform")
69
-
70
- # Save the plot to a BytesIO object
71
- wave_buffer = BytesIO()
72
- plt.savefig(wave_buffer, format="png")
73
- wave_buffer.seek(0)
74
- plt.close()
75
-
76
- # Encode the plot as base64
77
- wave_base64 = base64.b64encode(wave_buffer.read()).decode('utf-8')
78
-
79
- return wave_base64
80
 
 
 
81
 
82
- # load speaker model
83
- def load_speaker_model(speaker_model_path):
84
- from speaker.model import SpeakerEncoder
85
- device = torch.device('cuda' if use_cuda else 'cpu')
86
- loss_device = torch.device("cpu")
87
-
88
- model = SpeakerEncoder(device, loss_device)
89
- speaker_dict = torch.load(speaker_model_path, map_location='cpu')
90
- model.load_state_dict(speaker_dict)
91
 
92
- # Freeze the weights of the speaker model
93
- for param in model.parameters():
94
- param.requires_grad = False
 
 
95
 
96
- return model
97
 
98
- speaker_model = load_speaker_model('speaker/saved_models/saved_model_e273_LargeBatch.pt').to(device).eval().float()
99
 
100
- def extract_speech_embedding(audio_path: str):
101
- audio, sampling_rate = load_wav_to_torch(audio_path)
102
- if sampling_rate != stft.sampling_rate:
103
- raise ValueError("{} SR doesn't match target {} SR".format(sampling_rate, stft.sampling_rate))
104
-
105
- audio_norm = audio / 32768.0
106
- audio_norm = audio_norm.unsqueeze(0)
107
- audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False).to(device)
108
- melspec = stft.mel_spectrogram(audio_norm).transpose(1,2).float()
109
-
110
- if melspec.shape[1] <= 128:
111
- mel_slice = mel
112
- else:
113
- slice_start = random.randint(0,melspec.shape[1]-128)
114
- mel_slice = melspec[:,slice_start:slice_start+128]
115
- speaker_embedding = speaker_model(mel_slice)
116
- return speaker_embedding
117
-
118
  def synthesize_voice(text_input, checkpoint_path):
119
- # Load Tacotron2 model from checkpoint
120
- model = load_model(hparams)
121
- checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
122
- model.load_state_dict(checkpoint['state_dict'])
123
- model = model.to(device).eval().float()
124
 
 
 
 
 
125
 
126
  # Nepali text
127
- speaker_audio_path='speaker_audio/ariana.wav'
128
  sequence = np.array(text_to_sequence(text_input, ['transliteration_cleaners']))[None, :]
129
- sequence = torch.autograd.Variable(torch.from_numpy(sequence)).to(device).long()
130
- speaker_embedding = extract_speech_embedding(speaker_audio_path)
131
-
132
  # Melspectrogram and Alignment graph
133
- mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, speaker_embedding)
134
  mel_output_data = mel_outputs.data.cpu().numpy()[0]
135
  mel_output_postnet_data = mel_outputs_postnet.data.cpu().numpy()[0]
136
  alignments_data = alignments.data.cpu().numpy()[0].T
137
 
138
- return mel_output_data, mel_output_postnet_data, alignments_data
 
 
 
 
 
 
 
 
 
 
 
 
3
  import torch
4
  import base64
5
  import io
 
6
  import matplotlib.pyplot as plt
7
  from hparams import create_hparams
8
  from model import Tacotron2
 
9
  from train import load_model
10
  from text import text_to_sequence
 
11
  import os
12
+ import subprocess
 
13
  import librosa.display
14
 
 
 
 
 
 
 
 
 
 
15
  # Function to plot data
16
  def plot_data(data, figsize=(16, 4), titles=['Mel Spectrogram (Original)', 'Mel Spectrogram (Postnet)', 'Alignment'],
17
  xlabel=['Time Steps', 'Time Steps', 'Decoder Time Steps'],
 
42
  return img_base64
43
 
44
  #Function to plot timedomain waveform
45
+ def plot_waveforms(audio_file, sr=22050):
46
+ # Load audio waveform
47
+ y, sr = librosa.load(audio_file, sr=sr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
+ # Create time vector
50
+ time = librosa.times_like(y, sr=sr)
51
 
52
+ # Plot the waveform
53
+ plt.figure(figsize=(16, 4))
54
+ librosa.display.waveshow(y, sr=sr)
55
+ plt.title('Time vs Amplitude')
56
+ plt.xlabel('Time (s)')
57
+ plt.ylabel('Amplitude')
 
 
 
58
 
59
+ plt.tight_layout()
60
+ # plt.savefig('static/waveform.png')
61
+ img_buffer = io.BytesIO()
62
+ plt.savefig(img_buffer, format='png', bbox_inches='tight', pad_inches=0)
63
+ plt.close()
64
 
65
+ img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
66
 
67
+ return img_base64
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def synthesize_voice(text_input, checkpoint_path):
70
+ # Load Tacotron2 model
71
+ hparams = create_hparams()
72
+ hparams.sampling_rate = 22050
 
 
73
 
74
+ # Load model from checkpoint
75
+ model = load_model(hparams)
76
+ model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
77
+ model = model.cuda().eval().half()
78
 
79
  # Nepali text
 
80
  sequence = np.array(text_to_sequence(text_input, ['transliteration_cleaners']))[None, :]
81
+ sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
82
+
 
83
  # Melspectrogram and Alignment graph
84
+ mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
85
  mel_output_data = mel_outputs.data.cpu().numpy()[0]
86
  mel_output_postnet_data = mel_outputs_postnet.data.cpu().numpy()[0]
87
  alignments_data = alignments.data.cpu().numpy()[0].T
88
 
89
+ np.save('mel_files/mel1'+'.npy', mel_output_data)
90
+
91
+ input_mels_dir = 'mel_files/'
92
+ output_dir = 'audio_output/'
93
+ run_hifigan_inference(input_mels_dir, output_dir)
94
+
95
+ return mel_output_data, mel_output_postnet_data, alignments_data
96
+
97
+
98
+ def run_hifigan_inference(input_mels_dir, output_dir):
99
+ script_path = os.path.join(os.path.dirname("hifigan/"), "inference_e2e.py") # Assuming both scripts are in the same directory
100
+ subprocess.run(["python", script_path, "--checkpoint_file", "generator_v1", "--input_mels_dir", input_mels_dir, "--output_dir", output_dir])
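Note that this commit renames the vocoder folder from `hifigan/` to `hifi-gan/`, while `run_hifigan_inference` above still builds its path from the literal string `"hifigan/"` and passes a bare `"generator_v1"` checkpoint name. A slightly more defensive sketch (the directory and checkpoint defaults are assumptions; adjust them to the actual layout):

```python
import os
import subprocess
import sys

def run_hifigan_inference(input_mels_dir, output_dir,
                          vocoder_dir="hifi-gan",
                          checkpoint_file="generator_v1"):
    # Run from inside the vocoder folder so its flat imports (env, models, ...)
    # resolve, and hand it absolute paths for everything else.
    subprocess.run([sys.executable, "inference_e2e.py",
                    "--checkpoint_file", os.path.abspath(checkpoint_file),
                    "--input_mels_dir", os.path.abspath(input_mels_dir),
                    "--output_dir", os.path.abspath(output_dir)],
                   cwd=vocoder_dir, check=True)
```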
loss_function.py ADDED
@@ -0,0 +1,19 @@
1
+ from torch import nn
2
+
3
+
4
+ class Tacotron2Loss(nn.Module):
5
+ def __init__(self):
6
+ super(Tacotron2Loss, self).__init__()
7
+
8
+ def forward(self, model_output, targets):
9
+ mel_target, gate_target = targets[0], targets[1]
10
+ mel_target.requires_grad = False
11
+ gate_target.requires_grad = False
12
+ gate_target = gate_target.view(-1, 1)
13
+
14
+ mel_out, mel_out_postnet, gate_out, _ = model_output
15
+ gate_out = gate_out.view(-1, 1)
16
+ mel_loss = nn.MSELoss()(mel_out, mel_target) + \
17
+ nn.MSELoss()(mel_out_postnet, mel_target)
18
+ gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
19
+ return mel_loss + gate_loss
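In equation form, the added `Tacotron2Loss` is just the sum of two mel reconstruction terms and a stop-token (gate) term, restating the code above:

```latex
\mathcal{L} \;=\; \operatorname{MSE}\!\big(\hat{y}_{\text{mel}},\, y_{\text{mel}}\big)
\;+\; \operatorname{MSE}\!\big(\hat{y}_{\text{mel}}^{\,\text{postnet}},\, y_{\text{mel}}\big)
\;+\; \operatorname{BCEWithLogits}\!\big(\hat{g},\, g\big)
```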
loss_scaler.py ADDED
@@ -0,0 +1,131 @@
1
+ import torch
2
+
3
+ class LossScaler:
4
+
5
+ def __init__(self, scale=1):
6
+ self.cur_scale = scale
7
+
8
+ # `params` is a list / generator of torch.Variable
9
+ def has_overflow(self, params):
10
+ return False
11
+
12
+ # `x` is a torch.Tensor
13
+ def _has_inf_or_nan(x):
14
+ return False
15
+
16
+ # `overflow` is boolean indicating whether we overflowed in gradient
17
+ def update_scale(self, overflow):
18
+ pass
19
+
20
+ @property
21
+ def loss_scale(self):
22
+ return self.cur_scale
23
+
24
+ def scale_gradient(self, module, grad_in, grad_out):
25
+ return tuple(self.loss_scale * g for g in grad_in)
26
+
27
+ def backward(self, loss):
28
+ scaled_loss = loss*self.loss_scale
29
+ scaled_loss.backward()
30
+
31
+ class DynamicLossScaler:
32
+
33
+ def __init__(self,
34
+ init_scale=2**32,
35
+ scale_factor=2.,
36
+ scale_window=1000):
37
+ self.cur_scale = init_scale
38
+ self.cur_iter = 0
39
+ self.last_overflow_iter = -1
40
+ self.scale_factor = scale_factor
41
+ self.scale_window = scale_window
42
+
43
+ # `params` is a list / generator of torch.Variable
44
+ def has_overflow(self, params):
45
+ # return False
46
+ for p in params:
47
+ if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data):
48
+ return True
49
+
50
+ return False
51
+
52
+ # `x` is a torch.Tensor
53
+ def _has_inf_or_nan(x):
54
+ cpu_sum = float(x.float().sum())
55
+ if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
56
+ return True
57
+ return False
58
+
59
+ # `overflow` is boolean indicating whether we overflowed in gradient
60
+ def update_scale(self, overflow):
61
+ if overflow:
62
+ #self.cur_scale /= self.scale_factor
63
+ self.cur_scale = max(self.cur_scale/self.scale_factor, 1)
64
+ self.last_overflow_iter = self.cur_iter
65
+ else:
66
+ if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
67
+ self.cur_scale *= self.scale_factor
68
+ # self.cur_scale = 1
69
+ self.cur_iter += 1
70
+
71
+ @property
72
+ def loss_scale(self):
73
+ return self.cur_scale
74
+
75
+ def scale_gradient(self, module, grad_in, grad_out):
76
+ return tuple(self.loss_scale * g for g in grad_in)
77
+
78
+ def backward(self, loss):
79
+ scaled_loss = loss*self.loss_scale
80
+ scaled_loss.backward()
81
+
82
+ ##############################################################
83
+ # Example usage below here -- assuming it's in a separate file
84
+ ##############################################################
85
+ if __name__ == "__main__":
86
+ import torch
87
+ from torch.autograd import Variable
88
+ from dynamic_loss_scaler import DynamicLossScaler
89
+
90
+ # N is batch size; D_in is input dimension;
91
+ # H is hidden dimension; D_out is output dimension.
92
+ N, D_in, H, D_out = 64, 1000, 100, 10
93
+
94
+ # Create random Tensors to hold inputs and outputs, and wrap them in Variables.
95
+ x = Variable(torch.randn(N, D_in), requires_grad=False)
96
+ y = Variable(torch.randn(N, D_out), requires_grad=False)
97
+
98
+ w1 = Variable(torch.randn(D_in, H), requires_grad=True)
99
+ w2 = Variable(torch.randn(H, D_out), requires_grad=True)
100
+ parameters = [w1, w2]
101
+
102
+ learning_rate = 1e-6
103
+ optimizer = torch.optim.SGD(parameters, lr=learning_rate)
104
+ loss_scaler = DynamicLossScaler()
105
+
106
+ for t in range(500):
107
+ y_pred = x.mm(w1).clamp(min=0).mm(w2)
108
+ loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
109
+ print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
110
+ print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
111
+ print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))
112
+
113
+ # Run backprop
114
+ optimizer.zero_grad()
115
+ loss.backward()
116
+
117
+ # Check for overflow
118
+ has_overflow = DynamicLossScaler.has_overflow(parameters)
119
+
120
+ # If no overflow, unscale grad and update as usual
121
+ if not has_overflow:
122
+ for param in parameters:
123
+ param.grad.data.mul_(1. / loss_scaler.loss_scale)
124
+ optimizer.step()
125
+ # Otherwise, don't do anything -- ie, skip iteration
126
+ else:
127
+ print('OVERFLOW!')
128
+
129
+ # Update loss scale for next iteration
130
+ loss_scaler.update_scale(has_overflow)
131
+
model.py CHANGED
@@ -147,8 +147,13 @@ class Postnet(nn.Module):
147
 
148
 
149
  class Encoder(nn.Module):
 
 
 
 
150
  def __init__(self, hparams):
151
  super(Encoder, self).__init__()
 
152
  convolutions = []
153
  for _ in range(hparams.encoder_n_convolutions):
154
  conv_layer = nn.Sequential(
@@ -165,15 +170,13 @@ class Encoder(nn.Module):
165
  int(hparams.encoder_embedding_dim / 2), 1,
166
  batch_first=True, bidirectional=True)
167
 
168
- def forward(self, x, input_lengths, speaker_embedding):
169
- # Modify the input x to concatenate the speaker embedding
170
- x = torch.cat((x, speaker_embedding.unsqueeze(1).expand(-1, x.size(1), -1)), dim=-1)
171
-
172
  for conv in self.convolutions:
173
  x = F.dropout(F.relu(conv(x)), 0.5, self.training)
174
 
175
  x = x.transpose(1, 2)
176
 
 
177
  input_lengths = input_lengths.cpu().numpy()
178
  x = nn.utils.rnn.pack_padded_sequence(
179
  x, input_lengths, batch_first=True)
@@ -186,10 +189,7 @@ class Encoder(nn.Module):
186
 
187
  return outputs
188
 
189
- def inference(self, x, speaker_embedding):
190
- # Modify the input x to concatenate the speaker embedding
191
- x = torch.cat((x, speaker_embedding.unsqueeze(1).expand(-1, x.size(1), -1)), dim=-1)
192
-
193
  for conv in self.convolutions:
194
  x = F.dropout(F.relu(conv(x)), 0.5, self.training)
195
 
@@ -496,14 +496,13 @@ class Tacotron2(nn.Module):
496
 
497
  return outputs
498
 
499
- def forward(self, inputs, speaker_embedding):
500
  text_inputs, text_lengths, mels, max_len, output_lengths = inputs
501
  text_lengths, output_lengths = text_lengths.data, output_lengths.data
502
 
503
  embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
504
 
505
- # Pass the speaker embedding to the Encoder
506
- encoder_outputs = self.encoder(embedded_inputs, text_lengths, speaker_embedding)
507
 
508
  mel_outputs, gate_outputs, alignments = self.decoder(
509
  encoder_outputs, mels, memory_lengths=text_lengths)
@@ -515,11 +514,9 @@ class Tacotron2(nn.Module):
515
  [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
516
  output_lengths)
517
 
518
- def inference(self, inputs, speaker_embedding):
519
  embedded_inputs = self.embedding(inputs).transpose(1, 2)
520
- # Pass the speaker embedding to the Encoder
521
- encoder_outputs = self.encoder.inference(embedded_inputs, speaker_embedding)
522
-
523
  mel_outputs, gate_outputs, alignments = self.decoder.inference(
524
  encoder_outputs)
525
 
@@ -530,4 +527,3 @@ class Tacotron2(nn.Module):
530
  [mel_outputs, mel_outputs_postnet, gate_outputs, alignments])
531
 
532
  return outputs
533
-
 
147
 
148
 
149
  class Encoder(nn.Module):
150
+ """Encoder module:
151
+ - Three 1-d convolution banks
152
+ - Bidirectional LSTM
153
+ """
154
  def __init__(self, hparams):
155
  super(Encoder, self).__init__()
156
+
157
  convolutions = []
158
  for _ in range(hparams.encoder_n_convolutions):
159
  conv_layer = nn.Sequential(
 
170
  int(hparams.encoder_embedding_dim / 2), 1,
171
  batch_first=True, bidirectional=True)
172
 
173
+ def forward(self, x, input_lengths):
 
 
 
174
  for conv in self.convolutions:
175
  x = F.dropout(F.relu(conv(x)), 0.5, self.training)
176
 
177
  x = x.transpose(1, 2)
178
 
179
+ # pytorch tensor are not reversible, hence the conversion
180
  input_lengths = input_lengths.cpu().numpy()
181
  x = nn.utils.rnn.pack_padded_sequence(
182
  x, input_lengths, batch_first=True)
 
189
 
190
  return outputs
191
 
192
+ def inference(self, x):
 
 
 
193
  for conv in self.convolutions:
194
  x = F.dropout(F.relu(conv(x)), 0.5, self.training)
195
 
 
496
 
497
  return outputs
498
 
499
+ def forward(self, inputs):
500
  text_inputs, text_lengths, mels, max_len, output_lengths = inputs
501
  text_lengths, output_lengths = text_lengths.data, output_lengths.data
502
 
503
  embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
504
 
505
+ encoder_outputs = self.encoder(embedded_inputs, text_lengths)
 
506
 
507
  mel_outputs, gate_outputs, alignments = self.decoder(
508
  encoder_outputs, mels, memory_lengths=text_lengths)
 
514
  [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
515
  output_lengths)
516
 
517
+ def inference(self, inputs):
518
  embedded_inputs = self.embedding(inputs).transpose(1, 2)
519
+ encoder_outputs = self.encoder.inference(embedded_inputs)
 
 
520
  mel_outputs, gate_outputs, alignments = self.decoder.inference(
521
  encoder_outputs)
522
 
 
527
  [mel_outputs, mel_outputs_postnet, gate_outputs, alignments])
528
 
529
  return outputs
 
multiproc.py ADDED
@@ -0,0 +1,23 @@
1
+ import time
2
+ import torch
3
+ import sys
4
+ import subprocess
5
+
6
+ argslist = list(sys.argv)[1:]
7
+ num_gpus = torch.cuda.device_count()
8
+ argslist.append('--n_gpus={}'.format(num_gpus))
9
+ workers = []
10
+ job_id = time.strftime("%Y_%m_%d-%H%M%S")
11
+ argslist.append("--group_name=group_{}".format(job_id))
12
+
13
+ for i in range(num_gpus):
14
+ argslist.append('--rank={}'.format(i))
15
+ stdout = None if i == 0 else open("logs/{}_GPU_{}.log".format(job_id, i),
16
+ "w")
17
+ print(argslist)
18
+ p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout)
19
+ workers.append(p)
20
+ argslist = argslist[:-1]
21
+
22
+ for p in workers:
23
+ p.wait()
requirements.txt CHANGED
@@ -1,8 +1,11 @@
1
- fastapi[all]
 
 
 
2
  gunicorn
3
- torch==1.12.1+cu113
4
- torchaudio==0.12.1+cu113
5
- torchvision==0.13.1+cu113
6
  matplotlib==3.5.3
7
  numpy==1.18.5
8
  inflect
@@ -11,6 +14,4 @@ scipy==1.7.3
11
  tensorboard==2.11.2
12
  Unidecode
13
  pillow
14
- uvicorn
15
- httpx==0.19.0
16
- --extra-index-url https://download.pytorch.org/whl/cu113
 
1
+ flask
2
+ flask_cors
3
+ typing
4
+ fastapi
5
  gunicorn
6
+ torch==1.12.1
7
+ torchaudio==0.12.1
8
+ torchvision==0.13.1
9
  matplotlib==3.5.3
10
  numpy==1.18.5
11
  inflect
 
14
  tensorboard==2.11.2
15
  Unidecode
16
  pillow
17
+ uvicorn
 
 
saved_model.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ccc0abcd0fb77104be73e6675454a06e7797bf1d4a1177181c32b648e9d75a9
3
- size 5697243
 
 
 
 
speaker/__init__.py DELETED
File without changes
speaker/bana.txt DELETED
File without changes
speaker/data.py DELETED
@@ -1,109 +0,0 @@
1
- import torch
2
- import torchaudio.datasets as datasets
3
- import torchaudio.transforms as transforms
4
- from collections import defaultdict
5
- import random
6
- import layers
7
-
8
- import warnings
9
-
10
- class SpeakerMelLoader(torch.utils.data.Dataset):
11
- """
12
- computes mel-spectrograms from audio file and pulls the speaker ID from the
13
- dataset
14
- """
15
-
16
- def __init__(self, dataset, format='speaker', speaker_utterances=4, mel_length = 128, mel_type = 'Tacotron'):
17
- self.dataset = dataset
18
- self.set_format(format)
19
- self.speaker_utterances = speaker_utterances
20
- self.mel_length = mel_length
21
- self.mel_type = mel_type
22
- self.mel_generators = dict()
23
-
24
- def set_format(self,format):
25
- self.format = format
26
-
27
- if format == 'speaker':
28
- self.create_speaker_index()
29
-
30
- def create_speaker_index(self):
31
- vals = [x.split('-',1) for x in self.dataset._walker]
32
- speaker_map = defaultdict(list)
33
-
34
- for i,v in enumerate(vals):
35
- speaker_map[v[0]].append(i)
36
-
37
- self.speaker_map = speaker_map
38
- self.speaker_keys = list(speaker_map.keys())
39
-
40
- def apply_mel_gen(self, waveform, sampling_rate, channels=80):
41
- if (sampling_rate, channels) not in self.mel_generators:
42
- if self.mel_type == 'MFCC':
43
- mel_gen = transforms.MFCC(sample_rate=sampling_rate, n_mfcc=channels)
44
- elif self.mel_type == 'Mel':
45
- mel_gen = transforms.MelSpectrogram(sample_rate=sampling_rate, n_mels=channels)
46
- elif self.mel_type == 'Tacotron':
47
- mel_gen = layers.TacotronSTFT(sampling_rate=sampling_rate,n_mel_channels=channels)
48
- else:
49
- raise NotImplementedError('Unsupported mel_type in MelSpeakerLoader: '+self.mel_type)
50
- self.mel_generators[(sampling_rate,channels)] = mel_gen
51
- else:
52
- mel_gen = self.mel_generators[(sampling_rate, channels)]
53
-
54
- if self.mel_type == 'Tacotron':
55
- #Replicating from Tacotron2 data loader
56
- max_wav_value=32768.0
57
- #skip normalization from Tacotron2, LibriSpeech data looks pre-normalized (all vals between 0-1)
58
- audio_norm = waveform #/ max_wav_value
59
- audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
60
- melspec = mel_gen.mel_spectrogram(audio_norm)
61
- else:
62
- audio = waveform.unsqueeze(0)
63
- audio = torch.autograd.Variable(audio, requires_grad=False)
64
- melspec = mel_gen(audio)
65
-
66
- return melspec
67
-
68
- def get_mel(self, waveform, sampling_rate, channels=80):
69
- # We previously identified that these warnings were ok.
70
- with warnings.catch_warnings():
71
- warnings.filterwarnings('ignore', message=r'At least one mel filterbank has all zero values.*', module=r'torchaudio.*')
72
- melspec = self.apply_mel_gen(waveform, sampling_rate, channels)
73
- # melspec is (1,1,channels, time) by default
74
- # return (time, channels)
75
- melspec = torch.squeeze(melspec).T
76
- return melspec
77
-
78
- def __getitem__(self, index):
79
- if self.format == 'utterance':
80
- (waveform, sample_rate, _, speaker_id, _, _) = self.dataset[index]
81
- mel = self.get_mel(waveform, sample_rate)
82
- return (speaker_id, mel)
83
- elif self.format == 'speaker':
84
- speaker_id = self.speaker_keys[index]
85
- utter_indexes = random.sample(self.speaker_map[speaker_id], self.speaker_utterances)
86
- mels = []
87
- for i in utter_indexes:
88
- (waveform, sample_rate, _, speaker_id, _, _) = self.dataset[i]
89
- mel = self.get_mel(waveform, sample_rate)
90
- if mel.shape[0] < self.mel_length:
91
- #Zero pad mel on the right to mel_length
92
- #pad_tuple is (dn start, dn end, dn-1 start, dn-1 end, ... , d1 start, d1 end)
93
- pad_tuple = (0,0,0,self.mel_length-mel.shape[0])
94
- mel=torch.nn.functional.pad(mel,pad_tuple)
95
- mel_frame = 0
96
- else:
97
- mel_frame = random.randint(0,mel.shape[0]-self.mel_length)
98
- mels.append(mel[mel_frame:mel_frame+self.mel_length,:])
99
- return (speaker_id, torch.stack(mels,0))
100
- else:
101
- raise NotImplementedError()
102
-
103
- def __len__(self):
104
- if self.format == 'utterance':
105
- return len(self.dataset)
106
- elif self.format == 'speaker':
107
- return len(self.speaker_keys)
108
- else:
109
- raise NotImplementedError()
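For context, the deleted `SpeakerMelLoader` wrapped a torchaudio dataset and, in `'speaker'` format, returned a stack of fixed-length 128-frame mel slices per speaker. A sketch of how it was presumably instantiated before removal (the LibriSpeech subset and utterance count are assumptions):

```python
import torch
import torchaudio.datasets as datasets
from speaker.data import SpeakerMelLoader  # module removed by this commit

libri = datasets.LIBRISPEECH("./data", url="train-clean-100", download=True)
dataset = SpeakerMelLoader(libri, format="speaker",
                           speaker_utterances=4, mel_length=128)

speaker_id, mels = dataset[0]   # mels: (utterances, mel_length, n_mel_channels)
loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)
```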
speaker/model.py DELETED
@@ -1,191 +0,0 @@
1
- from torch import nn
2
- import numpy as np
3
- import torch
4
- from torch.nn.utils import clip_grad_norm_
5
-
6
- class SpeakerEncoder(nn.Module):
7
- """ Learn speaker representation from speech utterance of arbitrary lengths.
8
- """
9
- def __init__(self, device, loss_device):
10
- super().__init__()
11
- self.loss_device = loss_device
12
-
13
- # lstm block consisting of 3 layers
14
- # takes input 80 channel log-mel spectrograms, projected to 256 dimensions
15
- self.lstm = nn.LSTM(
16
- input_size=80,
17
- hidden_size=256,
18
- num_layers=3,
19
- batch_first=True,
20
- dropout=0,
21
- bidirectional=False
22
- ).to(device)
23
-
24
- self.linear = nn.Linear(in_features=256, out_features=256).to(device)
25
- self.relu = nn.ReLU().to(device)
26
- # epsilon term for numerical stability ( ie - division by 0)
27
- self.epsilon = 1e-5
28
-
29
- #Cosine similarity weights
30
- self.sim_weight = nn.Parameter(torch.tensor([5.])).to(loss_device)
31
- self.sim_bias = nn.Parameter(torch.tensor([-1.])).to(loss_device)
32
-
33
- def forward(self, utterances, h_init=None, c_init=None):
34
- # implement section 2.1 from https://arxiv.org/pdf/1806.04558.pdf
35
- if h_init is None or c_init is None:
36
- out, (hidden, cell) = self.lstm(utterances)
37
- else:
38
- out, (hidden, cell) = self.lstm(utterances, (h_init, c_init))
39
-
40
- # compute speaker embedding from hidden state of final layer
41
- final_hidden = hidden[-1]
42
- speaker_embedding = self.relu(self.linear(final_hidden))
43
-
44
- # l2 norm of speaker embedding
45
- speaker_embedding = speaker_embedding / (torch.norm(speaker_embedding, dim=1, keepdim=True) + self.epsilon)
46
- return speaker_embedding
47
-
48
- def gradient_clipping(self):
49
- self.sim_weight.grad *= 0.01
50
- self.sim_bias.grad *= 0.01
51
-
52
- #Pytorch to clip gradients if norm greater than max
53
- clip_grad_norm_(self.parameters(),max_norm=3,norm_type=2)
54
-
55
- def similarity_matrix(self, embeds, debug=False):
56
- # calculate s_ji,k from section 2.1 of GE2E paper
57
- # output matrix is cosine similarity between each utterance x centroid of each speaker
58
- # embeds input size: (speakers, utterances, embedding size)
59
-
60
- # Speaker centroids
61
- # Equal to average of utterance embeddings for the speaker
62
- # Used for neg examples (utterance comparing to false speaker)
63
- # Equation 1 in paper
64
- # size: (speakers, 1, embedding size)
65
- speaker_centroid = torch.mean(embeds,dim=1,keepdim=True)
66
-
67
- # Utterance exclusive centroids
68
- # Equal to average of utterance embeddings for the speaker, excluding ith utterance
69
- # Used for pos samples (utterance comparing to true speaker; speaker centroid exludes the utterance)
70
- # Equation 8 in paper
71
- # size: (speakers, utterances, embedding size)
72
- num_utterance = embeds.shape[1]
73
- utter_ex_centroid = (torch.sum(embeds,dim=1,keepdim=True) - embeds) / (num_utterance-1)
74
-
75
- if debug:
76
- print("e",embeds.shape)
77
- print(embeds)
78
- print("sc",speaker_centroid.shape)
79
- print(speaker_centroid)
80
- print("uc",utter_ex_centroid.shape)
81
- print(utter_ex_centroid)
82
-
83
- # Create pos and neg masks
84
- num_speaker = embeds.shape[0]
85
- i = torch.eye(num_speaker, dtype=torch.int)
86
- pos_mask = torch.where(i)
87
- neg_mask = torch.where(1-i)
88
-
89
- if debug:
90
- print("pm",len(pos_mask),len(pos_mask[0]))
91
- print(pos_mask)
92
- print("nm",len(neg_mask),len(neg_mask[0]))
93
- print(neg_mask)
94
-
95
- # Compile similarity matrix
96
- # size: (speakers, utterances, speakers)
97
- # initial size is (speakers, speakers, utterances for easier vectorization)
98
- sim_matrix = torch.zeros(num_speaker, num_speaker, num_utterance).to(self.loss_device)
99
- sim_matrix[pos_mask] = nn.functional.cosine_similarity(embeds,utter_ex_centroid,dim=2)
100
- sim_matrix[neg_mask] = nn.functional.cosine_similarity(embeds[neg_mask[0]],speaker_centroid[neg_mask[1]],dim=2)
101
- if debug:
102
- print("sm",sim_matrix.shape)
103
- print("pos vals",sim_matrix[pos_mask])
104
- print("neg vals",sim_matrix[neg_mask])
105
- print(sim_matrix)
106
-
107
- sim_matrix = sim_matrix.permute(0,2,1)
108
-
109
- if debug:
110
- print("sm",sim_matrix.shape)
111
- print(sim_matrix)
112
- print("cos sim weight", self.sim_weight)
113
- print("cos sim bias", self.sim_bias)
114
-
115
- # Apply weight / bias
116
- sim_matrix = sim_matrix * self.sim_weight + self.sim_bias
117
- return sim_matrix
118
-
119
- def softmax_loss(self, embeds):
120
- """
121
- computes softmax loss as defined by equ 6 in the GE2E paper
122
- :param embeds: shape (speakers, utterances, embedding size)
123
- :return: computed softmax loss
124
- """
125
- # per the GE2E paper, softmax loss as defined by equ 6
126
- # performs slightly better over Text-Independent Speaker
127
- # Verification tasks.
128
- # ref section 2.1 of the GE2E paper
129
- speaker_count = embeds.shape[0]
130
-
131
- # speaker, utterance, speaker
132
- similarities = self.similarity_matrix(embeds)
133
-
134
- # equ 6
135
- loss_matrix = -similarities[torch.arange(0, speaker_count), :, torch.arange(0, speaker_count)] + \
136
- torch.log(torch.sum(torch.exp(similarities), dim=2))
137
-
138
- # equ 10
139
- return torch.sum(loss_matrix)
140
-
141
- def contrast_loss(self, embeds):
142
- """
143
- computes contrast loss as defined by equ 7 in the GE2E paper
144
- :param embeds: shape (speakers, utterances, embedding size)
145
- :return: computed softmax loss
146
- """
147
- # per the GE2E paper, contrast loss as defined by equ 7
148
- # performs slightly better over Text-Dependent Speaker
149
- # Verification tasks.
150
- # ref section 2.1 of the GE2E paper
151
- speaker_count, utterance_count = embeds.shape[0:2]
152
-
153
- # speaker, utterance, speaker
154
- similarities = self.similarity_matrix(embeds)
155
-
156
- # Janky indexing to resolve k != j
157
- mask = torch.ones(similarities.shape, dtype=torch.bool)
158
- mask[torch.arange(speaker_count), :, torch.arange(speaker_count)] = False
159
- closest_neighbors, _ = torch.max(similarities[mask].reshape(speaker_count, utterance_count, speaker_count - 1), dim=2)
160
-
161
- # Positive influence over matching embeddings
162
- matching_embedding = similarities[torch.arange(0, speaker_count), :, torch.arange(0, speaker_count)]
163
-
164
- # equ 7
165
- loss_matrix = 1 - torch.sigmoid(matching_embedding) + torch.sigmoid(closest_neighbors)
166
-
167
- # equ 10
168
- return torch.sum(loss_matrix)
169
-
170
- def accuracy(self, embeds):
171
- """
172
- computes argmax accuracy
173
- :param embeds: shape (speakers, utterances, speakers)
174
- :return: accuracy
175
- """
176
- num_speaker, num_utter = embeds.shape[:2]
177
-
178
- similarities = self.similarity_matrix(embeds)
179
- preds = torch.argmax(similarities, dim=2)
180
- preds_one_hot = torch.nn.functional.one_hot(preds,num_classes = num_speaker)
181
-
182
- actual = torch.arange(num_speaker).unsqueeze(1).repeat(1,num_utter)
183
- actual_one_hot = torch.nn.functional.one_hot(actual,num_classes=num_speaker)
184
-
185
- return torch.sum(preds_one_hot * actual_one_hot)/(num_speaker*num_utter)
186
-
187
-
188
-
189
-
190
-
191
-
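The deleted `similarity_matrix`, `softmax_loss` and `contrast_loss` follow the GE2E formulation the comments cite. Restated in the paper's notation, with $e_{j,i}$ the $i$-th utterance embedding of speaker $j$ and $w, b$ the learned `sim_weight` / `sim_bias`:

```latex
c_k = \frac{1}{M}\sum_{m=1}^{M} e_{k,m}, \qquad
c_j^{(-i)} = \frac{1}{M-1}\sum_{m \ne i} e_{j,m}

S_{j,i,k} =
\begin{cases}
w \cdot \cos\!\big(e_{j,i},\, c_j^{(-i)}\big) + b, & k = j\\[2pt]
w \cdot \cos\!\big(e_{j,i},\, c_k\big) + b, & k \ne j
\end{cases}

L_{\text{softmax}}(e_{j,i}) = -S_{j,i,j} + \log\sum_{k}\exp S_{j,i,k}, \qquad
L_{\text{contrast}}(e_{j,i}) = 1 - \sigma(S_{j,i,j}) + \max_{k \ne j}\sigma(S_{j,i,k})

L_G = \sum_{j,i} L(e_{j,i})
```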
speaker/preprocess.py DELETED
@@ -1 +0,0 @@
1
- # Reference https://github.com/CorentinJ/Real-Time-Voice-Cloning/blob/0713f860a3dd41afb56e83cff84dbdf589d5e11a/encoder/preprocess.py#L16
 
 
speaker/saved_model.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ccc0abcd0fb77104be73e6675454a06e7797bf1d4a1177181c32b648e9d75a9
3
- size 5697243
 
 
 
 
speaker/saved_model_e175.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:52ba80266b9f45fc3d825942aae40858eeaaa73994ba86e9ed017a533dc13323
3
- size 5861083
 
 
 
 
speaker/saved_models/dog.txt DELETED
File without changes
speaker/saved_models/saved_model_e175.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:52ba80266b9f45fc3d825942aae40858eeaaa73994ba86e9ed017a533dc13323
3
- size 5861083
 
 
 
 
speaker/saved_models/saved_model_e273_LargeBatch.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fbaaaa28a7d58b1316f322e1f33a5a68c00046b7b89a823ae7d987a632b8c7d6
3
- size 5861083
 
 
 
 
speaker/saved_models/saved_model_e300.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9be127fb61b6d2306ff877ab2184f187450953a5555a6751b3616b5ed84e78a
3
- size 5698805