help
#1
by projanshakya - opened
This view is limited to 50 files because it contains too many changes. See the raw diff here.
- app.py +31 -38
- distributed.py +173 -0
- encoder/audio.py +0 -117
- encoder/config.py +0 -45
- encoder/data_objects/__init__.py +0 -2
- encoder/data_objects/random_cycler.py +0 -37
- encoder/data_objects/speaker.py +0 -40
- encoder/data_objects/speaker_batch.py +0 -13
- encoder/data_objects/speaker_verification_dataset.py +0 -56
- encoder/data_objects/utterance.py +0 -26
- encoder/inference.py +0 -178
- encoder/model.py +0 -135
- encoder/params_data.py +0 -29
- encoder/params_model.py +0 -11
- encoder/preprocess.py +0 -184
- encoder/train.py +0 -125
- encoder/visualizations.py +0 -179
- encoderCoren.pt +0 -3
- {hifigan → hifi-gan}/LICENSE +0 -0
- hifi-gan/README.md +105 -0
- diagrams/apple.txt → hifi-gan/apple.py +0 -0
- {hifigan → hifi-gan}/env.py +0 -0
- {hifigan → hifi-gan}/inference.py +3 -3
- {hifigan → hifi-gan}/inference_e2e.py +51 -37
- {hifigan → hifi-gan}/meldataset.py +0 -0
- {hifigan → hifi-gan}/models.py +1 -1
- hifi-gan/requirements.txt +7 -0
- {hifigan → hifi-gan}/train.py +4 -4
- hifigan/hifigan_utils.py → hifi-gan/utils.py +0 -0
- hparams.py +0 -1
- kaggle_12000.pt +0 -3
- logger.py +48 -0
- logic.py +41 -79
- loss_function.py +19 -0
- loss_scaler.py +131 -0
- model.py +12 -16
- multiproc.py +23 -0
- requirements.txt +8 -7
- saved_model.pt +0 -3
- speaker/__init__.py +0 -0
- speaker/bana.txt +0 -0
- speaker/data.py +0 -109
- speaker/model.py +0 -191
- speaker/preprocess.py +0 -1
- speaker/saved_model.pt +0 -3
- speaker/saved_model_e175.pt +0 -3
- speaker/saved_models/dog.txt +0 -0
- speaker/saved_models/saved_model_e175.pt +0 -3
- speaker/saved_models/saved_model_e273_LargeBatch.pt +0 -3
- speaker/saved_models/saved_model_e300.pt +0 -3
app.py
CHANGED
@@ -3,70 +3,63 @@ from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from logic import synthesize_voice, plot_data, plot_waveforms
 import base64
-import
-import numpy as np
-from io import BytesIO
-from hifigan.inference_e2e import hifi_gan_inference
+from typing import Dict

 app = FastAPI()

-@app.get("/")
-def read_root():
-    data = {"Voice": "Cloning", "Status": "Success"}
-    return JSONResponse(content=data)
-
+# You need to replace the placeholders above with the actual URLs for the models.
+
+# Allow requests from your Vercel domain
+origins = [
+    "https://host-test-smoky.vercel.app",
+    # Add other allowed origins if needed
+]
+
+# Set up CORS middleware
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=
+    allow_origins=origins,
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )

-    font_type = json['font_select']
-    input_text = json['input_text']
+@app.post("/synthesize", response_model=Dict[str, str])
+async def synthesize(request_data: Dict[str, str]):
+    font_type = request_data['font_select']
+    input_text = request_data['input_text']

-    print("generating mel-spectrogram")
+    # Font selection logic (customize based on your requirements)
+    if font_type == 'Preeti':
+        # Implement Preeti font logic
+        pass
+    elif font_type == 'Unicode':
+        # Implement Unicode font logic
+        pass
+
     # Generate mel-spectrogram using Tacotron2
-    print("mel generation successful")
+    mel_output_data, mel_output_postnet_data, alignments_data = synthesize_voice(input_text, "Shruti_finetuned")

     # Convert mel-spectrogram to base64 for display in HTML
     mel_output_base64 = plot_data([mel_output_data, mel_output_postnet_data, alignments_data])

-    #
-    buffer = BytesIO()
-    np.save(buffer, mel_output_data)
-    input_mel = buffer.getvalue()
-
-    hifigan_checkpoint = "generator_v1"
-
-    # Generate audio using Hifigan
-    audio_data = hifi_gan_inference(input_mel, hifigan_checkpoint)
+    # Save the generated audio file
+    audio_file_path = 'audio_output/mel1_generated_e2e.wav'

-    print("Creating time-domain waveform")
     # Plot the waveform
-    wave_base64 = plot_waveforms(
+    wave_base64 = plot_waveforms(audio_file_path)

     # Encode audio content as Base64
+    with open(audio_file_path, 'rb') as audio_file:
+        audio_base64 = base64.b64encode(audio_file.read()).decode('utf-8')

     # Customize the response based on the information you want to send to the frontend
     response_data = {
         'mel_spectrogram': mel_output_base64,
         'audio_data': audio_base64,
         'waveform': wave_base64,
+        'some_other_data': 'example_value',
     }

-    return JSONResponse(content=response_data)
+    return JSONResponse(content=response_data)
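
A minimal client-side sketch for exercising the new POST /synthesize endpoint above. The base URL, font value, and input text are placeholders for illustration; only the request and response shape comes from the diff.

import base64
import requests

BASE_URL = "http://localhost:8000"  # placeholder; use the deployed Space URL

payload = {"font_select": "Unicode", "input_text": "namaste"}
resp = requests.post(f"{BASE_URL}/synthesize", json=payload, timeout=300)
resp.raise_for_status()

data = resp.json()
# 'audio_data' is base64-encoded audio; decode it for playback.
with open("output.wav", "wb") as f:
    f.write(base64.b64decode(data["audio_data"]))
print(data.keys())  # mel_spectrogram, audio_data, waveform, some_other_data
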
distributed.py
ADDED
@@ -0,0 +1,173 @@
+import torch
+import torch.distributed as dist
+from torch.nn.modules import Module
+from torch.autograd import Variable
+
+def _flatten_dense_tensors(tensors):
+    """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
+    same dense type.
+    Since inputs are dense, the resulting tensor will be a concatenated 1D
+    buffer. Element-wise operation on this buffer will be equivalent to
+    operating individually.
+    Arguments:
+        tensors (Iterable[Tensor]): dense tensors to flatten.
+    Returns:
+        A contiguous 1D buffer containing input tensors.
+    """
+    if len(tensors) == 1:
+        return tensors[0].contiguous().view(-1)
+    flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
+    return flat
+
+def _unflatten_dense_tensors(flat, tensors):
+    """View a flat buffer using the sizes of tensors. Assume that tensors are of
+    same dense type, and that flat is given by _flatten_dense_tensors.
+    Arguments:
+        flat (Tensor): flattened dense tensors to unflatten.
+        tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
+          unflatten flat.
+    Returns:
+        Unflattened dense tensors with sizes same as tensors and values from
+          flat.
+    """
+    outputs = []
+    offset = 0
+    for tensor in tensors:
+        numel = tensor.numel()
+        outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
+        offset += numel
+    return tuple(outputs)
+
+
+'''
+This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py
+launcher included with this example. It assumes that your run is using multiprocess with 1
+GPU/process, that the model is on the correct device, and that torch.set_device has been
+used to set the device.
+
+Parameters are broadcasted to the other processes on initialization of DistributedDataParallel,
+and will be allreduced at the finish of the backward pass.
+'''
+class DistributedDataParallel(Module):
+
+    def __init__(self, module):
+        super(DistributedDataParallel, self).__init__()
+        #fallback for PyTorch 0.3
+        if not hasattr(dist, '_backend'):
+            self.warn_on_half = True
+        else:
+            self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
+
+        self.module = module
+
+        for p in self.module.state_dict().values():
+            if not torch.is_tensor(p):
+                continue
+            dist.broadcast(p, 0)
+
+        def allreduce_params():
+            if(self.needs_reduction):
+                self.needs_reduction = False
+                buckets = {}
+                for param in self.module.parameters():
+                    if param.requires_grad and param.grad is not None:
+                        tp = type(param.data)
+                        if tp not in buckets:
+                            buckets[tp] = []
+                        buckets[tp].append(param)
+                if self.warn_on_half:
+                    if torch.cuda.HalfTensor in buckets:
+                        print("WARNING: gloo dist backend for half parameters may be extremely slow." +
+                              " It is recommended to use the NCCL backend in this case. This currently requires" +
+                              "PyTorch built from top of tree master.")
+                        self.warn_on_half = False
+
+                for tp in buckets:
+                    bucket = buckets[tp]
+                    grads = [param.grad.data for param in bucket]
+                    coalesced = _flatten_dense_tensors(grads)
+                    dist.all_reduce(coalesced)
+                    coalesced /= dist.get_world_size()
+                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
+                        buf.copy_(synced)
+
+        for param in list(self.module.parameters()):
+            def allreduce_hook(*unused):
+                param._execution_engine.queue_callback(allreduce_params)
+            if param.requires_grad:
+                param.register_hook(allreduce_hook)
+
+    def forward(self, *inputs, **kwargs):
+        self.needs_reduction = True
+        return self.module(*inputs, **kwargs)
+
+    '''
+    def _sync_buffers(self):
+        buffers = list(self.module._all_buffers())
+        if len(buffers) > 0:
+            # cross-node buffer sync
+            flat_buffers = _flatten_dense_tensors(buffers)
+            dist.broadcast(flat_buffers, 0)
+            for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
+                buf.copy_(synced)
+    def train(self, mode=True):
+        # Clear NCCL communicator and CUDA event cache of the default group ID,
+        # These cache will be recreated at the later call. This is currently a
+        # work-around for a potential NCCL deadlock.
+        if dist._backend == dist.dist_backend.NCCL:
+            dist._clear_group_cache()
+        super(DistributedDataParallel, self).train(mode)
+        self.module.train(mode)
+    '''
+'''
+Modifies existing model to do gradient allreduce, but doesn't change class
+so you don't need "module"
+'''
+def apply_gradient_allreduce(module):
+    if not hasattr(dist, '_backend'):
+        module.warn_on_half = True
+    else:
+        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
+
+    for p in module.state_dict().values():
+        if not torch.is_tensor(p):
+            continue
+        dist.broadcast(p, 0)
+
+    def allreduce_params():
+        if(module.needs_reduction):
+            module.needs_reduction = False
+            buckets = {}
+            for param in module.parameters():
+                if param.requires_grad and param.grad is not None:
+                    tp = param.data.dtype
+                    if tp not in buckets:
+                        buckets[tp] = []
+                    buckets[tp].append(param)
+            if module.warn_on_half:
+                if torch.cuda.HalfTensor in buckets:
+                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
+                          " It is recommended to use the NCCL backend in this case. This currently requires" +
+                          "PyTorch built from top of tree master.")
+                    module.warn_on_half = False
+
+            for tp in buckets:
+                bucket = buckets[tp]
+                grads = [param.grad.data for param in bucket]
+                coalesced = _flatten_dense_tensors(grads)
+                dist.all_reduce(coalesced)
+                coalesced /= dist.get_world_size()
+                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
+                    buf.copy_(synced)
+
+    for param in list(module.parameters()):
+        def allreduce_hook(*unused):
+            Variable._execution_engine.queue_callback(allreduce_params)
+        if param.requires_grad:
+            param.register_hook(allreduce_hook)
+
+    def set_needs_reduction(self, input, output):
+        self.needs_reduction = True
+
+    module.register_forward_hook(set_needs_reduction)
+    return module
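
A rough usage sketch for the apply_gradient_allreduce helper in the new distributed.py, assuming a one-GPU-per-process launch (for example via multiproc.py). The backend, environment variable, and toy model are illustrative assumptions, not taken from this diff.

import os
import torch
import torch.distributed as dist
from distributed import apply_gradient_allreduce

# Assumes the launcher has set up the rendezvous environment variables.
dist.init_process_group(backend="nccl", init_method="env://")
torch.cuda.set_device(int(os.environ.get("LOCAL_RANK", "0")))

model = torch.nn.Linear(80, 80).cuda()   # stand-in for the actual Tacotron2 model
model = apply_gradient_allreduce(model)  # broadcasts params, registers allreduce hooks

x = torch.randn(16, 80).cuda()
loss = model(x).pow(2).mean()
loss.backward()  # the queued callback averages gradients across all ranks here
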
encoder/audio.py
DELETED
@@ -1,117 +0,0 @@
-from scipy.ndimage.morphology import binary_dilation
-from encoder.params_data import *
-from pathlib import Path
-from typing import Optional, Union
-from warnings import warn
-import numpy as np
-import librosa
-import struct
-
-try:
-    import webrtcvad
-except:
-    warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
-    webrtcvad=None
-
-int16_max = (2 ** 15) - 1
-
-
-def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
-                   source_sr: Optional[int] = None,
-                   normalize: Optional[bool] = True,
-                   trim_silence: Optional[bool] = True):
-    """
-    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
-    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
-
-    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
-    just .wav), either the waveform as a numpy array of floats.
-    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
-    preprocessing. After preprocessing, the waveform's sampling rate will match the data
-    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
-    this argument will be ignored.
-    """
-    # Load the wav from disk if needed
-    if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
-        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
-    else:
-        wav = fpath_or_wav
-
-    # Resample the wav if needed
-    if source_sr is not None and source_sr != sampling_rate:
-        wav = librosa.resample(wav, source_sr, sampling_rate)
-
-    # Apply the preprocessing: normalize volume and shorten long silences
-    if normalize:
-        wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
-    if webrtcvad and trim_silence:
-        wav = trim_long_silences(wav)
-
-    return wav
-
-
-def wav_to_mel_spectrogram(wav):
-    """
-    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
-    Note: this not a log-mel spectrogram.
-    """
-    frames = librosa.feature.melspectrogram(
-        wav,
-        sampling_rate,
-        n_fft=int(sampling_rate * mel_window_length / 1000),
-        hop_length=int(sampling_rate * mel_window_step / 1000),
-        n_mels=mel_n_channels
-    )
-    return frames.astype(np.float32).T
-
-
-def trim_long_silences(wav):
-    """
-    Ensures that segments without voice in the waveform remain no longer than a
-    threshold determined by the VAD parameters in params.py.
-
-    :param wav: the raw waveform as a numpy array of floats
-    :return: the same waveform with silences trimmed away (length <= original wav length)
-    """
-    # Compute the voice detection window size
-    samples_per_window = (vad_window_length * sampling_rate) // 1000
-
-    # Trim the end of the audio to have a multiple of the window size
-    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
-
-    # Convert the float waveform to 16-bit mono PCM
-    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
-
-    # Perform voice activation detection
-    voice_flags = []
-    vad = webrtcvad.Vad(mode=3)
-    for window_start in range(0, len(wav), samples_per_window):
-        window_end = window_start + samples_per_window
-        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
-                                         sample_rate=sampling_rate))
-    voice_flags = np.array(voice_flags)
-
-    # Smooth the voice detection with a moving average
-    def moving_average(array, width):
-        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
-        ret = np.cumsum(array_padded, dtype=float)
-        ret[width:] = ret[width:] - ret[:-width]
-        return ret[width - 1:] / width
-
-    audio_mask = moving_average(voice_flags, vad_moving_average_width)
-    audio_mask = np.round(audio_mask).astype(np.bool)
-
-    # Dilate the voiced regions
-    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
-    audio_mask = np.repeat(audio_mask, samples_per_window)
-
-    return wav[audio_mask == True]
-
-
-def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
-    if increase_only and decrease_only:
-        raise ValueError("Both increase only and decrease only are set")
-    dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
-    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
-        return wav
-    return wav * (10 ** (dBFS_change / 20))
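
For reference, the deleted encoder/audio.py was driven roughly like this (the sample path is hypothetical); after this change the module is no longer available in the Space.

from encoder import audio

# Load, resample to the encoder's rate, normalize volume, and trim long silences.
wav = audio.preprocess_wav("samples/utterance.wav")
# Mel spectrogram shaped (n_frames, mel_n_channels), float32, not log-scaled.
mel = audio.wav_to_mel_spectrogram(wav)
print(mel.shape)
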
encoder/config.py
DELETED
@@ -1,45 +0,0 @@
-librispeech_datasets = {
-    "train": {
-        "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
-        "other": ["LibriSpeech/train-other-500"]
-    },
-    "test": {
-        "clean": ["LibriSpeech/test-clean"],
-        "other": ["LibriSpeech/test-other"]
-    },
-    "dev": {
-        "clean": ["LibriSpeech/dev-clean"],
-        "other": ["LibriSpeech/dev-other"]
-    },
-}
-libritts_datasets = {
-    "train": {
-        "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
-        "other": ["LibriTTS/train-other-500"]
-    },
-    "test": {
-        "clean": ["LibriTTS/test-clean"],
-        "other": ["LibriTTS/test-other"]
-    },
-    "dev": {
-        "clean": ["LibriTTS/dev-clean"],
-        "other": ["LibriTTS/dev-other"]
-    },
-}
-voxceleb_datasets = {
-    "voxceleb1" : {
-        "train": ["VoxCeleb1/wav"],
-        "test": ["VoxCeleb1/test_wav"]
-    },
-    "voxceleb2" : {
-        "train": ["VoxCeleb2/dev/aac"],
-        "test": ["VoxCeleb2/test_wav"]
-    }
-}
-
-other_datasets = [
-    "LJSpeech-1.1",
-    "VCTK-Corpus/wav48",
-]
-
-anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
encoder/data_objects/__init__.py
DELETED
@@ -1,2 +0,0 @@
-from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
-from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
encoder/data_objects/random_cycler.py
DELETED
@@ -1,37 +0,0 @@
-import random
-
-class RandomCycler:
-    """
-    Creates an internal copy of a sequence and allows access to its items in a constrained random
-    order. For a source sequence of n items and one or several consecutive queries of a total
-    of m items, the following guarantees hold (one implies the other):
-        - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
-        - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
-    """
-
-    def __init__(self, source):
-        if len(source) == 0:
-            raise Exception("Can't create RandomCycler from an empty collection")
-        self.all_items = list(source)
-        self.next_items = []
-
-    def sample(self, count: int):
-        shuffle = lambda l: random.sample(l, len(l))
-
-        out = []
-        while count > 0:
-            if count >= len(self.all_items):
-                out.extend(shuffle(list(self.all_items)))
-                count -= len(self.all_items)
-                continue
-            n = min(count, len(self.next_items))
-            out.extend(self.next_items[:n])
-            count -= n
-            self.next_items = self.next_items[n:]
-            if len(self.next_items) == 0:
-                self.next_items = shuffle(list(self.all_items))
-        return out
-
-    def __next__(self):
-        return self.sample(1)[0]
-
encoder/data_objects/speaker.py
DELETED
@@ -1,40 +0,0 @@
-from encoder.data_objects.random_cycler import RandomCycler
-from encoder.data_objects.utterance import Utterance
-from pathlib import Path
-
-# Contains the set of utterances of a single speaker
-class Speaker:
-    def __init__(self, root: Path):
-        self.root = root
-        self.name = root.name
-        self.utterances = None
-        self.utterance_cycler = None
-
-    def _load_utterances(self):
-        with self.root.joinpath("_sources.txt").open("r") as sources_file:
-            sources = [l.split(",") for l in sources_file]
-        sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
-        self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
-        self.utterance_cycler = RandomCycler(self.utterances)
-
-    def random_partial(self, count, n_frames):
-        """
-        Samples a batch of <count> unique partial utterances from the disk in a way that all
-        utterances come up at least once every two cycles and in a random order every time.
-
-        :param count: The number of partial utterances to sample from the set of utterances from
-        that speaker. Utterances are guaranteed not to be repeated if <count> is not larger than
-        the number of utterances available.
-        :param n_frames: The number of frames in the partial utterance.
-        :return: A list of tuples (utterance, frames, range) where utterance is an Utterance,
-        frames are the frames of the partial utterances and range is the range of the partial
-        utterance with regard to the complete utterance.
-        """
-        if self.utterances is None:
-            self._load_utterances()
-
-        utterances = self.utterance_cycler.sample(count)
-
-        a = [(u,) + u.random_partial(n_frames) for u in utterances]
-
-        return a
encoder/data_objects/speaker_batch.py
DELETED
@@ -1,13 +0,0 @@
-import numpy as np
-from typing import List
-from encoder.data_objects.speaker import Speaker
-
-
-class SpeakerBatch:
-    def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
-        self.speakers = speakers
-        self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}
-
-        # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
-        # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
-        self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])
encoder/data_objects/speaker_verification_dataset.py
DELETED
@@ -1,56 +0,0 @@
-from encoder.data_objects.random_cycler import RandomCycler
-from encoder.data_objects.speaker_batch import SpeakerBatch
-from encoder.data_objects.speaker import Speaker
-from encoder.params_data import partials_n_frames
-from torch.utils.data import Dataset, DataLoader
-from pathlib import Path
-
-# TODO: improve with a pool of speakers for data efficiency
-
-class SpeakerVerificationDataset(Dataset):
-    def __init__(self, datasets_root: Path):
-        self.root = datasets_root
-        speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
-        if len(speaker_dirs) == 0:
-            raise Exception("No speakers found. Make sure you are pointing to the directory "
-                            "containing all preprocessed speaker directories.")
-        self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
-        self.speaker_cycler = RandomCycler(self.speakers)
-
-    def __len__(self):
-        return int(1e10)
-
-    def __getitem__(self, index):
-        return next(self.speaker_cycler)
-
-    def get_logs(self):
-        log_string = ""
-        for log_fpath in self.root.glob("*.txt"):
-            with log_fpath.open("r") as log_file:
-                log_string += "".join(log_file.readlines())
-        return log_string
-
-
-class SpeakerVerificationDataLoader(DataLoader):
-    def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
-                 batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
-                 worker_init_fn=None):
-        self.utterances_per_speaker = utterances_per_speaker
-
-        super().__init__(
-            dataset=dataset,
-            batch_size=speakers_per_batch,
-            shuffle=False,
-            sampler=sampler,
-            batch_sampler=batch_sampler,
-            num_workers=num_workers,
-            collate_fn=self.collate,
-            pin_memory=pin_memory,
-            drop_last=False,
-            timeout=timeout,
-            worker_init_fn=worker_init_fn
-        )
-
-    def collate(self, speakers):
-        return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
-
encoder/data_objects/utterance.py
DELETED
@@ -1,26 +0,0 @@
-import numpy as np
-
-
-class Utterance:
-    def __init__(self, frames_fpath, wave_fpath):
-        self.frames_fpath = frames_fpath
-        self.wave_fpath = wave_fpath
-
-    def get_frames(self):
-        return np.load(self.frames_fpath)
-
-    def random_partial(self, n_frames):
-        """
-        Crops the frames into a partial utterance of n_frames
-
-        :param n_frames: The number of frames of the partial utterance
-        :return: the partial utterance frames and a tuple indicating the start and end of the
-        partial utterance in the complete utterance.
-        """
-        frames = self.get_frames()
-        if frames.shape[0] == n_frames:
-            start = 0
-        else:
-            start = np.random.randint(0, frames.shape[0] - n_frames)
-        end = start + n_frames
-        return frames[start:end], (start, end)
encoder/inference.py
DELETED
@@ -1,178 +0,0 @@
-from encoder.params_data import *
-from encoder.model import SpeakerEncoder
-from encoder.audio import preprocess_wav   # We want to expose this function from here
-from matplotlib import cm
-from encoder import audio
-from pathlib import Path
-import numpy as np
-import torch
-
-_model = None # type: SpeakerEncoder
-_device = None # type: torch.device
-
-
-def load_model(weights_fpath: Path, device=None):
-    """
-    Loads the model in memory. If this function is not explicitely called, it will be run on the
-    first call to embed_frames() with the default weights file.
-
-    :param weights_fpath: the path to saved model weights.
-    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
-    model will be loaded and will run on this device. Outputs will however always be on the cpu.
-    If None, will default to your GPU if it"s available, otherwise your CPU.
-    """
-    # TODO: I think the slow loading of the encoder might have something to do with the device it
-    #   was saved on. Worth investigating.
-    global _model, _device
-    if device is None:
-        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    elif isinstance(device, str):
-        _device = torch.device(device)
-    _model = SpeakerEncoder(_device, torch.device("cpu"))
-    checkpoint = torch.load(weights_fpath, _device)
-    _model.load_state_dict(checkpoint["model_state"])
-    _model.eval()
-    print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
-
-
-def is_loaded():
-    return _model is not None
-
-
-def embed_frames_batch(frames_batch):
-    """
-    Computes embeddings for a batch of mel spectrogram.
-
-    :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape
-    (batch_size, n_frames, n_channels)
-    :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
-    """
-    if _model is None:
-        raise Exception("Model was not loaded. Call load_model() before inference.")
-
-    frames = torch.from_numpy(frames_batch).to(_device)
-    embed = _model.forward(frames).detach().cpu().numpy()
-    return embed
-
-
-def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
-                           min_pad_coverage=0.75, overlap=0.5):
-    """
-    Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
-    partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
-    spectrogram slices are returned, so as to make each partial utterance waveform correspond to
-    its spectrogram. This function assumes that the mel spectrogram parameters used are those
-    defined in params_data.py.
-
-    The returned ranges may be indexing further than the length of the waveform. It is
-    recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
-
-    :param n_samples: the number of samples in the waveform
-    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
-    utterance
-    :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
-    enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
-    then the last partial utterance will be considered, as if we padded the audio. Otherwise,
-    it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
-    utterance, this parameter is ignored so that the function always returns at least 1 slice.
-    :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
-    utterances are entirely disjoint.
-    :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
-    respectively the waveform and the mel spectrogram with these slices to obtain the partial
-    utterances.
-    """
-    assert 0 <= overlap < 1
-    assert 0 < min_pad_coverage <= 1
-
-    samples_per_frame = int((sampling_rate * mel_window_step / 1000))
-    n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
-    frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
-
-    # Compute the slices
-    wav_slices, mel_slices = [], []
-    steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
-    for i in range(0, steps, frame_step):
-        mel_range = np.array([i, i + partial_utterance_n_frames])
-        wav_range = mel_range * samples_per_frame
-        mel_slices.append(slice(*mel_range))
-        wav_slices.append(slice(*wav_range))
-
-    # Evaluate whether extra padding is warranted or not
-    last_wav_range = wav_slices[-1]
-    coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
-    if coverage < min_pad_coverage and len(mel_slices) > 1:
-        mel_slices = mel_slices[:-1]
-        wav_slices = wav_slices[:-1]
-
-    return wav_slices, mel_slices
-
-
-def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
-    """
-    Computes an embedding for a single utterance.
-
-    # TODO: handle multiple wavs to benefit from batching on GPU
-    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
-    :param using_partials: if True, then the utterance is split in partial utterances of
-    <partial_utterance_n_frames> frames and the utterance embedding is computed from their
-    normalized average. If False, the utterance is instead computed from feeding the entire
-    spectogram to the network.
-    :param return_partials: if True, the partial embeddings will also be returned along with the
-    wav slices that correspond to the partial embeddings.
-    :param kwargs: additional arguments to compute_partial_splits()
-    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
-    <return_partials> is True, the partial utterances as a numpy array of float32 of shape
-    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
-    returned. If <using_partials> is simultaneously set to False, both these values will be None
-    instead.
-    """
-    # Process the entire utterance if not using partials
-    if not using_partials:
-        frames = audio.wav_to_mel_spectrogram(wav)
-        embed = embed_frames_batch(frames[None, ...])[0]
-        if return_partials:
-            return embed, None, None
-        return embed
-
-    # Compute where to split the utterance into partials and pad if necessary
-    wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
-    max_wave_length = wave_slices[-1].stop
-    if max_wave_length >= len(wav):
-        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
-
-    # Split the utterance into partials
-    frames = audio.wav_to_mel_spectrogram(wav)
-    frames_batch = np.array([frames[s] for s in mel_slices])
-    partial_embeds = embed_frames_batch(frames_batch)
-
-    # Compute the utterance embedding from the partial embeddings
-    raw_embed = np.mean(partial_embeds, axis=0)
-    embed = raw_embed / np.linalg.norm(raw_embed, 2)
-
-    if return_partials:
-        return embed, partial_embeds, wave_slices
-    return embed
-
-
-def embed_speaker(wavs, **kwargs):
-    raise NotImplemented()
-
-
-def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
-    import matplotlib.pyplot as plt
-    if ax is None:
-        ax = plt.gca()
-
-    if shape is None:
-        height = int(np.sqrt(len(embed)))
-        shape = (height, -1)
-    embed = embed.reshape(shape)
-
-    cmap = cm.get_cmap()
-    mappable = ax.imshow(embed, cmap=cmap)
-    cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
-    sm = cm.ScalarMappable(cmap=cmap)
-    sm.set_clim(*color_range)
-
-    ax.set_xticks([]), ax.set_yticks([])
-    ax.set_title(title)
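
For reference, the deleted encoder/inference.py exposed a small functional API that was typically driven as below; the weights and sample paths are hypothetical.

from pathlib import Path
from encoder import inference as encoder

encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
wav = encoder.preprocess_wav("samples/utterance.wav")
embed = encoder.embed_utterance(wav)  # L2-normalized, shape (model_embedding_size,)
print(embed.shape)
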
encoder/model.py
DELETED
@@ -1,135 +0,0 @@
-from encoder.params_model import *
-from encoder.params_data import *
-from scipy.interpolate import interp1d
-from sklearn.metrics import roc_curve
-from torch.nn.utils import clip_grad_norm_
-from scipy.optimize import brentq
-from torch import nn
-import numpy as np
-import torch
-
-
-class SpeakerEncoder(nn.Module):
-    def __init__(self, device, loss_device):
-        super().__init__()
-        self.loss_device = loss_device
-
-        # Network defition
-        self.lstm = nn.LSTM(input_size=mel_n_channels,
-                            hidden_size=model_hidden_size,
-                            num_layers=model_num_layers,
-                            batch_first=True).to(device)
-        self.linear = nn.Linear(in_features=model_hidden_size,
-                                out_features=model_embedding_size).to(device)
-        self.relu = torch.nn.ReLU().to(device)
-
-        # Cosine similarity scaling (with fixed initial parameter values)
-        self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
-        self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
-
-        # Loss
-        self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
-
-    def do_gradient_ops(self):
-        # Gradient scale
-        self.similarity_weight.grad *= 0.01
-        self.similarity_bias.grad *= 0.01
-
-        # Gradient clipping
-        clip_grad_norm_(self.parameters(), 3, norm_type=2)
-
-    def forward(self, utterances, hidden_init=None):
-        """
-        Computes the embeddings of a batch of utterance spectrograms.
-
-        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
-        (batch_size, n_frames, n_channels)
-        :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
-        batch_size, hidden_size). Will default to a tensor of zeros if None.
-        :return: the embeddings as a tensor of shape (batch_size, embedding_size)
-        """
-        # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
-        # and the final cell state.
-        out, (hidden, cell) = self.lstm(utterances, hidden_init)
-
-        # We take only the hidden state of the last layer
-        embeds_raw = self.relu(self.linear(hidden[-1]))
-
-        # L2-normalize it
-        embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)
-
-        return embeds
-
-    def similarity_matrix(self, embeds):
-        """
-        Computes the similarity matrix according the section 2.1 of GE2E.
-
-        :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
-        utterances_per_speaker, embedding_size)
-        :return: the similarity matrix as a tensor of shape (speakers_per_batch,
-        utterances_per_speaker, speakers_per_batch)
-        """
-        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
-
-        # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
-        centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
-        centroids_incl = centroids_incl.clone() / (torch.norm(centroids_incl, dim=2, keepdim=True) + 1e-5)
-
-        # Exclusive centroids (1 per utterance)
-        centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
-        centroids_excl /= (utterances_per_speaker - 1)
-        centroids_excl = centroids_excl.clone() / (torch.norm(centroids_excl, dim=2, keepdim=True) + 1e-5)
-
-        # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
-        # product of these vectors (which is just an element-wise multiplication reduced by a sum).
-        # We vectorize the computation for efficiency.
-        sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
-                                 speakers_per_batch).to(self.loss_device)
-        mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int)
-        for j in range(speakers_per_batch):
-            mask = np.where(mask_matrix[j])[0]
-            sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
-            sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
-
-        ## Even more vectorized version (slower maybe because of transpose)
-        # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
-        #                           ).to(self.loss_device)
-        # eye = np.eye(speakers_per_batch, dtype=np.int)
-        # mask = np.where(1 - eye)
-        # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
-        # mask = np.where(eye)
-        # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
-        # sim_matrix2 = sim_matrix2.transpose(1, 2)
-
-        sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
-        return sim_matrix
-
-    def loss(self, embeds):
-        """
-        Computes the softmax loss according the section 2.1 of GE2E.
-
-        :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
-        utterances_per_speaker, embedding_size)
-        :return: the loss and the EER for this batch of embeddings.
-        """
-        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
-
-        # Loss
-        sim_matrix = self.similarity_matrix(embeds)
-        sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
-                                         speakers_per_batch))
-        ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
-        target = torch.from_numpy(ground_truth).long().to(self.loss_device)
-        loss = self.loss_fn(sim_matrix, target)
-
-        # EER (not backpropagated)
-        with torch.no_grad():
-            inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0]
-            labels = np.array([inv_argmax(i) for i in ground_truth])
-            preds = sim_matrix.detach().cpu().numpy()
-
-            # Snippet from https://yangcha.github.io/EER-ROC/
-            fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
-            eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
-
-        return loss, eer
encoder/params_data.py
DELETED
@@ -1,29 +0,0 @@
-
-## Mel-filterbank
-mel_window_length = 25  # In milliseconds
-mel_window_step = 10    # In milliseconds
-mel_n_channels = 40
-
-
-## Audio
-sampling_rate = 16000
-# Number of spectrogram frames in a partial utterance
-partials_n_frames = 160     # 1600 ms
-# Number of spectrogram frames at inference
-inference_n_frames = 80     # 800 ms
-
-
-## Voice Activation Detection
-# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
-# This sets the granularity of the VAD. Should not need to be changed.
-vad_window_length = 30  # In milliseconds
-# Number of frames to average together when performing the moving average smoothing.
-# The larger this value, the larger the VAD variations must be to not get smoothed out.
-vad_moving_average_width = 8
-# Maximum number of consecutive silent frames a segment can have.
-vad_max_silence_length = 6
-
-
-## Audio volume normalization
-audio_norm_target_dBFS = -30
-
encoder/params_model.py
DELETED
@@ -1,11 +0,0 @@
-
-## Model parameters
-model_hidden_size = 256
-model_embedding_size = 256
-model_num_layers = 3
-
-
-## Training parameters
-learning_rate_init = 1e-4
-speakers_per_batch = 64
-utterances_per_speaker = 10
encoder/preprocess.py
DELETED
@@ -1,184 +0,0 @@
-from datetime import datetime
-from functools import partial
-from multiprocessing import Pool
-from pathlib import Path
-
-import numpy as np
-from tqdm import tqdm
-
-from encoder import audio
-from encoder.config import librispeech_datasets, anglophone_nationalites
-from encoder.params_data import *
-
-
-_AUDIO_EXTENSIONS = ("wav", "flac", "m4a", "mp3")
-
-class DatasetLog:
-    """
-    Registers metadata about the dataset in a text file.
-    """
-    def __init__(self, root, name):
-        self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
-        self.sample_data = dict()
-
-        start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
-        self.write_line("Creating dataset %s on %s" % (name, start_time))
-        self.write_line("-----")
-        self._log_params()
-
-    def _log_params(self):
-        from encoder import params_data
-        self.write_line("Parameter values:")
-        for param_name in (p for p in dir(params_data) if not p.startswith("__")):
-            value = getattr(params_data, param_name)
-            self.write_line("\t%s: %s" % (param_name, value))
-        self.write_line("-----")
-
-    def write_line(self, line):
-        self.text_file.write("%s\n" % line)
-
-    def add_sample(self, **kwargs):
-        for param_name, value in kwargs.items():
-            if not param_name in self.sample_data:
-                self.sample_data[param_name] = []
-            self.sample_data[param_name].append(value)
-
-    def finalize(self):
-        self.write_line("Statistics:")
-        for param_name, values in self.sample_data.items():
-            self.write_line("\t%s:" % param_name)
-            self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
-            self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
-        self.write_line("-----")
-        end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
-        self.write_line("Finished on %s" % end_time)
-        self.text_file.close()
-
-
-def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
-    dataset_root = datasets_root.joinpath(dataset_name)
-    if not dataset_root.exists():
-        print("Couldn\'t find %s, skipping this dataset." % dataset_root)
-        return None, None
-    return dataset_root, DatasetLog(out_dir, dataset_name)
-
-
-def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, skip_existing: bool):
-    # Give a name to the speaker that includes its dataset
-    speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
-
-    # Create an output directory with that name, as well as a txt file containing a
-    # reference to each source file.
-    speaker_out_dir = out_dir.joinpath(speaker_name)
-    speaker_out_dir.mkdir(exist_ok=True)
-    sources_fpath = speaker_out_dir.joinpath("_sources.txt")
-
-    # There's a possibility that the preprocessing was interrupted earlier, check if
-    # there already is a sources file.
-    if sources_fpath.exists():
-        try:
-            with sources_fpath.open("r") as sources_file:
-                existing_fnames = {line.split(",")[0] for line in sources_file}
-        except:
-            existing_fnames = {}
-    else:
-        existing_fnames = {}
-
-    # Gather all audio files for that speaker recursively
-    sources_file = sources_fpath.open("a" if skip_existing else "w")
-    audio_durs = []
-    for extension in _AUDIO_EXTENSIONS:
-        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
-            # Check if the target output file already exists
-            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
-            out_fname = out_fname.replace(".%s" % extension, ".npy")
-            if skip_existing and out_fname in existing_fnames:
-                continue
-
-            # Load and preprocess the waveform
-            wav = audio.preprocess_wav(in_fpath)
-            if len(wav) == 0:
-                continue
-
-            # Create the mel spectrogram, discard those that are too short
-            frames = audio.wav_to_mel_spectrogram(wav)
-            if len(frames) < partials_n_frames:
-                continue
-
-            out_fpath = speaker_out_dir.joinpath(out_fname)
-            np.save(out_fpath, frames)
-            sources_file.write("%s,%s\n" % (out_fname, in_fpath))
-            audio_durs.append(len(wav) / sampling_rate)
-
-    sources_file.close()
-
-    return audio_durs
-
-
-def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger):
-    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
-
-    # Process the utterances for each speaker
-    work_fn = partial(_preprocess_speaker, datasets_root=datasets_root, out_dir=out_dir, skip_existing=skip_existing)
-    with Pool(4) as pool:
-        tasks = pool.imap(work_fn, speaker_dirs)
-        for sample_durs in tqdm(tasks, dataset_name, len(speaker_dirs), unit="speakers"):
-            for sample_dur in sample_durs:
-                logger.add_sample(duration=sample_dur)
-
-    logger.finalize()
-    print("Done preprocessing %s.\n" % dataset_name)
-
-
-def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
-    for dataset_name in librispeech_datasets["train"]["other"]:
-        # Initialize the preprocessing
-        dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
-        if not dataset_root:
-            return
-
-        # Preprocess all speakers
-        speaker_dirs = list(dataset_root.glob("*"))
-        _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)
-
-
-def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
-    # Initialize the preprocessing
-    dataset_name = "VoxCeleb1"
-    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
-    if not dataset_root:
-        return
-
-    # Get the contents of the meta file
-    with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
-        metadata = [line.split("\t") for line in metafile][1:]
-
-    # Select the ID and the nationality, filter out non-anglophone speakers
-    nationalities = {line[0]: line[3] for line in metadata}
-    keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if
-                        nationality.lower() in anglophone_nationalites]
-    print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
-          (len(keep_speaker_ids), len(nationalities)))
-
-    # Get the speaker directories for anglophone speakers only
-    speaker_dirs = dataset_root.joinpath("wav").glob("*")
-    speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
-                    speaker_dir.name in keep_speaker_ids]
-    print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
-          (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))
-
-    # Preprocess all speakers
-    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)
-
-
-def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
-    # Initialize the preprocessing
-    dataset_name = "VoxCeleb2"
-    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
-    if not dataset_root:
-        return
-
-    # Get the speaker directories
-    # Preprocess all speakers
-    speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
-    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger)
encoder/train.py
DELETED
@@ -1,125 +0,0 @@
|
|
1 |
-
from pathlib import Path
|
2 |
-
|
3 |
-
import torch
|
4 |
-
|
5 |
-
from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
|
6 |
-
from encoder.model import SpeakerEncoder
|
7 |
-
from encoder.params_model import *
|
8 |
-
from encoder.visualizations import Visualizations
|
9 |
-
from utils.profiler import Profiler
|
10 |
-
|
11 |
-
|
12 |
-
def sync(device: torch.device):
|
13 |
-
# For correct profiling (cuda operations are async)
|
14 |
-
if device.type == "cuda":
|
15 |
-
torch.cuda.synchronize(device)
|
16 |
-
|
17 |
-
|
18 |
-
def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
|
19 |
-
backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
|
20 |
-
no_visdom: bool):
|
21 |
-
# Create a dataset and a dataloader
|
22 |
-
dataset = SpeakerVerificationDataset(clean_data_root)
|
23 |
-
loader = SpeakerVerificationDataLoader(
|
24 |
-
dataset,
|
25 |
-
speakers_per_batch,
|
26 |
-
utterances_per_speaker,
|
27 |
-
num_workers=4,
|
28 |
-
)
|
29 |
-
|
30 |
-
# Setup the device on which to run the forward pass and the loss. These can be different,
|
31 |
-
# because the forward pass is faster on the GPU whereas the loss is often (depending on your
|
32 |
-
# hyperparameters) faster on the CPU.
|
33 |
-
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
34 |
-
# FIXME: currently, the gradient is None if loss_device is cuda
|
35 |
-
loss_device = torch.device("cpu")
|
36 |
-
|
37 |
-
# Create the model and the optimizer
|
38 |
-
model = SpeakerEncoder(device, loss_device)
|
39 |
-
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
|
40 |
-
init_step = 1
|
41 |
-
|
42 |
-
# Configure file path for the model
|
43 |
-
model_dir = models_dir / run_id
|
44 |
-
model_dir.mkdir(exist_ok=True, parents=True)
|
45 |
-
state_fpath = model_dir / "encoder.pt"
|
46 |
-
|
47 |
-
# Load any existing model
|
48 |
-
if not force_restart:
|
49 |
-
if state_fpath.exists():
|
50 |
-
print("Found existing model \"%s\", loading it and resuming training." % run_id)
|
51 |
-
checkpoint = torch.load(state_fpath)
|
52 |
-
init_step = checkpoint["step"]
|
53 |
-
model.load_state_dict(checkpoint["model_state"])
|
54 |
-
optimizer.load_state_dict(checkpoint["optimizer_state"])
|
55 |
-
optimizer.param_groups[0]["lr"] = learning_rate_init
|
56 |
-
else:
|
57 |
-
print("No model \"%s\" found, starting training from scratch." % run_id)
|
58 |
-
else:
|
59 |
-
print("Starting the training from scratch.")
|
60 |
-
model.train()
|
61 |
-
|
62 |
-
# Initialize the visualization environment
|
63 |
-
vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
|
64 |
-
vis.log_dataset(dataset)
|
65 |
-
vis.log_params()
|
66 |
-
device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
|
67 |
-
vis.log_implementation({"Device": device_name})
|
68 |
-
|
69 |
-
# Training loop
|
70 |
-
profiler = Profiler(summarize_every=10, disabled=False)
|
71 |
-
for step, speaker_batch in enumerate(loader, init_step):
|
72 |
-
profiler.tick("Blocking, waiting for batch (threaded)")
|
73 |
-
|
74 |
-
# Forward pass
|
75 |
-
inputs = torch.from_numpy(speaker_batch.data).to(device)
|
76 |
-
sync(device)
|
77 |
-
profiler.tick("Data to %s" % device)
|
78 |
-
embeds = model(inputs)
|
79 |
-
sync(device)
|
80 |
-
profiler.tick("Forward pass")
|
81 |
-
embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
|
82 |
-
loss, eer = model.loss(embeds_loss)
|
83 |
-
sync(loss_device)
|
84 |
-
profiler.tick("Loss")
|
85 |
-
|
86 |
-
# Backward pass
|
87 |
-
model.zero_grad()
|
88 |
-
loss.backward()
|
89 |
-
profiler.tick("Backward pass")
|
90 |
-
model.do_gradient_ops()
|
91 |
-
optimizer.step()
|
92 |
-
profiler.tick("Parameter update")
|
93 |
-
|
94 |
-
# Update visualizations
|
95 |
-
# learning_rate = optimizer.param_groups[0]["lr"]
|
96 |
-
vis.update(loss.item(), eer, step)
|
97 |
-
|
98 |
-
# Draw projections and save them to the backup folder
|
99 |
-
if umap_every != 0 and step % umap_every == 0:
|
100 |
-
print("Drawing and saving projections (step %d)" % step)
|
101 |
-
projection_fpath = model_dir / f"umap_{step:06d}.png"
|
102 |
-
embeds = embeds.detach().cpu().numpy()
|
103 |
-
vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
|
104 |
-
vis.save()
|
105 |
-
|
106 |
-
# Overwrite the latest version of the model
|
107 |
-
if save_every != 0 and step % save_every == 0:
|
108 |
-
print("Saving the model (step %d)" % step)
|
109 |
-
torch.save({
|
110 |
-
"step": step + 1,
|
111 |
-
"model_state": model.state_dict(),
|
112 |
-
"optimizer_state": optimizer.state_dict(),
|
113 |
-
}, state_fpath)
|
114 |
-
|
115 |
-
# Make a backup
|
116 |
-
if backup_every != 0 and step % backup_every == 0:
|
117 |
-
print("Making a backup (step %d)" % step)
|
118 |
-
backup_fpath = model_dir / f"encoder_{step:06d}.bak"
|
119 |
-
torch.save({
|
120 |
-
"step": step + 1,
|
121 |
-
"model_state": model.state_dict(),
|
122 |
-
"optimizer_state": optimizer.state_dict(),
|
123 |
-
}, backup_fpath)
|
124 |
-
|
125 |
-
profiler.tick("Extras (visualizations, saving)")
|
encoder/visualizations.py
DELETED
@@ -1,179 +0,0 @@
|
|
1 |
-
from datetime import datetime
|
2 |
-
from time import perf_counter as timer
|
3 |
-
|
4 |
-
import numpy as np
|
5 |
-
import umap
|
6 |
-
import visdom
|
7 |
-
|
8 |
-
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
|
9 |
-
|
10 |
-
|
11 |
-
colormap = np.array([
|
12 |
-
[76, 255, 0],
|
13 |
-
[0, 127, 70],
|
14 |
-
[255, 0, 0],
|
15 |
-
[255, 217, 38],
|
16 |
-
[0, 135, 255],
|
17 |
-
[165, 0, 165],
|
18 |
-
[255, 167, 255],
|
19 |
-
[0, 255, 255],
|
20 |
-
[255, 96, 38],
|
21 |
-
[142, 76, 0],
|
22 |
-
[33, 0, 127],
|
23 |
-
[0, 0, 0],
|
24 |
-
[183, 183, 183],
|
25 |
-
], dtype=np.float) / 255
|
26 |
-
|
27 |
-
|
28 |
-
class Visualizations:
|
29 |
-
def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False):
|
30 |
-
# Tracking data
|
31 |
-
self.last_update_timestamp = timer()
|
32 |
-
self.update_every = update_every
|
33 |
-
self.step_times = []
|
34 |
-
self.losses = []
|
35 |
-
self.eers = []
|
36 |
-
print("Updating the visualizations every %d steps." % update_every)
|
37 |
-
|
38 |
-
# If visdom is disabled TODO: use a better paradigm for that
|
39 |
-
self.disabled = disabled
|
40 |
-
if self.disabled:
|
41 |
-
return
|
42 |
-
|
43 |
-
# Set the environment name
|
44 |
-
now = str(datetime.now().strftime("%d-%m %Hh%M"))
|
45 |
-
if env_name is None:
|
46 |
-
self.env_name = now
|
47 |
-
else:
|
48 |
-
self.env_name = "%s (%s)" % (env_name, now)
|
49 |
-
|
50 |
-
# Connect to visdom and open the corresponding window in the browser
|
51 |
-
try:
|
52 |
-
self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
|
53 |
-
except ConnectionError:
|
54 |
-
raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
|
55 |
-
"start it.")
|
56 |
-
# webbrowser.open("http://localhost:8097/env/" + self.env_name)
|
57 |
-
|
58 |
-
# Create the windows
|
59 |
-
self.loss_win = None
|
60 |
-
self.eer_win = None
|
61 |
-
# self.lr_win = None
|
62 |
-
self.implementation_win = None
|
63 |
-
self.projection_win = None
|
64 |
-
self.implementation_string = ""
|
65 |
-
|
66 |
-
def log_params(self):
|
67 |
-
if self.disabled:
|
68 |
-
return
|
69 |
-
from encoder import params_data
|
70 |
-
from encoder import params_model
|
71 |
-
param_string = "<b>Model parameters</b>:<br>"
|
72 |
-
for param_name in (p for p in dir(params_model) if not p.startswith("__")):
|
73 |
-
value = getattr(params_model, param_name)
|
74 |
-
param_string += "\t%s: %s<br>" % (param_name, value)
|
75 |
-
param_string += "<b>Data parameters</b>:<br>"
|
76 |
-
for param_name in (p for p in dir(params_data) if not p.startswith("__")):
|
77 |
-
value = getattr(params_data, param_name)
|
78 |
-
param_string += "\t%s: %s<br>" % (param_name, value)
|
79 |
-
self.vis.text(param_string, opts={"title": "Parameters"})
|
80 |
-
|
81 |
-
def log_dataset(self, dataset: SpeakerVerificationDataset):
|
82 |
-
if self.disabled:
|
83 |
-
return
|
84 |
-
dataset_string = ""
|
85 |
-
dataset_string += "<b>Speakers</b>: %s\n" % len(dataset.speakers)
|
86 |
-
dataset_string += "\n" + dataset.get_logs()
|
87 |
-
dataset_string = dataset_string.replace("\n", "<br>")
|
88 |
-
self.vis.text(dataset_string, opts={"title": "Dataset"})
|
89 |
-
|
90 |
-
def log_implementation(self, params):
|
91 |
-
if self.disabled:
|
92 |
-
return
|
93 |
-
implementation_string = ""
|
94 |
-
for param, value in params.items():
|
95 |
-
implementation_string += "<b>%s</b>: %s\n" % (param, value)
|
96 |
-
implementation_string = implementation_string.replace("\n", "<br>")
|
97 |
-
self.implementation_string = implementation_string
|
98 |
-
self.implementation_win = self.vis.text(
|
99 |
-
implementation_string,
|
100 |
-
opts={"title": "Training implementation"}
|
101 |
-
)
|
102 |
-
|
103 |
-
def update(self, loss, eer, step):
|
104 |
-
# Update the tracking data
|
105 |
-
now = timer()
|
106 |
-
self.step_times.append(1000 * (now - self.last_update_timestamp))
|
107 |
-
self.last_update_timestamp = now
|
108 |
-
self.losses.append(loss)
|
109 |
-
self.eers.append(eer)
|
110 |
-
print(".", end="")
|
111 |
-
|
112 |
-
# Update the plots every <update_every> steps
|
113 |
-
if step % self.update_every != 0:
|
114 |
-
return
|
115 |
-
time_string = "Step time: mean: %5dms std: %5dms" % \
|
116 |
-
(int(np.mean(self.step_times)), int(np.std(self.step_times)))
|
117 |
-
print("\nStep %6d Loss: %.4f EER: %.4f %s" %
|
118 |
-
(step, np.mean(self.losses), np.mean(self.eers), time_string))
|
119 |
-
if not self.disabled:
|
120 |
-
self.loss_win = self.vis.line(
|
121 |
-
[np.mean(self.losses)],
|
122 |
-
[step],
|
123 |
-
win=self.loss_win,
|
124 |
-
update="append" if self.loss_win else None,
|
125 |
-
opts=dict(
|
126 |
-
legend=["Avg. loss"],
|
127 |
-
xlabel="Step",
|
128 |
-
ylabel="Loss",
|
129 |
-
title="Loss",
|
130 |
-
)
|
131 |
-
)
|
132 |
-
self.eer_win = self.vis.line(
|
133 |
-
[np.mean(self.eers)],
|
134 |
-
[step],
|
135 |
-
win=self.eer_win,
|
136 |
-
update="append" if self.eer_win else None,
|
137 |
-
opts=dict(
|
138 |
-
legend=["Avg. EER"],
|
139 |
-
xlabel="Step",
|
140 |
-
ylabel="EER",
|
141 |
-
title="Equal error rate"
|
142 |
-
)
|
143 |
-
)
|
144 |
-
if self.implementation_win is not None:
|
145 |
-
self.vis.text(
|
146 |
-
self.implementation_string + ("<b>%s</b>" % time_string),
|
147 |
-
win=self.implementation_win,
|
148 |
-
opts={"title": "Training implementation"},
|
149 |
-
)
|
150 |
-
|
151 |
-
# Reset the tracking
|
152 |
-
self.losses.clear()
|
153 |
-
self.eers.clear()
|
154 |
-
self.step_times.clear()
|
155 |
-
|
156 |
-
def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, max_speakers=10):
|
157 |
-
import matplotlib.pyplot as plt
|
158 |
-
|
159 |
-
max_speakers = min(max_speakers, len(colormap))
|
160 |
-
embeds = embeds[:max_speakers * utterances_per_speaker]
|
161 |
-
|
162 |
-
n_speakers = len(embeds) // utterances_per_speaker
|
163 |
-
ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
|
164 |
-
colors = [colormap[i] for i in ground_truth]
|
165 |
-
|
166 |
-
reducer = umap.UMAP()
|
167 |
-
projected = reducer.fit_transform(embeds)
|
168 |
-
plt.scatter(projected[:, 0], projected[:, 1], c=colors)
|
169 |
-
plt.gca().set_aspect("equal", "datalim")
|
170 |
-
plt.title("UMAP projection (step %d)" % step)
|
171 |
-
if not self.disabled:
|
172 |
-
self.projection_win = self.vis.matplot(plt, win=self.projection_win)
|
173 |
-
if out_fpath is not None:
|
174 |
-
plt.savefig(out_fpath)
|
175 |
-
plt.clf()
|
176 |
-
|
177 |
-
def save(self):
|
178 |
-
if not self.disabled:
|
179 |
-
self.vis.save([self.env_name])
|
encoderCoren.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:39373b86598fa3da9fcddee6142382efe09777e8d37dc9c0561f41f0070f134e
-size 17090379
{hifigan → hifi-gan}/LICENSE RENAMED
File without changes
hifi-gan/README.md ADDED
@@ -0,0 +1,105 @@
+# HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis
+
+### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae
+
+In our [paper](https://arxiv.org/abs/2010.05646),
+we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.<br/>
+We provide our implementation and pretrained models as open source in this repository.
+
+**Abstract :**
+Several recent work on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms.
+Although such methods improve the sampling efficiency and memory usage,
+their sample quality has not yet reached that of autoregressive and flow-based generative models.
+In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis.
+As speech audio consists of sinusoidal signals with various periods,
+we demonstrate that modeling periodic patterns of an audio is crucial for enhancing sample quality.
+A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method
+demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than
+real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen
+speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times
+faster than real-time on CPU with comparable quality to an autoregressive counterpart.
+
+Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples.
+
+
+## Pre-requisites
+1. Python >= 3.6
+2. Clone this repository.
+3. Install python requirements. Please refer [requirements.txt](requirements.txt)
+4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/).
+And move all wav files to `LJSpeech-1.1/wavs`
+
+
+## Training
+```
+python train.py --config config_v1.json
+```
+To train V2 or V3 Generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.<br>
+Checkpoints and copy of the configuration file are saved in `cp_hifigan` directory by default.<br>
+You can change the path by adding `--checkpoint_path` option.
+
+Validation loss during training with V1 generator.<br>
+![validation loss](./validation_loss.png)
+
+## Pretrained Model
+You can also use pretrained models we provide.<br/>
+[Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)<br/>
+Details of each folder are as in follows:
+
+|Folder Name|Generator|Dataset|Fine-Tuned|
+|------|---|---|---|
+|LJ_V1|V1|LJSpeech|No|
+|LJ_V2|V2|LJSpeech|No|
+|LJ_V3|V3|LJSpeech|No|
+|LJ_FT_T2_V1|V1|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
+|LJ_FT_T2_V2|V2|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
+|LJ_FT_T2_V3|V3|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
+|VCTK_V1|V1|VCTK|No|
+|VCTK_V2|V2|VCTK|No|
+|VCTK_V3|V3|VCTK|No|
+|UNIVERSAL_V1|V1|Universal|No|
+
+We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets.
+
+## Fine-Tuning
+1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.<br/>
+The file name of the generated mel-spectrogram should match the audio file and the extension should be `.npy`.<br/>
+Example:
+```
+Audio File : LJ001-0001.wav
+Mel-Spectrogram File : LJ001-0001.npy
+```
+2. Create `ft_dataset` folder and copy the generated mel-spectrogram files into it.<br/>
+3. Run the following command.
+```
+python train.py --fine_tuning True --config config_v1.json
+```
+For other command line options, please refer to the training section.
+
+
+## Inference from wav file
+1. Make `test_files` directory and copy wav files into the directory.
+2. Run the following command.
+```
+python inference.py --checkpoint_file [generator checkpoint file path]
+```
+Generated wav files are saved in `generated_files` by default.<br>
+You can change the path by adding `--output_dir` option.
+
+
+## Inference for end-to-end speech synthesis
+1. Make `test_mel_files` directory and copy generated mel-spectrogram files into the directory.<br>
+You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2),
+[Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth.
+2. Run the following command.
+```
+python inference_e2e.py --checkpoint_file [generator checkpoint file path]
+```
+Generated wav files are saved in `generated_files_from_mel` by default.<br>
+You can change the path by adding `--output_dir` option.
+
+
+## Acknowledgements
+We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips)
+and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this.
+
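The README's fine-tuning step requires each teacher-forced mel-spectrogram to share its basename with the source wav and use the `.npy` extension inside an `ft_dataset` folder. A minimal sketch of that naming rule follows; the random array merely stands in for a real Tacotron2 mel, and the two LJSpeech filenames are the README's own example.

```python
# Sketch of the fine-tuning file-naming rule: LJ001-0001.wav -> LJ001-0001.npy.
# The mel here is random data standing in for a teacher-forced Tacotron2 output.
import os

import numpy as np

ft_dir = "ft_dataset"  # folder the README asks you to create
os.makedirs(ft_dir, exist_ok=True)

for wav_name in ["LJ001-0001.wav", "LJ001-0002.wav"]:
    mel = np.random.randn(80, 200).astype(np.float32)   # (n_mels, frames), illustrative
    base = os.path.splitext(wav_name)[0]
    np.save(os.path.join(ft_dir, base + ".npy"), mel)    # saved as LJ001-0001.npy, ...
```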
diagrams/apple.txt → hifi-gan/apple.py RENAMED
File without changes
{hifigan → hifi-gan}/env.py RENAMED
File without changes
{hifigan → hifi-gan}/inference.py RENAMED
@@ -6,9 +6,9 @@ import argparse
 import json
 import torch
 from scipy.io.wavfile import write
-from
-from
-from
+from env import AttrDict
+from meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
+from models import Generator
 
 h = None
 device = None
{hifigan → hifi-gan}/inference_e2e.py RENAMED
@@ -1,13 +1,15 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
+import glob
 import os
 import numpy as np
+import argparse
 import json
 import torch
 from scipy.io.wavfile import write
-from
-from
-from
+from env import AttrDict
+from meldataset import MAX_WAV_VALUE
+from models import Generator
 
 h = None
 device = None
@@ -21,9 +23,50 @@ def load_checkpoint(filepath, device):
     return checkpoint_dict
 
 
-def
+def scan_checkpoint(cp_dir, prefix):
+    pattern = os.path.join(cp_dir, prefix + '*')
+    cp_list = glob.glob(pattern)
+    if len(cp_list) == 0:
+        return ''
+    return sorted(cp_list)[-1]
+
+
+def inference(a):
+    generator = Generator(h).to(device)
+
+    state_dict_g = load_checkpoint(a.checkpoint_file, device)
+    generator.load_state_dict(state_dict_g['generator'])
+
+    filelist = os.listdir(a.input_mels_dir)
+
+    os.makedirs(a.output_dir, exist_ok=True)
+
+    generator.eval()
+    generator.remove_weight_norm()
+    with torch.no_grad():
+        for i, filname in enumerate(filelist):
+            x = np.load(os.path.join(a.input_mels_dir, filname))
+            x = torch.FloatTensor(x).to(device)
+            y_g_hat = generator(x)
+            audio = y_g_hat.squeeze()
+            audio = audio * MAX_WAV_VALUE
+            audio = audio.cpu().numpy().astype('int16')
+
+            output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + '_generated_e2e.wav')
+            write(output_file, h.sampling_rate, audio)
+            print(output_file)
+
+
+def main():
     print('Initializing Inference Process..')
-
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_mels_dir', default='test_mel_files')
+    parser.add_argument('--output_dir', default='generated_files_from_mel')
+    parser.add_argument('--checkpoint_file', required=True)
+    a = parser.parse_args()
+
+    config_file = os.path.join(os.path.split(a.checkpoint_file)[0], 'config.json')
     with open(config_file) as f:
         data = f.read()
 
@@ -31,10 +74,6 @@ def hifi_gan_inference(input_mel, checkpoint_file):
     json_config = json.loads(data)
     h = AttrDict(json_config)
 
-    # Set MAX_WAV_VALUE if not present
-    if 'MAX_WAV_VALUE' not in h:
-        h.MAX_WAV_VALUE = 32768.0  # Adjust this value based on your requirements
-
     torch.manual_seed(h.seed)
     global device
     if torch.cuda.is_available():
@@ -43,34 +82,9 @@ def hifi_gan_inference(input_mel, checkpoint_file):
     else:
         device = torch.device('cpu')
 
-
 
-    state_dict_g = load_checkpoint(checkpoint_file, device)
-    generator.load_state_dict(state_dict_g['generator'])
 
-
-
-    # Load data from BytesIO
-    buffer = BytesIO(input_mel)
-    x = np.load(buffer)
-
-    x = torch.FloatTensor(x).to(device)
-    y_g_hat = generator(x)
-
-    # Detach tensor before converting to numpy
-    audio = y_g_hat.squeeze().detach().numpy()
-
-    # Set MAX_WAV_VALUE if not present
-    if 'MAX_WAV_VALUE' not in h:
-        h.MAX_WAV_VALUE = 32768.0  # Adjust this value based on your requirements
-
-    audio = audio * h.MAX_WAV_VALUE
-    audio = audio.astype('int16')
-
-    # Save audio to BytesIO
-    output_buffer = BytesIO()
-    write(output_buffer, h.sampling_rate, audio)
-
-    return output_buffer.getvalue()
-
+    inference(a)
+
+
+if __name__ == '__main__':
+    main()
+
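The rewritten `inference_e2e.py` converts the generator's float output to 16-bit PCM by scaling with `MAX_WAV_VALUE` before handing it to `scipy.io.wavfile.write`. The sketch below isolates just that conversion; the sine wave stands in for a HiFi-GAN output, and the 32768.0 constant mirrors `MAX_WAV_VALUE` from `meldataset.py`.

```python
# Sketch of the float -> int16 conversion done before scipy.io.wavfile.write.
import numpy as np
from scipy.io.wavfile import write

MAX_WAV_VALUE = 32768.0
sampling_rate = 22050

t = np.linspace(0, 1.0, sampling_rate, endpoint=False)
audio = 0.5 * np.sin(2 * np.pi * 440.0 * t)          # float waveform in [-1, 1]
audio = (audio * MAX_WAV_VALUE).astype('int16')      # scale to 16-bit PCM
write("tone_generated_e2e.wav", sampling_rate, audio)
```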
{hifigan → hifi-gan}/meldataset.py RENAMED
File without changes
{hifigan → hifi-gan}/models.py RENAMED
@@ -3,7 +3,7 @@ import torch.nn.functional as F
 import torch.nn as nn
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-from
+from utils import init_weights, get_padding
 
 LRELU_SLOPE = 0.1
 
hifi-gan/requirements.txt ADDED
@@ -0,0 +1,7 @@
+torch==1.4.0
+numpy==1.17.4
+librosa==0.7.2
+scipy==1.4.1
+tensorboard==2.0
+soundfile==0.10.3.post1
+matplotlib==3.1.3
{hifigan → hifi-gan}/train.py RENAMED
@@ -12,11 +12,11 @@ from torch.utils.data import DistributedSampler, DataLoader
 import torch.multiprocessing as mp
 from torch.distributed import init_process_group
 from torch.nn.parallel import DistributedDataParallel
-from
-from
-from
+from env import AttrDict, build_env
+from meldataset import MelDataset, mel_spectrogram, get_dataset_filelist
+from models import Generator, MultiPeriodDiscriminator, MultiScaleDiscriminator, feature_loss, generator_loss,\
     discriminator_loss
-from
+from utils import plot_spectrogram, scan_checkpoint, load_checkpoint, save_checkpoint
 
 torch.backends.cudnn.benchmark = True
 
hifigan/hifigan_utils.py → hifi-gan/utils.py RENAMED
File without changes
hparams.py CHANGED
@@ -61,7 +61,6 @@ def create_hparams(hparams_string=None, verbose=False):
         "encoder_kernel_size":5,
         "encoder_n_convolutions":3,
         "encoder_embedding_dim":512,
-        "speaker_embedding_dim":256,
 
         # Decoder parameters
         "n_frames_per_step":1, # currently only 1 is supported
kaggle_12000.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:27d4936bff68d3fe37053ec3110486bdea9f23bf137f07477c28bbd4f36b85ae
-size 338426303
logger.py ADDED
@@ -0,0 +1,48 @@
+import random
+import torch
+from torch.utils.tensorboard import SummaryWriter
+from plotting_utils import plot_alignment_to_numpy, plot_spectrogram_to_numpy
+from plotting_utils import plot_gate_outputs_to_numpy
+
+
+class Tacotron2Logger(SummaryWriter):
+    def __init__(self, logdir):
+        super(Tacotron2Logger, self).__init__(logdir)
+
+    def log_training(self, reduced_loss, grad_norm, learning_rate, duration,
+                     iteration):
+        self.add_scalar("training.loss", reduced_loss, iteration)
+        self.add_scalar("grad.norm", grad_norm, iteration)
+        self.add_scalar("learning.rate", learning_rate, iteration)
+        self.add_scalar("duration", duration, iteration)
+
+    def log_validation(self, reduced_loss, model, y, y_pred, iteration):
+        self.add_scalar("validation.loss", reduced_loss, iteration)
+        _, mel_outputs, gate_outputs, alignments = y_pred
+        mel_targets, gate_targets = y
+
+        # plot distribution of parameters
+        for tag, value in model.named_parameters():
+            tag = tag.replace('.', '/')
+            self.add_histogram(tag, value.data.cpu().numpy(), iteration)
+
+        # plot alignment, mel target and predicted, gate target and predicted
+        idx = random.randint(0, alignments.size(0) - 1)
+        self.add_image(
+            "alignment",
+            plot_alignment_to_numpy(alignments[idx].data.cpu().numpy().T),
+            iteration, dataformats='HWC')
+        self.add_image(
+            "mel_target",
+            plot_spectrogram_to_numpy(mel_targets[idx].data.cpu().numpy()),
+            iteration, dataformats='HWC')
+        self.add_image(
+            "mel_predicted",
+            plot_spectrogram_to_numpy(mel_outputs[idx].data.cpu().numpy()),
+            iteration, dataformats='HWC')
+        self.add_image(
+            "gate",
+            plot_gate_outputs_to_numpy(
+                gate_targets[idx].data.cpu().numpy(),
+                torch.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
+            iteration, dataformats='HWC')
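The added `Tacotron2Logger` is a thin wrapper over TensorBoard's `SummaryWriter`. A minimal sketch of driving its training-side scalars follows; the log directory and the dummy scalar values are illustrative, and it assumes the repo's `plotting_utils` module and a TensorBoard installation are importable.

```python
# Illustrative only: feeds the added Tacotron2Logger a few dummy scalars.
from logger import Tacotron2Logger

tb_logger = Tacotron2Logger("logs/tacotron2_run")  # assumed log directory
for iteration in range(1, 4):
    reduced_loss = 1.0 / iteration   # stand-in training loss
    grad_norm = 0.5                  # stand-in gradient norm
    learning_rate = 1e-3
    duration = 0.8                   # seconds spent on this iteration
    tb_logger.log_training(reduced_loss, grad_norm, learning_rate,
                           duration, iteration)
tb_logger.close()
```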
logic.py CHANGED
@@ -3,28 +3,15 @@ import numpy as np
 import torch
 import base64
 import io
-from io import BytesIO
 import matplotlib.pyplot as plt
 from hparams import create_hparams
 from model import Tacotron2
-from layers import TacotronSTFT
 from train import load_model
 from text import text_to_sequence
-from utils import load_wav_to_torch
 import os
-import
-import librosa
+import subprocess
 import librosa.display
 
-use_cuda = torch.cuda.is_available()
-device = torch.device('cuda' if use_cuda else 'cpu')
-
-hparams = create_hparams()
-hparams.sampling_rate = 22050
-stft = TacotronSTFT(
-    hparams.filter_length, hparams.hop_length, hparams.win_length, hparams.n_mel_channels,
-    hparams.sampling_rate, hparams.mel_fmin, hparams.mel_fmax).to(device)
-
 # Function to plot data
 def plot_data(data, figsize=(16, 4), titles=['Mel Spectrogram (Original)', 'Mel Spectrogram (Postnet)', 'Alignment'],
               xlabel=['Time Steps', 'Time Steps', 'Decoder Time Steps'],
@@ -55,84 +42,59 @@ def plot_data(data, figsize=(16, 4), titles=['Mel Spectrogram (Original)', 'Mel
     return img_base64
 
 #Function to plot timedomain waveform
-def plot_waveforms(
-    # Load
-
-    y, sr = librosa.load(buffer, sr=None)
-
-    # Create waveform plot
-    plt.figure(figsize=(10, 4))
-    librosa.display.waveshow(y, sr=sr)
-    plt.xlabel("Time (s)")
-    plt.ylabel("Amplitude")
-    plt.title("Waveform")
-
-    # Save the plot to a BytesIO object
-    wave_buffer = BytesIO()
-    plt.savefig(wave_buffer, format="png")
-    wave_buffer.seek(0)
-    plt.close()
-
-    # Encode the plot as base64
-    wave_base64 = base64.b64encode(wave_buffer.read()).decode('utf-8')
-
-    return wave_base64
+def plot_waveforms(audio_file, sr=22050):
+    # Load audio waveform
+    y, sr = librosa.load(audio_file, sr=sr)
+
+    # Create time vector
+    time = librosa.times_like(y, sr=sr)
+
+    # Plot the waveform
+    plt.figure(figsize=(16, 4))
+    librosa.display.waveshow(y, sr=sr)
+    plt.title('Time vs Amplitude')
+    plt.xlabel('Time (s)')
+    plt.ylabel('Amplitude')
+
+    plt.tight_layout()
+    # plt.savefig('static/waveform.png')
+    img_buffer = io.BytesIO()
+    plt.savefig(img_buffer, format='png', bbox_inches='tight', pad_inches=0)
+    plt.close()
+
+    img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
+
+    return img_base64
 
-#
-
-
-
-
-
-model = SpeakerEncoder(device, loss_device)
-speaker_dict = torch.load(speaker_model_path, map_location='cpu')
-model.load_state_dict(speaker_dict)
-
-
-
-
-
-
-def extract_speech_embedding(audio_path: str):
-    audio, sampling_rate = load_wav_to_torch(audio_path)
-    if sampling_rate != stft.sampling_rate:
-        raise ValueError("{} SR doesn't match target {} SR".format(sampling_rate, stft.sampling_rate))
-
-    audio_norm = audio / 32768.0
-    audio_norm = audio_norm.unsqueeze(0)
-    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False).to(device)
-    melspec = stft.mel_spectrogram(audio_norm).transpose(1,2).float()
-
-    if melspec.shape[1] <= 128:
-        mel_slice = mel
-    else:
-        slice_start = random.randint(0,melspec.shape[1]-128)
-        mel_slice = melspec[:,slice_start:slice_start+128]
-    speaker_embedding = speaker_model(mel_slice)
-    return speaker_embedding
-
 def synthesize_voice(text_input, checkpoint_path):
-    # Load Tacotron2 model
-
-
-    model.load_state_dict(checkpoint['state_dict'])
-    model = model.to(device).eval().float()
+    # Load Tacotron2 model
+    hparams = create_hparams()
+    hparams.sampling_rate = 22050
+
+    # Load model from checkpoint
+    model = load_model(hparams)
+    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
+    model = model.cuda().eval().half()
 
     # Nepali text
-    speaker_audio_path='speaker_audio/ariana.wav'
     sequence = np.array(text_to_sequence(text_input, ['transliteration_cleaners']))[None, :]
-    sequence = torch.autograd.Variable(torch.from_numpy(sequence)).
-
-
+    sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
+
     # Melspectrogram and Alignment graph
-    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence
+    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
     mel_output_data = mel_outputs.data.cpu().numpy()[0]
     mel_output_postnet_data = mel_outputs_postnet.data.cpu().numpy()[0]
     alignments_data = alignments.data.cpu().numpy()[0].T
 
-
+    np.save('mel_files/mel1'+'.npy', mel_output_data)
+
+    input_mels_dir = 'mel_files/'
+    output_dir = 'audio_output/'
+    run_hifigan_inference(input_mels_dir, output_dir)
+
+    return mel_output_data, mel_output_postnet_data, alignments_data
+
+
+def run_hifigan_inference(input_mels_dir, output_dir):
+    script_path = os.path.join(os.path.dirname("hifigan/"), "inference_e2e.py") # Assuming both scripts are in the same directory
+    subprocess.run(["python", script_path, "--checkpoint_file", "generator_v1", "--input_mels_dir", input_mels_dir, "--output_dir", output_dir])
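The new `synthesize_voice` hands the mel off to the vocoder by writing it as `.npy` and launching `inference_e2e.py` in a child process. Below is a minimal, self-contained sketch of that hand-off; the script path points at `hifi-gan/inference_e2e.py` (the diff itself joins the path under `hifigan/`), and the `generator_v1` checkpoint is taken from the diff but must exist locally for the child process to succeed.

```python
# Sketch of the mel -> HiFi-GAN hand-off pattern used in logic.py.
import os
import subprocess

import numpy as np


def vocode_mel(mel, mels_dir="mel_files", wavs_dir="audio_output",
               checkpoint="generator_v1", script="hifi-gan/inference_e2e.py"):
    os.makedirs(mels_dir, exist_ok=True)
    np.save(os.path.join(mels_dir, "mel1.npy"), mel)
    # inference_e2e.py scans --input_mels_dir and writes *_generated_e2e.wav files.
    cmd = ["python", script,
           "--checkpoint_file", checkpoint,
           "--input_mels_dir", mels_dir,
           "--output_dir", wavs_dir]
    return subprocess.run(cmd, check=False).returncode
```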
loss_function.py ADDED
@@ -0,0 +1,19 @@
+from torch import nn
+
+
+class Tacotron2Loss(nn.Module):
+    def __init__(self):
+        super(Tacotron2Loss, self).__init__()
+
+    def forward(self, model_output, targets):
+        mel_target, gate_target = targets[0], targets[1]
+        mel_target.requires_grad = False
+        gate_target.requires_grad = False
+        gate_target = gate_target.view(-1, 1)
+
+        mel_out, mel_out_postnet, gate_out, _ = model_output
+        gate_out = gate_out.view(-1, 1)
+        mel_loss = nn.MSELoss()(mel_out, mel_target) + \
+            nn.MSELoss()(mel_out_postnet, mel_target)
+        gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
+        return mel_loss + gate_loss
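The added `Tacotron2Loss` sums an MSE term over the pre- and post-net mel outputs with a BCE-with-logits term over the stop gate. A quick sanity check with random tensors is sketched below; the shapes (batch 2, 80 mel channels, 100 frames) are illustrative only, not taken from the project's hparams.

```python
# Minimal check of the added Tacotron2Loss with random tensors.
import torch
from loss_function import Tacotron2Loss

criterion = Tacotron2Loss()
mel_target = torch.randn(2, 80, 100)
gate_target = torch.ones(2, 100)
mel_out = torch.randn(2, 80, 100)
mel_out_postnet = torch.randn(2, 80, 100)
gate_out = torch.randn(2, 100)
alignments = torch.randn(2, 100, 40)   # unused by the loss

loss = criterion((mel_out, mel_out_postnet, gate_out, alignments),
                 (mel_target, gate_target))
print(float(loss))
```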
loss_scaler.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
class LossScaler:
|
4 |
+
|
5 |
+
def __init__(self, scale=1):
|
6 |
+
self.cur_scale = scale
|
7 |
+
|
8 |
+
# `params` is a list / generator of torch.Variable
|
9 |
+
def has_overflow(self, params):
|
10 |
+
return False
|
11 |
+
|
12 |
+
# `x` is a torch.Tensor
|
13 |
+
def _has_inf_or_nan(x):
|
14 |
+
return False
|
15 |
+
|
16 |
+
# `overflow` is boolean indicating whether we overflowed in gradient
|
17 |
+
def update_scale(self, overflow):
|
18 |
+
pass
|
19 |
+
|
20 |
+
@property
|
21 |
+
def loss_scale(self):
|
22 |
+
return self.cur_scale
|
23 |
+
|
24 |
+
def scale_gradient(self, module, grad_in, grad_out):
|
25 |
+
return tuple(self.loss_scale * g for g in grad_in)
|
26 |
+
|
27 |
+
def backward(self, loss):
|
28 |
+
scaled_loss = loss*self.loss_scale
|
29 |
+
scaled_loss.backward()
|
30 |
+
|
31 |
+
class DynamicLossScaler:
|
32 |
+
|
33 |
+
def __init__(self,
|
34 |
+
init_scale=2**32,
|
35 |
+
scale_factor=2.,
|
36 |
+
scale_window=1000):
|
37 |
+
self.cur_scale = init_scale
|
38 |
+
self.cur_iter = 0
|
39 |
+
self.last_overflow_iter = -1
|
40 |
+
self.scale_factor = scale_factor
|
41 |
+
self.scale_window = scale_window
|
42 |
+
|
43 |
+
# `params` is a list / generator of torch.Variable
|
44 |
+
def has_overflow(self, params):
|
45 |
+
# return False
|
46 |
+
for p in params:
|
47 |
+
if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data):
|
48 |
+
return True
|
49 |
+
|
50 |
+
return False
|
51 |
+
|
52 |
+
# `x` is a torch.Tensor
|
53 |
+
def _has_inf_or_nan(x):
|
54 |
+
cpu_sum = float(x.float().sum())
|
55 |
+
if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
|
56 |
+
return True
|
57 |
+
return False
|
58 |
+
|
59 |
+
# `overflow` is boolean indicating whether we overflowed in gradient
|
60 |
+
def update_scale(self, overflow):
|
61 |
+
if overflow:
|
62 |
+
#self.cur_scale /= self.scale_factor
|
63 |
+
self.cur_scale = max(self.cur_scale/self.scale_factor, 1)
|
64 |
+
self.last_overflow_iter = self.cur_iter
|
65 |
+
else:
|
66 |
+
if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
|
67 |
+
self.cur_scale *= self.scale_factor
|
68 |
+
# self.cur_scale = 1
|
69 |
+
self.cur_iter += 1
|
70 |
+
|
71 |
+
@property
|
72 |
+
def loss_scale(self):
|
73 |
+
return self.cur_scale
|
74 |
+
|
75 |
+
def scale_gradient(self, module, grad_in, grad_out):
|
76 |
+
return tuple(self.loss_scale * g for g in grad_in)
|
77 |
+
|
78 |
+
def backward(self, loss):
|
79 |
+
scaled_loss = loss*self.loss_scale
|
80 |
+
scaled_loss.backward()
|
81 |
+
|
82 |
+
##############################################################
|
83 |
+
# Example usage below here -- assuming it's in a separate file
|
84 |
+
##############################################################
|
85 |
+
if __name__ == "__main__":
|
86 |
+
import torch
|
87 |
+
from torch.autograd import Variable
|
88 |
+
from dynamic_loss_scaler import DynamicLossScaler
|
89 |
+
|
90 |
+
# N is batch size; D_in is input dimension;
|
91 |
+
# H is hidden dimension; D_out is output dimension.
|
92 |
+
N, D_in, H, D_out = 64, 1000, 100, 10
|
93 |
+
|
94 |
+
# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
|
95 |
+
x = Variable(torch.randn(N, D_in), requires_grad=False)
|
96 |
+
y = Variable(torch.randn(N, D_out), requires_grad=False)
|
97 |
+
|
98 |
+
w1 = Variable(torch.randn(D_in, H), requires_grad=True)
|
99 |
+
w2 = Variable(torch.randn(H, D_out), requires_grad=True)
|
100 |
+
parameters = [w1, w2]
|
101 |
+
|
102 |
+
learning_rate = 1e-6
|
103 |
+
optimizer = torch.optim.SGD(parameters, lr=learning_rate)
|
104 |
+
loss_scaler = DynamicLossScaler()
|
105 |
+
|
106 |
+
for t in range(500):
|
107 |
+
y_pred = x.mm(w1).clamp(min=0).mm(w2)
|
108 |
+
loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
|
109 |
+
print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
|
110 |
+
print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
|
111 |
+
print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))
|
112 |
+
|
113 |
+
# Run backprop
|
114 |
+
optimizer.zero_grad()
|
115 |
+
loss.backward()
|
116 |
+
|
117 |
+
# Check for overflow
|
118 |
+
has_overflow = DynamicLossScaler.has_overflow(parameters)
|
119 |
+
|
120 |
+
# If no overflow, unscale grad and update as usual
|
121 |
+
if not has_overflow:
|
122 |
+
for param in parameters:
|
123 |
+
param.grad.data.mul_(1. / loss_scaler.loss_scale)
|
124 |
+
optimizer.step()
|
125 |
+
# Otherwise, don't do anything -- ie, skip iteration
|
126 |
+
else:
|
127 |
+
print('OVERFLOW!')
|
128 |
+
|
129 |
+
# Update loss scale for next iteration
|
130 |
+
loss_scaler.update_scale(has_overflow)
|
131 |
+
|
model.py CHANGED
@@ -147,8 +147,13 @@ class Postnet(nn.Module):
 
 
 class Encoder(nn.Module):
+    """Encoder module:
+        - Three 1-d convolution banks
+        - Bidirectional LSTM
+    """
     def __init__(self, hparams):
         super(Encoder, self).__init__()
+
         convolutions = []
         for _ in range(hparams.encoder_n_convolutions):
             conv_layer = nn.Sequential(
@@ -165,15 +170,13 @@
                             int(hparams.encoder_embedding_dim / 2), 1,
                             batch_first=True, bidirectional=True)
 
-    def forward(self, x, input_lengths
-        # Modify the input x to concatenate the speaker embedding
-        x = torch.cat((x, speaker_embedding.unsqueeze(1).expand(-1, x.size(1), -1)), dim=-1)
-
+    def forward(self, x, input_lengths):
         for conv in self.convolutions:
             x = F.dropout(F.relu(conv(x)), 0.5, self.training)
 
         x = x.transpose(1, 2)
 
+        # pytorch tensor are not reversible, hence the conversion
        input_lengths = input_lengths.cpu().numpy()
         x = nn.utils.rnn.pack_padded_sequence(
             x, input_lengths, batch_first=True)
@@ -186,10 +189,7 @@
 
         return outputs
 
-    def inference(self, x
-        # Modify the input x to concatenate the speaker embedding
-        x = torch.cat((x, speaker_embedding.unsqueeze(1).expand(-1, x.size(1), -1)), dim=-1)
-
+    def inference(self, x):
         for conv in self.convolutions:
             x = F.dropout(F.relu(conv(x)), 0.5, self.training)
 
@@ -496,14 +496,13 @@
 
         return outputs
 
-    def forward(self, inputs
+    def forward(self, inputs):
         text_inputs, text_lengths, mels, max_len, output_lengths = inputs
         text_lengths, output_lengths = text_lengths.data, output_lengths.data
 
         embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
 
-
-        encoder_outputs = self.encoder(embedded_inputs, text_lengths, speaker_embedding)
+        encoder_outputs = self.encoder(embedded_inputs, text_lengths)
 
         mel_outputs, gate_outputs, alignments = self.decoder(
             encoder_outputs, mels, memory_lengths=text_lengths)
@@ -515,11 +514,9 @@
             [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
             output_lengths)
 
-    def inference(self, inputs
+    def inference(self, inputs):
         embedded_inputs = self.embedding(inputs).transpose(1, 2)
-
-        encoder_outputs = self.encoder.inference(embedded_inputs, speaker_embedding)
-
+        encoder_outputs = self.encoder.inference(embedded_inputs)
         mel_outputs, gate_outputs, alignments = self.decoder.inference(
             encoder_outputs)
 
@@ -530,4 +527,3 @@
             [mel_outputs, mel_outputs_postnet, gate_outputs, alignments])
 
         return outputs
-
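The rewritten `Encoder.forward` drops the speaker-embedding concatenation and keeps the standard pack/pad round trip around the bidirectional LSTM, which is why the lengths must be moved to CPU first. A standalone sketch of that round trip, with made-up sizes, is below.

```python
# Standalone sketch of the pack -> LSTM -> pad round trip the encoder relies on.
# Sizes (batch=3, T=10, feature=8, hidden=4) are illustrative only.
import torch
import torch.nn as nn

lstm = nn.LSTM(8, 4, 1, batch_first=True, bidirectional=True)
x = torch.randn(3, 10, 8)                  # (batch, time, features)
input_lengths = torch.tensor([10, 7, 4])   # true lengths, sorted descending

packed = nn.utils.rnn.pack_padded_sequence(x, input_lengths.cpu().numpy(),
                                           batch_first=True)
outputs, _ = lstm(packed)
outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
print(outputs.shape)   # (3, 10, 8): last dim is 2 * hidden for the bi-LSTM
```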
multiproc.py ADDED
@@ -0,0 +1,23 @@
+import time
+import torch
+import sys
+import subprocess
+
+argslist = list(sys.argv)[1:]
+num_gpus = torch.cuda.device_count()
+argslist.append('--n_gpus={}'.format(num_gpus))
+workers = []
+job_id = time.strftime("%Y_%m_%d-%H%M%S")
+argslist.append("--group_name=group_{}".format(job_id))
+
+for i in range(num_gpus):
+    argslist.append('--rank={}'.format(i))
+    stdout = None if i == 0 else open("logs/{}_GPU_{}.log".format(job_id, i),
+                                      "w")
+    print(argslist)
+    p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout)
+    workers.append(p)
+    argslist = argslist[:-1]
+
+for p in workers:
+    p.wait()
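The added `multiproc.py` launches one copy of the training command per visible GPU, appending `--n_gpus`, a shared `--group_name`, and a per-process `--rank`, then waits on all workers. The sketch below reproduces that fan-out with a harmless stand-in command so it can be run anywhere; the worker count and the inline script are illustrative.

```python
# Same per-process fan-out pattern, with a stand-in command instead of train.py.
import subprocess
import sys

num_workers = 2   # stands in for torch.cuda.device_count()
base_args = ["-c", "import sys; print('worker', sys.argv[1])"]

workers = []
for rank in range(num_workers):
    cmd = [sys.executable] + base_args + ["--rank={}".format(rank)]
    workers.append(subprocess.Popen(cmd))   # one child process per "GPU"

for p in workers:
    p.wait()
```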
requirements.txt CHANGED
@@ -1,8 +1,11 @@
-
+flask
+flask_cors
+typing
+fastapi
 gunicorn
-torch==1.12.1
-torchaudio==0.12.1
-torchvision==0.13.1
+torch==1.12.1
+torchaudio==0.12.1
+torchvision==0.13.1
 matplotlib==3.5.3
 numpy==1.18.5
 inflect
@@ -11,6 +14,4 @@ scipy==1.7.3
 tensorboard==2.11.2
 Unidecode
 pillow
-uvicorn
-httpx==0.19.0
---extra-index-url https://download.pytorch.org/whl/cu113
+uvicorn
saved_model.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6ccc0abcd0fb77104be73e6675454a06e7797bf1d4a1177181c32b648e9d75a9
-size 5697243
speaker/__init__.py DELETED
File without changes
speaker/bana.txt DELETED
File without changes
speaker/data.py
DELETED
@@ -1,109 +0,0 @@
|
|
1 |
-
import torch
|
2 |
-
import torchaudio.datasets as datasets
|
3 |
-
import torchaudio.transforms as transforms
|
4 |
-
from collections import defaultdict
|
5 |
-
import random
|
6 |
-
import layers
|
7 |
-
|
8 |
-
import warnings
|
9 |
-
|
10 |
-
class SpeakerMelLoader(torch.utils.data.Dataset):
|
11 |
-
"""
|
12 |
-
computes mel-spectrograms from audio file and pulls the speaker ID from the
|
13 |
-
dataset
|
14 |
-
"""
|
15 |
-
|
16 |
-
def __init__(self, dataset, format='speaker', speaker_utterances=4, mel_length = 128, mel_type = 'Tacotron'):
|
17 |
-
self.dataset = dataset
|
18 |
-
self.set_format(format)
|
19 |
-
self.speaker_utterances = speaker_utterances
|
20 |
-
self.mel_length = mel_length
|
21 |
-
self.mel_type = mel_type
|
22 |
-
self.mel_generators = dict()
|
23 |
-
|
24 |
-
def set_format(self,format):
|
25 |
-
self.format = format
|
26 |
-
|
27 |
-
if format == 'speaker':
|
28 |
-
self.create_speaker_index()
|
29 |
-
|
30 |
-
def create_speaker_index(self):
|
31 |
-
vals = [x.split('-',1) for x in self.dataset._walker]
|
32 |
-
speaker_map = defaultdict(list)
|
33 |
-
|
34 |
-
for i,v in enumerate(vals):
|
35 |
-
speaker_map[v[0]].append(i)
|
36 |
-
|
37 |
-
self.speaker_map = speaker_map
|
38 |
-
self.speaker_keys = list(speaker_map.keys())
|
39 |
-
|
40 |
-
def apply_mel_gen(self, waveform, sampling_rate, channels=80):
|
41 |
-
if (sampling_rate, channels) not in self.mel_generators:
|
42 |
-
if self.mel_type == 'MFCC':
|
43 |
-
mel_gen = transforms.MFCC(sample_rate=sampling_rate, n_mfcc=channels)
|
44 |
-
elif self.mel_type == 'Mel':
|
45 |
-
mel_gen = transforms.MelSpectrogram(sample_rate=sampling_rate, n_mels=channels)
|
46 |
-
elif self.mel_type == 'Tacotron':
|
47 |
-
mel_gen = layers.TacotronSTFT(sampling_rate=sampling_rate,n_mel_channels=channels)
|
48 |
-
else:
|
49 |
-
raise NotImplementedError('Unsupported mel_type in MelSpeakerLoader: '+self.mel_type)
|
50 |
-
self.mel_generators[(sampling_rate,channels)] = mel_gen
|
51 |
-
else:
|
52 |
-
mel_gen = self.mel_generators[(sampling_rate, channels)]
|
53 |
-
|
54 |
-
if self.mel_type == 'Tacotron':
|
55 |
-
#Replicating from Tacotron2 data loader
|
56 |
-
max_wav_value=32768.0
|
57 |
-
#skip normalization from Tacotron2, LibriSpeech data looks pre-normalized (all vals between 0-1)
|
58 |
-
audio_norm = waveform #/ max_wav_value
|
59 |
-
audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
|
60 |
-
melspec = mel_gen.mel_spectrogram(audio_norm)
|
61 |
-
else:
|
62 |
-
audio = waveform.unsqueeze(0)
|
63 |
-
audio = torch.autograd.Variable(audio, requires_grad=False)
|
64 |
-
melspec = mel_gen(audio)
|
65 |
-
|
66 |
-
return melspec
|
67 |
-
|
68 |
-
def get_mel(self, waveform, sampling_rate, channels=80):
|
69 |
-
# We previously identified that these warnings were ok.
|
70 |
-
with warnings.catch_warnings():
|
71 |
-
warnings.filterwarnings('ignore', message=r'At least one mel filterbank has all zero values.*', module=r'torchaudio.*')
|
72 |
-
melspec = self.apply_mel_gen(waveform, sampling_rate, channels)
|
73 |
-
# melspec is (1,1,channels, time) by default
|
74 |
-
# return (time, channels)
|
75 |
-
melspec = torch.squeeze(melspec).T
|
76 |
-
return melspec
|
77 |
-
|
78 |
-
def __getitem__(self, index):
|
79 |
-
if self.format == 'utterance':
|
80 |
-
(waveform, sample_rate, _, speaker_id, _, _) = self.dataset[index]
|
81 |
-
mel = self.get_mel(waveform, sample_rate)
|
82 |
-
return (speaker_id, mel)
|
83 |
-
elif self.format == 'speaker':
|
84 |
-
speaker_id = self.speaker_keys[index]
|
85 |
-
utter_indexes = random.sample(self.speaker_map[speaker_id], self.speaker_utterances)
|
86 |
-
mels = []
|
87 |
-
for i in utter_indexes:
|
88 |
-
(waveform, sample_rate, _, speaker_id, _, _) = self.dataset[i]
|
89 |
-
mel = self.get_mel(waveform, sample_rate)
|
90 |
-
if mel.shape[0] < self.mel_length:
|
91 |
-
#Zero pad mel on the right to mel_length
|
92 |
-
#pad_tuple is (dn start, dn end, dn-1 start, dn-1 end, ... , d1 start, d1 end)
|
93 |
-
pad_tuple = (0,0,0,self.mel_length-mel.shape[0])
|
94 |
-
mel=torch.nn.functional.pad(mel,pad_tuple)
|
95 |
-
mel_frame = 0
|
96 |
-
else:
|
97 |
-
mel_frame = random.randint(0,mel.shape[0]-self.mel_length)
|
98 |
-
mels.append(mel[mel_frame:mel_frame+self.mel_length,:])
|
99 |
-
return (speaker_id, torch.stack(mels,0))
|
100 |
-
else:
|
101 |
-
raise NotImplementedError()
|
102 |
-
|
103 |
-
def __len__(self):
|
104 |
-
if self.format == 'utterance':
|
105 |
-
return len(self.dataset)
|
106 |
-
elif self.format == 'speaker':
|
107 |
-
return len(self.speaker_keys)
|
108 |
-
else:
|
109 |
-
raise NotImplementedError()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
speaker/model.py DELETED
@@ -1,191 +0,0 @@
-from torch import nn
-import numpy as np
-import torch
-from torch.nn.utils import clip_grad_norm_
-
-class SpeakerEncoder(nn.Module):
-    """Learn a speaker representation from speech utterances of arbitrary length."""
-    def __init__(self, device, loss_device):
-        super().__init__()
-        self.loss_device = loss_device
-
-        # LSTM block consisting of 3 layers
-        # takes 80-channel log-mel spectrograms as input, projected to 256 dimensions
-        self.lstm = nn.LSTM(
-            input_size=80,
-            hidden_size=256,
-            num_layers=3,
-            batch_first=True,
-            dropout=0,
-            bidirectional=False
-        ).to(device)
-
-        self.linear = nn.Linear(in_features=256, out_features=256).to(device)
-        self.relu = nn.ReLU().to(device)
-        # epsilon term for numerical stability (i.e. division by 0)
-        self.epsilon = 1e-5
-
-        # Cosine similarity weights
-        self.sim_weight = nn.Parameter(torch.tensor([5.])).to(loss_device)
-        self.sim_bias = nn.Parameter(torch.tensor([-1.])).to(loss_device)
-
-    def forward(self, utterances, h_init=None, c_init=None):
-        # implements section 2.1 of https://arxiv.org/pdf/1806.04558.pdf
-        if h_init is None or c_init is None:
-            out, (hidden, cell) = self.lstm(utterances)
-        else:
-            out, (hidden, cell) = self.lstm(utterances, (h_init, c_init))
-
-        # compute the speaker embedding from the hidden state of the final layer
-        final_hidden = hidden[-1]
-        speaker_embedding = self.relu(self.linear(final_hidden))
-
-        # L2-normalize the speaker embedding
-        speaker_embedding = speaker_embedding / (torch.norm(speaker_embedding, dim=1, keepdim=True) + self.epsilon)
-        return speaker_embedding
-
-    def gradient_clipping(self):
-        self.sim_weight.grad *= 0.01
-        self.sim_bias.grad *= 0.01
-
-        # PyTorch clips gradients if the norm is greater than max_norm
-        clip_grad_norm_(self.parameters(), max_norm=3, norm_type=2)
-
-    def similarity_matrix(self, embeds, debug=False):
-        # calculate s_ji,k from section 2.1 of the GE2E paper
-        # output matrix is the cosine similarity between each utterance and the centroid of each speaker
-        # embeds input size: (speakers, utterances, embedding size)
-
-        # Speaker centroids
-        # Equal to the average of the speaker's utterance embeddings
-        # Used for negative examples (utterance compared against a false speaker)
-        # Equation 1 in the paper
-        # size: (speakers, 1, embedding size)
-        speaker_centroid = torch.mean(embeds, dim=1, keepdim=True)
-
-        # Utterance-exclusive centroids
-        # Equal to the average of the speaker's utterance embeddings, excluding the ith utterance
-        # Used for positive examples (utterance compared against its true speaker; the centroid excludes the utterance)
-        # Equation 8 in the paper
-        # size: (speakers, utterances, embedding size)
-        num_utterance = embeds.shape[1]
-        utter_ex_centroid = (torch.sum(embeds, dim=1, keepdim=True) - embeds) / (num_utterance - 1)
-
-        if debug:
-            print("e", embeds.shape)
-            print(embeds)
-            print("sc", speaker_centroid.shape)
-            print(speaker_centroid)
-            print("uc", utter_ex_centroid.shape)
-            print(utter_ex_centroid)
-
-        # Create positive and negative masks
-        num_speaker = embeds.shape[0]
-        i = torch.eye(num_speaker, dtype=torch.int)
-        pos_mask = torch.where(i)
-        neg_mask = torch.where(1 - i)
-
-        if debug:
-            print("pm", len(pos_mask), len(pos_mask[0]))
-            print(pos_mask)
-            print("nm", len(neg_mask), len(neg_mask[0]))
-            print(neg_mask)
-
-        # Compile the similarity matrix
-        # size: (speakers, utterances, speakers)
-        # initial size is (speakers, speakers, utterances) for easier vectorization
-        sim_matrix = torch.zeros(num_speaker, num_speaker, num_utterance).to(self.loss_device)
-        sim_matrix[pos_mask] = nn.functional.cosine_similarity(embeds, utter_ex_centroid, dim=2)
-        sim_matrix[neg_mask] = nn.functional.cosine_similarity(embeds[neg_mask[0]], speaker_centroid[neg_mask[1]], dim=2)
-        if debug:
-            print("sm", sim_matrix.shape)
-            print("pos vals", sim_matrix[pos_mask])
-            print("neg vals", sim_matrix[neg_mask])
-            print(sim_matrix)
-
-        sim_matrix = sim_matrix.permute(0, 2, 1)
-
-        if debug:
-            print("sm", sim_matrix.shape)
-            print(sim_matrix)
-            print("cos sim weight", self.sim_weight)
-            print("cos sim bias", self.sim_bias)
-
-        # Apply weight / bias
-        sim_matrix = sim_matrix * self.sim_weight + self.sim_bias
-        return sim_matrix
-
-    def softmax_loss(self, embeds):
-        """
-        computes the softmax loss as defined by eq. 6 in the GE2E paper
-        :param embeds: shape (speakers, utterances, embedding size)
-        :return: computed softmax loss
-        """
-        # per the GE2E paper, the softmax loss of eq. 6 performs slightly better
-        # on Text-Independent Speaker Verification tasks.
-        # ref section 2.1 of the GE2E paper
-        speaker_count = embeds.shape[0]
-
-        # (speaker, utterance, speaker)
-        similarities = self.similarity_matrix(embeds)
-
-        # eq. 6
-        loss_matrix = -similarities[torch.arange(0, speaker_count), :, torch.arange(0, speaker_count)] + \
-                      torch.log(torch.sum(torch.exp(similarities), dim=2))
-
-        # eq. 10
-        return torch.sum(loss_matrix)
-
-    def contrast_loss(self, embeds):
-        """
-        computes the contrast loss as defined by eq. 7 in the GE2E paper
-        :param embeds: shape (speakers, utterances, embedding size)
-        :return: computed contrast loss
-        """
-        # per the GE2E paper, the contrast loss of eq. 7 performs slightly better
-        # on Text-Dependent Speaker Verification tasks.
-        # ref section 2.1 of the GE2E paper
-        speaker_count, utterance_count = embeds.shape[0:2]
-
-        # (speaker, utterance, speaker)
-        similarities = self.similarity_matrix(embeds)
-
-        # Janky indexing to enforce k != j
-        mask = torch.ones(similarities.shape, dtype=torch.bool)
-        mask[torch.arange(speaker_count), :, torch.arange(speaker_count)] = False
-        closest_neighbors, _ = torch.max(similarities[mask].reshape(speaker_count, utterance_count, speaker_count - 1), dim=2)
-
-        # Positive influence of matching embeddings
-        matching_embedding = similarities[torch.arange(0, speaker_count), :, torch.arange(0, speaker_count)]
-
-        # eq. 7
-        loss_matrix = 1 - torch.sigmoid(matching_embedding) + torch.sigmoid(closest_neighbors)
-
-        # eq. 10
-        return torch.sum(loss_matrix)
-
-    def accuracy(self, embeds):
-        """
-        computes argmax accuracy
-        :param embeds: shape (speakers, utterances, embedding size)
-        :return: accuracy
-        """
-        num_speaker, num_utter = embeds.shape[:2]
-
-        similarities = self.similarity_matrix(embeds)
-        preds = torch.argmax(similarities, dim=2)
-        preds_one_hot = torch.nn.functional.one_hot(preds, num_classes=num_speaker)
-
-        actual = torch.arange(num_speaker).unsqueeze(1).repeat(1, num_utter)
-        actual_one_hot = torch.nn.functional.one_hot(actual, num_classes=num_speaker)
-
-        return torch.sum(preds_one_hot * actual_one_hot) / (num_speaker * num_utter)
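To make the deleted encoder easier to follow, here is a minimal, hypothetical training-step sketch against the SpeakerEncoder class above, as it existed at speaker/model.py before this change. The batch shapes, optimizer, and learning rate are assumptions for illustration only, not values from this repository.

import torch
from speaker.model import SpeakerEncoder  # module path prior to this deletion

# Assumed toy dimensions: 4 speakers x 5 utterances of 160 mel frames (80 bins each)
speakers, utterances, frames, n_mels = 4, 5, 160, 80
device = loss_device = torch.device("cpu")

model = SpeakerEncoder(device, loss_device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

# Flatten speakers x utterances into the LSTM batch dimension
mels = torch.randn(speakers * utterances, frames, n_mels)
embeds = model(mels)                            # (speakers * utterances, 256)
embeds = embeds.view(speakers, utterances, -1)  # (speakers, utterances, 256)

loss = model.softmax_loss(embeds)               # GE2E softmax loss (eq. 6 / eq. 10)
accuracy = model.accuracy(embeds)               # argmax accuracy over the similarity matrix

optimizer.zero_grad()
loss.backward()
model.gradient_clipping()                       # scales sim weight/bias grads, clips the global norm
optimizer.step()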
speaker/preprocess.py DELETED
@@ -1 +0,0 @@
-# Reference https://github.com/CorentinJ/Real-Time-Voice-Cloning/blob/0713f860a3dd41afb56e83cff84dbdf589d5e11a/encoder/preprocess.py#L16

speaker/saved_model.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6ccc0abcd0fb77104be73e6675454a06e7797bf1d4a1177181c32b648e9d75a9
-size 5697243

speaker/saved_model_e175.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:52ba80266b9f45fc3d825942aae40858eeaaa73994ba86e9ed017a533dc13323
-size 5861083

speaker/saved_models/dog.txt DELETED
File without changes

speaker/saved_models/saved_model_e175.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:52ba80266b9f45fc3d825942aae40858eeaaa73994ba86e9ed017a533dc13323
-size 5861083

speaker/saved_models/saved_model_e273_LargeBatch.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fbaaaa28a7d58b1316f322e1f33a5a68c00046b7b89a823ae7d987a632b8c7d6
-size 5861083

speaker/saved_models/saved_model_e300.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d9be127fb61b6d2306ff877ab2184f187450953a5555a6751b3616b5ed84e78a
-size 5698805
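The .pt entries above are Git LFS pointers to saved encoder checkpoints, so their contents are not visible in this diff. Assuming they hold a plain state_dict for the deleted SpeakerEncoder (this is an assumption; the serialization format is not shown here), loading one could look roughly like the sketch below.

import torch
from speaker.model import SpeakerEncoder  # module path prior to this deletion

device = loss_device = torch.device("cpu")
model = SpeakerEncoder(device, loss_device)

# Hypothetical path; the checkpoint format (bare state_dict vs. wrapped dict
# vs. fully pickled module) is an assumption and may need adjusting.
checkpoint = torch.load("speaker/saved_models/saved_model_e300.pt", map_location=device)
if isinstance(checkpoint, dict):
    model.load_state_dict(checkpoint, strict=False)
else:
    model = checkpoint  # a fully pickled module was saved
model.eval()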