maolili committed on
Commit c3eaaf0 · 1 Parent(s): 6626ab7

Delete voice.py

Files changed (1)
  1. voice.py +0 -532
voice.py DELETED
@@ -1,532 +0,0 @@
- import os
- import librosa
- import commons
- import re
- import numpy as np
- import torch
- import xml.etree.ElementTree as ET
- import config
- import logging
- import soundfile as sf
- from torch import no_grad, LongTensor, inference_mode, FloatTensor
- from io import BytesIO
- from graiax import silkcoder
- from utils.nlp import sentence_split
- from mel_processing import spectrogram_torch
- from text import text_to_sequence
- from models import SynthesizerTrn
- from utils import utils
-
- # torch.set_num_threads(1)  # set the number of torch threads to 1
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-
- class vits:
-     def __init__(self, model, config, model_=None, model_type=None):
-         self.model_type = model_type
-         self.hps_ms = utils.get_hparams_from_file(config)
-         self.n_speakers = getattr(self.hps_ms.data, 'n_speakers', 0)
-         self.n_symbols = len(getattr(self.hps_ms, 'symbols', []))
-         self.speakers = getattr(self.hps_ms, 'speakers', ['0'])
-         self.use_f0 = getattr(self.hps_ms.data, 'use_f0', False)
-         self.emotion_embedding = getattr(self.hps_ms.data, 'emotion_embedding',
-                                          getattr(self.hps_ms.model, 'emotion_embedding', False))
-         self.bert_embedding = getattr(self.hps_ms.data, 'bert_embedding',
-                                       getattr(self.hps_ms.model, 'bert_embedding', False))
-         self.hps_ms.model.emotion_embedding = self.emotion_embedding
-         self.hps_ms.model.bert_embedding = self.bert_embedding
-
-         self.net_g_ms = SynthesizerTrn(
-             self.n_symbols,
-             self.hps_ms.data.filter_length // 2 + 1,
-             self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
-             n_speakers=self.n_speakers,
-             **self.hps_ms.model)
-         _ = self.net_g_ms.eval()
-
-         # load model
-         self.load_model(model, model_)
-
-     def load_model(self, model, model_=None):
-         utils.load_checkpoint(model, self.net_g_ms)
-         self.net_g_ms.to(device)
-         if self.model_type == "hubert":
-             self.hubert = model_
-         elif self.model_type == "w2v2":
-             self.emotion_reference = model_
-
-     def get_cleaned_text(self, text, hps, cleaned=False):
-         if cleaned:
-             text_norm = text_to_sequence(text, hps.symbols, [])
-         else:
-             if self.bert_embedding:
-                 text_norm, char_embed = text_to_sequence(text, hps.symbols, hps.data.text_cleaners,
-                                                          bert_embedding=self.bert_embedding)
-                 text_norm = LongTensor(text_norm)
-                 return text_norm, char_embed
-             else:
-                 text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
-         if hps.data.add_blank:
-             text_norm = commons.intersperse(text_norm, 0)
-         text_norm = LongTensor(text_norm)
-         return text_norm
-
-     def get_cleaner(self):
-         return getattr(self.hps_ms.data, 'text_cleaners', [None])[0]
-
-     def get_speakers(self, escape=False):
-         return self.speakers
-
-     def infer(self, params):
-         with no_grad():
-             x_tst = params.get("stn_tst").unsqueeze(0).to(device)
-             x_tst_lengths = LongTensor([params.get("stn_tst").size(0)]).to(device)
-             x_tst_prosody = torch.FloatTensor(params.get("char_embeds")).unsqueeze(0).to(
-                 device) if self.bert_embedding else None
-             sid = params.get("sid").to(device) if not self.bert_embedding else None
-             emotion = params.get("emotion").to(device) if self.emotion_embedding else None
-
-             audio = self.net_g_ms.infer(x=x_tst,
-                                         x_lengths=x_tst_lengths,
-                                         sid=sid,
-                                         noise_scale=params.get("noise_scale"),
-                                         noise_scale_w=params.get("noise_scale_w"),
-                                         length_scale=params.get("length_scale"),
-                                         emotion_embedding=emotion,
-                                         bert=x_tst_prosody)[0][0, 0].data.float().cpu().numpy()
-
-         torch.cuda.empty_cache()
-
-         return audio
-
-     def get_infer_param(self, length_scale, noise_scale, noise_scale_w, text=None, speaker_id=None, audio_path=None,
-                         emotion=None, cleaned=False, f0_scale=1):
-         emo = None
-         char_embeds = None
-         if self.model_type != "hubert":
-             if self.bert_embedding:
-                 stn_tst, char_embeds = self.get_cleaned_text(text, self.hps_ms, cleaned=cleaned)
-                 sid = None
-             else:
-                 stn_tst = self.get_cleaned_text(text, self.hps_ms, cleaned=cleaned)
-                 sid = LongTensor([speaker_id])
-
-             if self.model_type == "w2v2":
-                 # if emotion_reference.endswith('.npy'):
-                 #     emotion = np.load(emotion_reference)
-                 #     emotion = FloatTensor(emotion).unsqueeze(0)
-                 # else:
-                 #     audio16000, sampling_rate = librosa.load(
-                 #         emotion_reference, sr=16000, mono=True)
-                 #     emotion = self.w2v2(audio16000, sampling_rate)[
-                 #         'hidden_states']
-                 #     emotion_reference = re.sub(
-                 #         r'\..*$', '', emotion_reference)
-                 #     np.save(emotion_reference, emotion.squeeze(0))
-                 #     emotion = FloatTensor(emotion)
-                 emo = torch.FloatTensor(self.emotion_reference[emotion]).unsqueeze(0)
-
-         elif self.model_type == "hubert":
-             if self.use_f0:
-                 audio, sampling_rate = librosa.load(audio_path, sr=self.hps_ms.data.sampling_rate, mono=True)
-                 audio16000 = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
-             else:
-                 audio16000, sampling_rate = librosa.load(audio_path, sr=16000, mono=True)
-
-             with inference_mode():
-                 units = self.hubert.units(FloatTensor(audio16000).unsqueeze(0).unsqueeze(0)).squeeze(0).numpy()
-                 if self.use_f0:
-                     f0 = librosa.pyin(audio,
-                                       sr=sampling_rate,
-                                       fmin=librosa.note_to_hz('C0'),
-                                       fmax=librosa.note_to_hz('C7'),
-                                       frame_length=1780)[0]
-                     target_length = len(units[:, 0])
-                     f0 = np.nan_to_num(np.interp(np.arange(0, len(f0) * target_length, len(f0)) / target_length,
-                                                  np.arange(0, len(f0)), f0)) * f0_scale
-                     units[:, 0] = f0 / 10
-
-             stn_tst = FloatTensor(units)
-             sid = LongTensor([speaker_id])
-
-         params = {"length_scale": length_scale, "noise_scale": noise_scale,
-                   "noise_scale_w": noise_scale_w, "stn_tst": stn_tst,
-                   "sid": sid, "emotion": emo, "char_embeds": char_embeds}
-
-         return params
-
-     def get_tasks(self, voice):
-         text = voice.get("text", None)
-         speaker_id = voice.get("id", 0)
-         length = voice.get("length", 1)
-         noise = voice.get("noise", 0.667)
-         noisew = voice.get("noisew", 0.8)
-         max = voice.get("max", 50)
-         lang = voice.get("lang", "auto")
-         speaker_lang = voice.get("speaker_lang", None)
-         audio_path = voice.get("audio_path", None)
-         emotion = voice.get("emotion", 0)
-
-         # strip all redundant whitespace
-         if text is not None: text = re.sub(r'\s+', ' ', text).strip()
-
-         tasks = []
-         if self.model_type == "vits":
-             sentence_list = sentence_split(text, max, lang, speaker_lang)
-             for sentence in sentence_list:
-                 params = self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length,
-                                               noise_scale=noise, noise_scale_w=noisew)
-                 tasks.append(params)
-
-         elif self.model_type == "hubert":
-             params = self.get_infer_param(speaker_id=speaker_id, length_scale=length, noise_scale=noise,
-                                           noise_scale_w=noisew, audio_path=audio_path)
-             tasks.append(params)
-
-         elif self.model_type == "w2v2":
-             sentence_list = sentence_split(text, max, lang, speaker_lang)
-             for sentence in sentence_list:
-                 params = self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length,
-                                               noise_scale=noise, noise_scale_w=noisew, emotion=emotion)
-                 tasks.append(params)
-
-         return tasks
-
-     def get_audio(self, voice, auto_break=False):
-         tasks = self.get_tasks(voice)
-         # insert a 0.75s pause so that separately synthesized segments do not join abruptly when concatenated
-         brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
-
-         audios = []
-         for task in tasks:
-             if auto_break:
-                 chunk = np.concatenate((self.infer(task), brk), axis=0)
-             else:
-                 chunk = self.infer(task)
-             audios.append(chunk)
-
-         audio = np.concatenate(audios, axis=0)
-         return audio
-
-     def get_stream_audio(self, voice, auto_break=False):
-         tasks = self.get_tasks(voice)
-
-         brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
-
-         for task in tasks:
-             if auto_break:
-                 chunk = np.concatenate((self.infer(task), brk), axis=0)
-             else:
-                 chunk = self.infer(task)
-
-             yield chunk
-
-     def voice_conversion(self, voice):
-         audio_path = voice.get("audio_path")
-         original_id = voice.get("original_id")
-         target_id = voice.get("target_id")
-
-         audio = utils.load_audio_to_torch(
-             audio_path, self.hps_ms.data.sampling_rate)
-
-         y = audio.unsqueeze(0)
-
-         spec = spectrogram_torch(y, self.hps_ms.data.filter_length,
-                                  self.hps_ms.data.sampling_rate, self.hps_ms.data.hop_length,
-                                  self.hps_ms.data.win_length,
-                                  center=False)
-         spec_lengths = LongTensor([spec.size(-1)])
-         sid_src = LongTensor([original_id])
-
-         with no_grad():
-             sid_tgt = LongTensor([target_id])
-             audio = self.net_g_ms.voice_conversion(spec.to(device),
-                                                    spec_lengths.to(device),
-                                                    sid_src=sid_src.to(device),
-                                                    sid_tgt=sid_tgt.to(device))[0][0, 0].data.cpu().float().numpy()
-
-         torch.cuda.empty_cache()
-
-         return audio
-
-
- class TTS:
-     def __init__(self, voice_obj, voice_speakers):
-         self._voice_obj = voice_obj
-         self._voice_speakers = voice_speakers
-         self._strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
-         self._speakers_count = sum([len(self._voice_speakers[i]) for i in self._voice_speakers])
-         self._vits_speakers_count = len(self._voice_speakers["VITS"])
-         self._hubert_speakers_count = len(self._voice_speakers["HUBERT-VITS"])
-         self._w2v2_speakers_count = len(self._voice_speakers["W2V2-VITS"])
-         self.dem = None
-
-         # Initialization information
-         self.logger = logging.getLogger("vits-simple-api")
-         self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
-         self.logger.info(f'device:{device} device.type:{device.type}')
-
-         if getattr(config, "DIMENSIONAL_EMOTION_MODEL", None) is not None:
-             try:
-                 import audonnx
-                 root = os.path.dirname(config.DIMENSIONAL_EMOTION_MODEL)
-                 model_file = config.DIMENSIONAL_EMOTION_MODEL
-                 self.dem = audonnx.load(root=root, model_file=model_file)
-             except Exception as e:
-                 self.logger.warning(f"Load DIMENSIONAL_EMOTION_MODEL failed {e}")
-
-         if self._vits_speakers_count != 0: self.logger.info(f"[VITS] {self._vits_speakers_count} speakers")
-         if self._hubert_speakers_count != 0: self.logger.info(f"[hubert] {self._hubert_speakers_count} speakers")
-         if self._w2v2_speakers_count != 0: self.logger.info(f"[w2v2] {self._w2v2_speakers_count} speakers")
-         self.logger.info(f"{self._speakers_count} speakers in total")
-         if self._speakers_count == 0:
-             self.logger.warning("No model was loaded")
-
-     @property
-     def voice_speakers(self):
-         return self._voice_speakers
-
-     @property
-     def speakers_count(self):
-         return self._speakers_count
-
-     @property
-     def vits_speakers_count(self):
-         return self._vits_speakers_count
-
-     @property
-     def hubert_speakers_count(self):
-         return self._hubert_speakers_count
-
-     @property
-     def w2v2_speakers_count(self):
-         return self._w2v2_speakers_count
-
-     def encode(self, sampling_rate, audio, format):
-         with BytesIO() as f:
-             if format.upper() == 'OGG':
-                 sf.write(f, audio, sampling_rate, format="ogg")
-                 return BytesIO(f.getvalue())
-             elif format.upper() == 'SILK':
-                 sf.write(f, audio, sampling_rate, format="wav")
-                 return BytesIO(silkcoder.encode(f))
-             elif format.upper() == 'MP3':
-                 sf.write(f, audio, sampling_rate, format="mp3")
-                 return BytesIO(f.getvalue())
-             elif format.upper() == 'WAV':
-                 sf.write(f, audio, sampling_rate, format="wav")
-                 return BytesIO(f.getvalue())
-             elif format.upper() == 'FLAC':
-                 sf.write(f, audio, sampling_rate, format="flac")
-                 return BytesIO(f.getvalue())
-             else:
-                 raise ValueError(f"Unsupported format:{format}")
-
-     def convert_time_string(self, time_string):
-         time_value = float(re.findall(r'\d+\.?\d*', time_string)[0])
-         time_unit = re.findall(r'[a-zA-Z]+', time_string)[0].lower()
-
-         if time_unit.upper() == 'MS':
-             return time_value / 1000
-         elif time_unit.upper() == 'S':
-             return time_value
-         elif time_unit.upper() == 'MIN':
-             return time_value * 60
-         elif time_unit.upper() == 'H':
-             return time_value * 3600
-         elif time_unit.upper() == 'D':
-             return time_value * 24 * 3600  # surely nobody actually writes D, right?
-         else:
-             raise ValueError("Unsupported time unit: {}".format(time_unit))
-
-     def generate_audio_chunks(self, audio):
-         chunk_size = 4096
-         while True:
-             chunk = audio.read(chunk_size)
-             if not chunk:
-                 break
-             yield chunk
-
-     def parse_ssml(self, ssml):
-         root = ET.fromstring(ssml)
-         format = root.attrib.get("format", "wav")
-         voice_tasks = []
-         brk_count = 0
-         strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
-
-         for element in root.iter():
-             if element.tag == "voice":
-                 id = int(element.attrib.get("id", root.attrib.get("id", config.ID)))
-                 lang = element.attrib.get("lang", root.attrib.get("lang", config.LANG))
-                 length = float(element.attrib.get("length", root.attrib.get("length", config.LENGTH)))
-                 noise = float(element.attrib.get("noise", root.attrib.get("noise", config.NOISE)))
-                 noisew = float(element.attrib.get("noisew", root.attrib.get("noisew", config.NOISEW)))
-                 max = int(element.attrib.get("max", root.attrib.get("max", "0")))
-                 # defaults to vits if not specified
-                 model = element.attrib.get("model", root.attrib.get("model", "vits"))
-                 # only w2v2-vits/emotion-vits support emotion
-                 emotion = int(element.attrib.get("emotion", root.attrib.get("emotion", 0)))
-
-                 voice_element = ET.tostring(element, encoding='unicode')
-
-                 pattern_voice = r'<voice.*?>(.*?)</voice>'
-                 pattern_break = r'<break\s*?(.*?)\s*?/>'
-
-                 matches_voice = re.findall(pattern_voice, voice_element)[0]
-                 matches_break = re.split(pattern_break, matches_voice)
-                 for match in matches_break:
-                     strength = re.search(r'\s*strength\s*=\s*[\'\"](.*?)[\'\"]', match)
-                     time = re.search(r'\s*time\s*=\s*[\'\"](.*?)[\'\"]', match)
-                     # break tag with a strength attribute
-                     if strength:
-                         brk = strength_dict[strength.group(1)]
-                         voice_tasks.append({"break": brk})
-                         brk_count += 1
-                     # break tag with a time attribute
-                     elif time:
-                         brk = self.convert_time_string(time.group(1))
-                         voice_tasks.append({"break": brk})
-                         brk_count += 1
-                     # an empty match means a bare break tag, so use the default 0.75s pause
-                     elif match == "":
-                         voice_tasks.append({"break": 0.75})
-                         brk_count += 1
-                     # anything left inside the voice tag besides break is text
-                     else:
-                         voice_tasks.append({"id": id,
-                                             "text": match,
-                                             "lang": lang,
-                                             "length": length,
-                                             "noise": noise,
-                                             "noisew": noisew,
-                                             "max": max,
-                                             "model": model,
-                                             "emotion": emotion
-                                             })
-
-                 # 0.75s pause at the end of each segment
-                 voice_tasks.append({"break": 0.75})
-             elif element.tag == "break":
-                 # brk_count > 0 means the break tags inside voice elements were already handled
-                 if brk_count > 0:
-                     brk_count -= 1
-                     continue
-                 brk = strength_dict.get(element.attrib.get("strength"),
-                                         self.convert_time_string(element.attrib.get("time", "750ms")))
-                 voice_tasks.append({"break": brk})
-
-         for i in voice_tasks:
-             self.logger.debug(i)
-
-         return voice_tasks, format
-
-     def create_ssml_infer_task(self, ssml, fname):
-         voice_tasks, format = self.parse_ssml(ssml)
-
-         audios = []
-         for voice in voice_tasks:
-             if voice.get("break"):
-                 audios.append(np.zeros(int(voice.get("break") * 22050), dtype=np.int16))
-             else:
-                 model = voice.get("model").upper()
-                 if model != "VITS" and model != "W2V2-VITS" and model != "EMOTION-VITS":
-                     raise ValueError(f"Unsupported model: {voice.get('model')}")
-                 voice_obj = self._voice_obj[model][voice.get("id")][1]
-                 voice["id"] = self._voice_obj[model][voice.get("id")][0]
-                 audio = voice_obj.get_audio(voice)
-                 audios.append(audio)
-
-         audio = np.concatenate(audios, axis=0)
-         encoded_audio = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
-         if config.SAVE_AUDIO:
-             path = f"{config.CACHE_PATH}/{fname}"
-             utils.save_audio(encoded_audio.getvalue(), path)
-         return encoded_audio, format
-
-     def vits_infer(self, voice, fname):
-         format = voice.get("format", "wav")
-         voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
-         voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
-         sampling_rate = voice_obj.hps_ms.data.sampling_rate
-         audio = voice_obj.get_audio(voice, auto_break=True)
-         encoded_audio = self.encode(sampling_rate, audio, format)
-         if config.SAVE_AUDIO:
-             path = f"{config.CACHE_PATH}/{fname}"
-             utils.save_audio(encoded_audio.getvalue(), path)
-         return encoded_audio
-
-     def stream_vits_infer(self, voice, fname):
-         format = voice.get("format", "wav")
-         voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
-         voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
-         sampling_rate = voice_obj.hps_ms.data.sampling_rate
-         generator = voice_obj.get_stream_audio(voice, auto_break=True)
-         audio = BytesIO()
-         for chunk in generator:
-             encoded_audio = self.encode(sampling_rate, chunk, format)
-             for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio):
-                 yield encoded_audio_chunk
-             if config.SAVE_AUDIO:
-                 audio.write(encoded_audio.getvalue())
-         if config.SAVE_AUDIO:
-             path = f"{config.CACHE_PATH}/{fname}"
-             utils.save_audio(audio.getvalue(), path)
-
-     def hubert_vits_infer(self, voice, fname):
-         format = voice.get("format", "wav")
-         voice_obj = self._voice_obj["HUBERT-VITS"][voice.get("id")][1]
-         voice["id"] = self._voice_obj["HUBERT-VITS"][voice.get("id")][0]
-         sampling_rate = voice_obj.hps_ms.data.sampling_rate
-         audio = voice_obj.get_audio(voice)
-         encoded_audio = self.encode(sampling_rate, audio, format)
-         if config.SAVE_AUDIO:
-             path = f"{config.CACHE_PATH}/{fname}"
-             utils.save_audio(encoded_audio.getvalue(), path)
-         return encoded_audio
-
-     def w2v2_vits_infer(self, voice, fname):
-         format = voice.get("format", "wav")
-         voice_obj = self._voice_obj["W2V2-VITS"][voice.get("id")][1]
-         voice["id"] = self._voice_obj["W2V2-VITS"][voice.get("id")][0]
-         sampling_rate = voice_obj.hps_ms.data.sampling_rate
-         audio = voice_obj.get_audio(voice, auto_break=True)
-         encoded_audio = self.encode(sampling_rate, audio, format)
-         if config.SAVE_AUDIO:
-             path = f"{config.CACHE_PATH}/{fname}"
-             utils.save_audio(encoded_audio.getvalue(), path)
-         return encoded_audio
-
-     def vits_voice_conversion(self, voice, fname):
-         original_id = voice.get("original_id")
-         target_id = voice.get("target_id")
-         format = voice.get("format")
-
-         original_id_obj = int(self._voice_obj["VITS"][original_id][2])
-         target_id_obj = int(self._voice_obj["VITS"][target_id][2])
-
-         if original_id_obj != target_id_obj:
-             raise ValueError("speakers are in different VITS models")
-
-         voice["original_id"] = int(self._voice_obj["VITS"][original_id][0])
-         voice["target_id"] = int(self._voice_obj["VITS"][target_id][0])
-
-         voice_obj = self._voice_obj["VITS"][original_id][1]
-         sampling_rate = voice_obj.hps_ms.data.sampling_rate
-
-         audio = voice_obj.voice_conversion(voice)
-         encoded_audio = self.encode(sampling_rate, audio, format)
-         if config.SAVE_AUDIO:
-             path = f"{config.CACHE_PATH}/{fname}"
-             utils.save_audio(encoded_audio.getvalue(), path)
-         return encoded_audio
-
-     def get_dimensional_emotion_npy(self, audio):
-         if self.dem is None:
-             raise ValueError("Please configure DIMENSIONAL_EMOTION_MODEL path in config.py")
-         audio16000, sampling_rate = librosa.load(audio, sr=16000, mono=True)
-         emotion = self.dem(audio16000, sampling_rate)['hidden_states']
-         emotion_npy = BytesIO()
-         np.save(emotion_npy, emotion.squeeze(0))
-         emotion_npy.seek(0)
-
-         return emotion_npy
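
For reference, a minimal sketch of how the classes in the deleted voice.py were wired together, based only on the code above. The checkpoint/config paths, the speaker-mapping tuples, and the output file name are hypothetical placeholders (not part of this commit), and the voice dict mirrors the keys that get_tasks and vits_infer read.

# Hypothetical usage sketch of the deleted voice.py API; paths and IDs are placeholders.
from voice import vits, TTS

# Load a single VITS model; "model.pth" and "config.json" are assumed example paths.
obj = vits(model="model.pth", config="config.json", model_type="vits")

# TTS expects {model_type: [(real_speaker_id, vits_obj, model_index), ...]}
# plus a matching speaker table, as implied by vits_infer/vits_voice_conversion above.
voice_obj = {"VITS": [(0, obj, 0)], "HUBERT-VITS": [], "W2V2-VITS": []}
voice_speakers = {"VITS": obj.get_speakers(), "HUBERT-VITS": [], "W2V2-VITS": []}

tts = TTS(voice_obj, voice_speakers)
encoded = tts.vits_infer({"text": "Hello", "id": 0, "format": "wav", "lang": "auto"}, "out.wav")
with open("out.wav", "wb") as f:
    f.write(encoded.getvalue())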