linly / TTS /PaddleTTS.py
David Victor
init
bc3753a
raw
history blame
4.72 kB
import os
import subprocess

from paddlespeech.cli.tts.infer import TTSExecutor
"""
PaddleSpeech
声码器说明:这里预置了三种声码器【PWGan】【WaveRnn】【HifiGan】,三种声码器的效果和生成时间有比较大的差距,请根据自己的需要进行选择。不过实际只选用了【PWGan】和【HifiGan】两种,因为 WaveRNN 太慢了
| 声码器 | 音频质量 | 生成速度 |
| :----: | :----: | :----: |
| PWGan | 中等 | 中等 |
| WaveRnn | 高 | 非常慢(耐心等待) |
| HifiGan | 低 | 快 |
这些PaddleSpeech中的样例主要按数据集分类,我们主要使用的TTS数据集有:
CSMSC (普通话单发音人)
AISHELL3 (普通话多发音人)
LJSpeech (英文单发音人)
VCTK (英文多发音人)
PaddleSpeech 的 TTS 模型具有以下映射关系:
tts0 - Tacotron2
tts1 - TransformerTTS
tts2 - SpeedySpeech
tts3 - FastSpeech2
voc0 - WaveFlow
voc1 - Parallel WaveGAN
voc2 - MelGAN
voc3 - MultiBand MelGAN
voc4 - Style MelGAN
voc5 - HiFiGAN
vc0 - Tacotron2 Voice Clone with GE2E
vc1 - FastSpeech2 Voice Clone with GE2E
以下是 PaddleSpeech 提供的可以被命令行和 python API 使用的预训练模型列表:
- 声学模型
| 模型 | 语言 |
| :--- | :---: |
| speedyspeech_csmsc | zh |
| fastspeech2_csmsc | zh |
| fastspeech2_ljspeech | en |
| fastspeech2_aishell3 | zh |
| fastspeech2_vctk | en |
| fastspeech2_cnndecoder_csmsc | zh |
| fastspeech2_mix | mix |
| tacotron2_csmsc | zh |
| tacotron2_ljspeech | en |
| fastspeech2_male | zh |
| fastspeech2_male | en |
| fastspeech2_male | mix |
| fastspeech2_canton | canton |
- 声码器
| 模型 | 语言 |
| :--- | :---: |
| pwgan_csmsc | zh |
| pwgan_ljspeech | en |
| pwgan_aishell3 | zh |
| pwgan_vctk | en |
| mb_melgan_csmsc | zh |
| style_melgan_csmsc | zh |
| hifigan_csmsc | zh |
| hifigan_ljspeech | en |
| hifigan_aishell3 | zh |
| hifigan_vctk | en |
| wavernn_csmsc | zh |
| pwgan_male | zh |
| hifigan_male | zh |
"""
class PaddleTTS:
    """Text-to-speech helper built on PaddleSpeech.

    ``predict`` expands short model names (for example ``'fastspeech2'`` and
    ``'pwgan'``) plus a language code into full PaddleSpeech model tags and
    synthesizes the given text into a wav file.
    """

    # Short names accepted by ``predict`` before the dataset suffix is added.
    _ACOUSTIC_MODELS = ('tacotron2', 'fastspeech2')
    _MALE_VOCODERS = ('pwgan', 'hifigan')
    _ZH_VOCODERS = ('wavernn', 'pwgan', 'hifigan', 'style_melgan', 'mb_melgan')

    def __init__(self) -> None:
        pass

    def predict(self, text, am, voc, spk_id=174, lang='zh', male=False, save_path='output.wav'):
        """Synthesize ``text`` to ``save_path`` and return the wav location.

        Args:
            text: Text to synthesize.
            am: Short acoustic model name, case-insensitive: ``'tacotron2'``
                or ``'fastspeech2'``. Ignored when ``male`` is true.
            voc: Short vocoder name, case-insensitive (e.g. ``'pwgan'``).
            spk_id: Speaker id passed to PaddleSpeech. Overridden with ``10``
                for ``lang='canton'`` and not used when ``male`` is true.
            lang: ``'zh'``, ``'en'``, ``'mix'`` or ``'canton'``.
            male: When true, use the ``fastspeech2_male`` acoustic model with
                the matching ``<voc>_male`` vocoder via the Python API.
            save_path: Output wav path.

        Returns:
            ``save_path`` when the ``paddlespeech`` command line succeeds,
            otherwise whatever the ``TTSExecutor`` call returns.

        Raises:
            AssertionError: If ``am`` or ``voc`` is not an accepted name.
            ValueError: If ``lang`` is not one of the supported languages.
        """
        use_onnx = True
        voc = voc.lower()
        am = am.lower()

        if male:
            # AssertionError is kept so callers of the former ``assert``-based
            # check keep working; an explicit raise also survives ``python -O``.
            if voc not in self._MALE_VOCODERS:
                raise AssertionError("male voc must be 'pwgan' or 'hifigan'")
            # A fresh executor is created on every call, as before.
            # NOTE(review): presumably so a previously loaded model is not
            # reused when am/voc change -- confirm before caching it.
            self.tts = TTSExecutor()
            return self.tts(
                text=text,
                output=save_path,
                am='fastspeech2_male',
                voc=voc + '_male',
                lang=lang,
                use_onnx=use_onnx,
            )

        # Validate everything before creating the executor or spawning a process.
        am, voc, spk_id = self._resolve_models(am, voc, lang, spk_id)
        self.tts = TTSExecutor()
        print("am:", am, "voc:", voc, "lang:", lang, "male:", male, "spk_id:", spk_id)

        # Argument list instead of a shell string: ``text`` and ``save_path``
        # reach the CLI verbatim, so quotes, ``$`` or backticks in the input
        # can neither break the command nor be executed by a shell.
        cmd = [
            'paddlespeech', 'tts',
            '--am', am,
            '--voc', voc,
            '--input', text,
            '--output', str(save_path),
            '--lang', lang,
            '--spk_id', str(spk_id),
            '--use_onnx', str(use_onnx),
        ]
        try:
            # check=True turns a non-zero exit status into an exception so the
            # Python-API fallback below is actually reachable.
            subprocess.run(cmd, check=True)
        except (OSError, subprocess.CalledProcessError) as exc:
            print("paddlespeech CLI failed, falling back to the Python API:", exc)
            return self.tts(
                text=text,
                output=save_path,
                am=am,
                voc=voc,
                lang=lang,
                spk_id=spk_id,
                use_onnx=use_onnx,
            )
        return save_path

    def _resolve_models(self, am, voc, lang, spk_id):
        """Return the full ``(am, voc, spk_id)`` for lower-cased short names and ``lang``."""
        if am not in self._ACOUSTIC_MODELS:
            raise AssertionError("am must be 'tacotron2' or 'fastspeech2'")
        if lang == 'mix':
            # Mixed Chinese/English synthesis only exists for fastspeech2.
            return 'fastspeech2_mix', voc + '_csmsc', spk_id
        if lang == 'en':
            return am + '_ljspeech', voc + '_ljspeech', spk_id
        if lang == 'zh':
            if voc not in self._ZH_VOCODERS:
                raise AssertionError(
                    "voc must be 'wavernn' or 'pwgan' or 'hifigan' or 'style_melgan' or 'mb_melgan'"
                )
            return am + '_csmsc', voc + '_csmsc', spk_id
        if lang == 'canton':
            # Cantonese uses a fixed model pair and speaker id.
            return 'fastspeech2_canton', 'pwgan_aishell3', 10
        raise ValueError(
            f"unsupported lang {lang!r}; expected 'zh', 'en', 'mix' or 'canton'"
        )
if __name__ == "__main__":
    # Manual smoke test: synthesize a short English sentence with the
    # FastSpeech2 acoustic model and the Parallel WaveGAN vocoder.
    demo_options = {
        'spk_id': 174,
        'lang': 'en',
        'male': False,
        'save_path': 'output.wav',
    }
    synthesizer = PaddleTTS()
    synthesizer.predict("Hello world", 'FastSpeech2', 'PWGan', **demo_options)