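# Gradio demo for TTTS, an unofficial Chinese text-to-speech model based on XTTS.
# The print() calls below log progress, since the model-library imports are slow.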
print("import gradio")
import gradio as gr
from scipy.io.wavfile import write
import tempfile
print("import ppn")
from pypinyin import lazy_pinyin, Style
print("import torch")
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("import ttts")
from ttts.utils.infer_utils import load_model
print("import mel")
from ttts.vocoder.feature_extractors import MelSpectrogramFeatures
print("import torchaudio")
import torchaudio
MODELS = {
    'vqvae.pth': './TTTS/vae-30.pt',
    'gpt.pth': './TTTS/gpt-70.pt',
    'clvp2.pth': '',
    'diffusion.pth': './TTTS/diffusion-855.pt',
    'vocoder.pth': './ttts/pretrained_models/pytorch_model.bin',
    'rlg_auto.pth': '',
    'rlg_diffuser.pth': '',
}
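# Only the 'gpt.pth' and 'diffusion.pth' entries are referenced below; the other
# checkpoints listed here are never loaded by this demo.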
print("import tokenizer")
from ttts.gpt.voice_tokenizer import VoiceBpeTokenizer
print("import f")
import torch.nn.functional as F
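# Load the reference (conditioning) audio that defines the voice to clone:
# downmix to mono, resample to 24 kHz, and compute its mel spectrogram.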
cond_audio = 'ttts/3.wav'
print("loading conditioning audio")
audio, sr = torchaudio.load(cond_audio)
if audio.shape[0] > 1:  # keep only the first channel of multi-channel audio
    audio = audio[0].unsqueeze(0)
audio = torchaudio.transforms.Resample(sr, 24000)(audio)
cond_mel = MelSpectrogramFeatures()(audio).to(device)
print(cond_mel.shape)
auto_conditioning = cond_mel
# Sampling settings. The `settings` dict mirrors the individual variables below,
# which are what the code actually passes to the models.
settings = {'temperature': .8, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
            'top_p': .8, 'cond_free_k': 2.0, 'diffusion_temperature': 1.0}
top_p = .8
temperature = .8
autoregressive_batch_size = 1
length_penalty = 1.0
repetition_penalty = 2.0
max_mel_tokens = 600
from vocos import Vocos
from ttts.diffusion.train import do_spectrogram_diffusion
from ttts.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule
from ttts.diffusion.aa_model import normalize_tacotron_mel

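# Load the vocoder, the GPT acoustic model, and the pinyin BPE tokenizer once at startup.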
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
gpt = load_model('gpt', MODELS['gpt.pth'], './ttts/gpt/config.json', device)
gpt.post_init_gpt2_config(use_deepspeed=False, kv_cache=False, half=False)
tokenizer = VoiceBpeTokenizer('ttts/gpt/gpt_tts_tokenizer.json')
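
# speak() runs the full TTS pipeline: text -> pinyin -> BPE tokens -> GPT mel codes
# -> GPT latents -> diffusion mel spectrogram -> Vocos waveform.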
def speak(text):
    # Convert Chinese text to tone-numbered pinyin, then to BPE token ids.
    pinyin = ' '.join(lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True))
    text_tokens = torch.IntTensor(tokenizer.encode(pinyin)).unsqueeze(0).to(device)
    text_tokens = F.pad(text_tokens, (0, 1))  # append a trailing zero token; may not be necessary
    print(pinyin)
    print(text_tokens)
    
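    # Stage 1: the autoregressive GPT samples discrete mel codes for the text.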
    codes = gpt.inference_speech(auto_conditioning, text_tokens,
                                do_sample=True,
                                top_p=top_p,
                                temperature=temperature,
                                num_return_sequences=autoregressive_batch_size,
                                length_penalty=length_penalty,
                                repetition_penalty=repetition_penalty,
                                max_generate_length=max_mel_tokens)
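    # Stage 2: a second GPT pass returns the continuous latents for those codes.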
    latent = gpt(auto_conditioning, text_tokens,
                 torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
                 torch.tensor([codes.shape[-1] * gpt.mel_length_compression], device=text_tokens.device),
                 return_latent=True, clip_inputs=False).transpose(1, 2)
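    # Stage 3: a diffusion decoder turns the latents into a mel spectrogram.
    # Note: the diffusion model is reloaded on every call; it could be loaded
    # once at startup like the GPT model above.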
    diffusion = load_model('diffusion', MODELS['diffusion.pth'], './ttts/diffusion/config.yaml', device)
    diffuser = SpacedDiffusion(use_timesteps=space_timesteps(1000, [50]), model_mean_type='epsilon',
                               model_var_type='learned_range', loss_type='mse',
                               betas=get_named_beta_schedule('linear', 1000),
                               conditioning_free=True, conditioning_free_k=2., sampler='dpm++2m')
    diffusion_conditioning = normalize_tacotron_mel(cond_mel)
    mel = do_spectrogram_diffusion(diffusion, diffuser, latent, diffusion_conditioning, temperature=1.0).detach().cpu()
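    # Stage 4: the Vocos vocoder converts the mel spectrogram to a 24 kHz waveform.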
    wav = vocos.decode(mel).detach().cpu()
    # Write the waveform to a temp file and return its path for the gr.Audio output.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
        torchaudio.save(f.name, wav, 24000)
        return f.name
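
# Example usage (assuming the checkpoints above are present):
#   wav_path = speak('大家好')  # path to a temporary 24 kHz WAV file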

with gr.Blocks() as demo:
    gr.Markdown('# TTTS\n\nAn **unofficial** demo of [TTTS](https://github.com/adelacvg/ttts) based on XTTS. TTTS only supports Chinese.')
    txt = gr.Textbox(label="Text to say", interactive=True)
    btn = gr.Button("Say")
    aud = gr.Audio(interactive=False)
    btn.click(speak, inputs=txt, outputs=aud)
    gr.Examples(
        examples=[
            '大家好,今天来点大家想看的东西。',
            '最近,文本转语音领域发展迅速。',
            '那是一个漆黑的暴风雨之夜;大雨倾盆而下,只是偶尔被一阵狂风挡住,狂风席卷了街道(因为我们的场景就在伦敦),沿着房顶哗哗作响,猛烈地搅动着在黑暗中挣扎的稀疏灯火。',
            '你会很高兴听到你怀有如此不祥预感的一项事业的开始并没有发生任何灾难。 我昨天到达这里,我的首要任务是向我亲爱的妹妹保证我的幸福,并增强我事业成功的信心。',
        ],
        inputs=txt,
        outputs=aud,
        fn=speak,
        cache_examples=True
    )
    
demo.queue(max_size=20, api_open=False).launch(show_api=False)