Spaces:
Sleeping
Sleeping
File size: 5,126 Bytes
ae81afb 49fb505 bb53d91 ae81afb 49fb505 ae81afb 49fb505 fd4bbc5 ae81afb 49fb505 ae81afb 49fb505 ae81afb 49fb505 ae81afb 49fb505 ae81afb 49fb505 e1be1d0 ae81afb e1be1d0 49fb505 a09114e 49fb505 a09114e 49fb505 ec4ac80 a94d9ab ef39552 a94d9ab 49fb505 66e46f9 7466a37 fd4bbc5 409f492 7466a37 409f492 c62de4e 409f492 fd4bbc5 fff97ab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
# Progress markers: each print flags which (slow) library import is about to
# run, so a hang during startup is easy to locate in the Space logs.
print("import gradio")
import gradio as gr
from scipy.io.wavfile import write
import tempfile
print("import ppn")
from pypinyin import lazy_pinyin, Style
print("import torch")
import torch
# Prefer GPU when available; all tensors and models below are moved here.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("import ttts")
from ttts.utils.infer_utils import load_model
print("import mel")
from ttts.vocoder.feature_extractors import MelSpectrogramFeatures
print("import torchaudio")
import torchaudio
# Checkpoint path for each pipeline stage; an empty string marks a
# component that this demo does not use.
MODELS = {
    'vqvae.pth': './TTTS/vae-30.pt',
    'gpt.pth': './TTTS/gpt-70.pt',
    'clvp2.pth': '',
    'diffusion.pth': './TTTS/diffusion-855.pt',
    'vocoder.pth': './ttts/pretrained_models/pytorch_model.bin',
    'rlg_auto.pth': '',
    'rlg_diffuser.pth': '',
}
print("import tokenizer")
from ttts.gpt.voice_tokenizer import VoiceBpeTokenizer
print("import f")
import torch.nn.functional as F
# Reference clip whose voice the model clones (speaker conditioning prompt).
cond_audio = 'ttts/3.wav'
print("load audio")
audio,sr = torchaudio.load(cond_audio)
if audio.shape[0]>1:
    audio = audio[0].unsqueeze(0)  # down-mix multi-channel audio by keeping channel 0
# Resample to the 24 kHz rate the mel extractor / vocoder pipeline expects.
audio = torchaudio.transforms.Resample(sr, 24000)(audio)
cond_mel = MelSpectrogramFeatures()(audio).to(device)
print(cond_mel.shape)
# Same mel is reused as the GPT's autoregressive conditioning input.
auto_conditioning = cond_mel
# Sampling hyper-parameters for the diffusion decoding stage.
settings = {
    'temperature': 0.8,
    'length_penalty': 1.0,
    'repetition_penalty': 2.0,
    'top_p': 0.8,
    'cond_free_k': 2.0,
    'diffusion_temperature': 1.0,
}
# Sampling hyper-parameters for the autoregressive GPT stage.
top_p = 0.8
temperature = 0.8
autoregressive_batch_size = 1   # one candidate sequence per request
length_penalty = 1.0
repetition_penalty = 2.0
max_mel_tokens = 600            # hard cap on generated mel-token length
from vocos import Vocos
from ttts.diffusion.train import do_spectrogram_diffusion
from ttts.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule
from ttts.diffusion.aa_model import denormalize_tacotron_mel, normalize_tacotron_mel
# print(device)
# Vocoder: turns mel spectrograms back into 24 kHz waveforms.
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
# Autoregressive text-token -> mel-token model (XTTS-style GPT).
gpt = load_model('gpt',MODELS['gpt.pth'], './ttts/gpt/config.json',device)
gpt.post_init_gpt2_config(use_deepspeed=False, kv_cache=False, half=False)
# BPE tokenizer over pinyin text (TTTS supports Chinese only).
tokenizer = VoiceBpeTokenizer('ttts/gpt/gpt_tts_tokenizer.json')
def speak(text):
    """Synthesize Chinese speech for *text*; return the path of a 24 kHz WAV file.

    Pipeline: text -> pinyin -> BPE tokens -> GPT mel-token codes -> GPT
    latent -> diffusion mel spectrogram -> Vocos waveform -> temp .wav.

    The returned file is created with delete=False so gradio can stream it.
    """
    # The diffusion model and diffuser are independent of the input text, so
    # load them once and cache on the function instead of reloading the
    # checkpoint from disk on every request (the original did this per call).
    if not hasattr(speak, '_diffusion'):
        speak._diffusion = load_model('diffusion', MODELS['diffusion.pth'], './ttts/diffusion/config.yaml', device)
        speak._diffuser = SpacedDiffusion(use_timesteps=space_timesteps(1000, [50]), model_mean_type='epsilon',
                model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 1000),
                conditioning_free=True, conditioning_free_k=2., sampler='dpm++2m')
    # Chinese text -> tone-numbered pinyin, the representation the tokenizer expects.
    pinyin = ' '.join(lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True))
    text_tokens = torch.IntTensor(tokenizer.encode(pinyin)).unsqueeze(0).to(device)
    text_tokens = F.pad(text_tokens, (0, 1))  # append a trailing 0 token; may not be necessary
    print(pinyin)
    print(text_tokens)
    # Pure inference: skip autograd bookkeeping to save memory.
    with torch.no_grad():
        codes = gpt.inference_speech(auto_conditioning, text_tokens,
                                     do_sample=True,
                                     top_p=top_p,
                                     temperature=temperature,
                                     num_return_sequences=autoregressive_batch_size,
                                     length_penalty=length_penalty,
                                     repetition_penalty=repetition_penalty,
                                     max_generate_length=max_mel_tokens)
        # Re-run the GPT to extract the latent representation the diffusion
        # decoder conditions on (rather than the discrete codes themselves).
        latent = gpt(auto_conditioning, text_tokens,
                     torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
                     torch.tensor([codes.shape[-1]*gpt.mel_length_compression], device=text_tokens.device),
                     return_latent=True, clip_inputs=False).transpose(1,2)
        diffusion_conditioning = normalize_tacotron_mel(cond_mel)
        mel = do_spectrogram_diffusion(speak._diffusion, speak._diffuser, latent, diffusion_conditioning, temperature=1.0).detach().cpu()
        wav = vocos.decode(mel).detach().cpu()
    # return (24000, wav.numpy())
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
        torchaudio.save(f.name, wav, 24000)
    return f.name
# Gradio UI: a text box, a "Say" button wired to speak(), an audio output,
# and cached example sentences. (Fixed: the original final line carried a
# stray trailing " |" artifact that made it a syntax error.)
with gr.Blocks() as demo:
    gr.Markdown('# TTTS\n\nAn **unofficial** demo of [TTTS](https://github.com/adelacvg/ttts) based on XTTS. TTTS only supports Chinese.')
    txt = gr.Textbox(label="Text to say", interactive=True)
    btn = gr.Button("Say")
    aud = gr.Audio(interactive=False)
    btn.click(speak, inputs=txt, outputs=aud)
    gr.Examples(
        examples=[
            '大家好,今天来点大家想看的东西。',
            '最近,文本转语音领域发展迅速。',
            '那是一个漆黑的暴风雨之夜;大雨倾盆而下,只是偶尔被一阵狂风挡住,狂风席卷了街道(因为我们的场景就在伦敦),沿着房顶哗哗作响,猛烈地搅动着在黑暗中挣扎的稀疏灯火。',
            '你会很高兴听到你怀有如此不祥预感的一项事业的开始并没有发生任何灾难。 我昨天到达这里,我的首要任务是向我亲爱的妹妹保证我的幸福,并增强我事业成功的信心。',
        ],
        inputs=txt,
        outputs=aud,
        fn=speak,
        cache_examples=True
    )
# Queue requests (inference is slow) and hide the programmatic API surface.
demo.queue(max_size=20, api_open=False).launch(show_api=False)