{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from pypinyin import lazy_pinyin, Style\n",
"import torch\n",
"\n",
"MODELS = {\n",
" 'vqvae.pth':'/home/hyc/tortoise_plus_zh/ttts/vqvae/logs/2023-11-24-01-21-25/model-30.pt',\n",
" 'gpt.pth': '/home/hyc/tortoise_plus_zh/ttts/gpt/logs/2023-12-24-14-22-14/model-70.pt',\n",
" # 'gpt.pth': '/home/hyc/tortoise_plus_zh/ttts/gpt/logs/2024-01-12-10-20-25/model-7.pt',\n",
" 'clvp2.pth': '',\n",
" 'diffusion.pth': '/home/hyc/tortoise_plus_zh/ttts/diffusion/logs/2024-01-09-17-44-36/model-855.pt',\n",
" 'vocoder.pth': '~/tortoise_plus_zh/ttts/pretrained_models/pytorch_model.bin',\n",
" 'rlg_auto.pth': '',\n",
" 'rlg_diffuser.pth': '',\n",
"}"
]
},
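{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# torch.load and plain file APIs do not expand '~' themselves, so the 'vocoder.pth' entry\n",
"# above may fail as written. A minimal normalization sketch (an assumption, not part of the\n",
"# original pipeline): expand '~' in every non-empty path.\n",
"import os\n",
"\n",
"MODELS = {k: os.path.expanduser(v) if v else v for k, v in MODELS.items()}"
]
},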
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"da4 jia1 hao3 , jin1 tian1 lai2 dian3 da4 jia1 xiang3 kan4 de5 dong1 xi1 。\n",
"tensor([[161, 2, 155, 2, 16, 87, 2, 43, 2, 224, 2, 171, 71, 2,\n",
" 182, 2, 188, 2, 161, 2, 155, 2, 62, 92, 2, 19, 63, 2,\n",
" 65, 2, 12, 84, 2, 228, 2, 39, 0]], device='cuda:2',\n",
" dtype=torch.int32)\n"
]
}
],
"source": [
"from ttts.gpt.voice_tokenizer import VoiceBpeTokenizer\n",
"import torch.nn.functional as F\n",
"device = 'cuda:2'\n",
"text = \"大家好,今天来点大家想看的东西。\"\n",
"# text = \"霞浦县衙城镇乌旗瓦窑村水位猛涨。\"\n",
"# text = '高德官方网站,拥有全面、精准的地点信息,公交驾车路线规划,特色语音导航,商家团购、优惠信息。'\n",
"# text = '四是四,十是十,十四是十四,四十是四十。'\n",
"# text = '八百标兵奔北坡,炮兵并排北边跑。炮兵怕把标兵碰,标兵怕碰炮兵炮。'\n",
"# text = '黑化肥发灰,灰化肥发黑。黑化肥挥发会发灰;灰化肥挥发会发黑。'\n",
"# text = '先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。然侍卫之臣不懈于内,忠志之士忘身于外者,盖追先帝之殊遇,欲报之于陛下也。诚宜开张圣听,以光先帝遗德,恢弘志士之气,不宜妄自菲薄,引喻失义,以塞忠谏之路也。'\n",
"pinyin = ' '.join(lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True))\n",
"tokenizer = VoiceBpeTokenizer('gpt/gpt_tts_tokenizer.json')\n",
"text_tokens = torch.IntTensor(tokenizer.encode(pinyin)).unsqueeze(0).to(device)\n",
"text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary.\n",
"text_tokens = text_tokens.to(device)\n",
"print(pinyin)\n",
"print(text_tokens)"
]
},
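{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check of the conversion above using pypinyin's documented API: Style.TONE3 places the\n",
"# tone number after each syllable, and neutral_tone_with_five=True renders the neutral tone as\n",
"# '5', which is why the tokenized sentence above contains 'de5'.\n",
"print(lazy_pinyin('你好', style=Style.TONE3))  # expected: ['ni3', 'hao3']\n",
"print(lazy_pinyin('看的', style=Style.TONE3, neutral_tone_with_five=True))  # expected: ['kan4', 'de5']"
]
},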
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/hyc/miniconda3/envs/vocos/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/home/hyc/miniconda3/envs/vocos/lib/python3.10/site-packages/transformers/configuration_utils.py:380: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 100, 400])\n"
]
}
],
"source": [
"from ttts.utils.infer_utils import load_model\n",
"from ttts.vocoder.feature_extractors import MelSpectrogramFeatures\n",
"import torchaudio\n",
"# device = 'gpu:0'\n",
"gpt = load_model('gpt',MODELS['gpt.pth'],'gpt/config.json',device)\n",
"gpt.post_init_gpt2_config(use_deepspeed=False, kv_cache=False, half=False)\n",
"# diffusion = load_model('diffusion',MODELS['diffusion.pth'],'ttts/diffusion/config.json',device)\n",
"cond_audio = '3.wav'\n",
"audio,sr = torchaudio.load(cond_audio)\n",
"if audio.shape[0]>1:\n",
" audio = audio[0].unsqueeze(0)\n",
"audio = torchaudio.transforms.Resample(sr,24000)(audio)\n",
"cond_mel = MelSpectrogramFeatures()(audio).to(device)\n",
"print(cond_mel.shape)"
]
},
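{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reading the shape above: [batch, n_mels, frames] = [1, 100, 400], i.e. 100 mel bins over\n",
"# 400 frames. Assuming a hop length of 256 samples in MelSpectrogramFeatures (an assumption;\n",
"# verify against ttts/vocoder/feature_extractors.py), the reference clip duration is roughly:\n",
"hop_length = 256  # hypothetical default, check MelSpectrogramFeatures\n",
"print(f'~{cond_mel.shape[-1] * hop_length / 24000:.2f} s of reference audio')"
]
},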
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 100, 400])\n",
"torch.Size([1, 37])\n",
"torch.Size([1, 37])\n",
"tensor([[2867, 2218, 7537, 7678, 6390, 5131, 4404, 5513, 3286, 1552, 3988, 2081,\n",
" 2288, 89, 63, 5695, 4186, 2972, 7915, 2843, 5904, 7735, 1031, 1603,\n",
" 6780, 6281, 2746, 1217, 3853, 1815, 4043, 6257, 2936, 5986, 5248, 5129,\n",
" 5711, 170, 1626, 3157, 764, 7297, 4627, 2371, 2423, 769, 8168, 7773,\n",
" 4877, 5046, 6025, 7115, 2737, 2065, 3957, 3077, 4193, 1197, 5800, 4149,\n",
" 4599, 3384, 2931, 6760, 7167, 8193]], device='cuda:2')\n"
]
}
],
"source": [
"auto_conditioning = cond_mel\n",
"settings = {'temperature': .8, 'length_penalty': 1.0, 'repetition_penalty': 2.0,\n",
" 'top_p': .8,\n",
" 'cond_free_k': 2.0, 'diffusion_temperature': 1.0}\n",
"top_p = .8\n",
"temperature = .8\n",
"autoregressive_batch_size = 1\n",
"length_penalty = 1.0\n",
"repetition_penalty = 2.0\n",
"max_mel_tokens = 600\n",
"print(auto_conditioning.shape)\n",
"print(text_tokens.shape)\n",
"# text_tokens = F.pad(text_tokens,(0,400-text_tokens.shape[1]),value=0)\n",
"print(text_tokens.shape)\n",
"codes = gpt.inference_speech(auto_conditioning, text_tokens,\n",
" do_sample=True,\n",
" top_p=top_p,\n",
" temperature=temperature,\n",
" num_return_sequences=autoregressive_batch_size,\n",
" length_penalty=length_penalty,\n",
" repetition_penalty=repetition_penalty,\n",
" max_generate_length=max_mel_tokens)\n",
"print(codes)"
]
},
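{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# inference_speech samples stochastically (do_sample=True with top_p/temperature), so every run\n",
"# yields different codes. A minimal sketch for reproducible sampling, using standard PyTorch\n",
"# seeding rather than anything specific to this pipeline; uncomment to use:\n",
"# torch.manual_seed(0)\n",
"# torch.cuda.manual_seed_all(0)\n",
"# codes = gpt.inference_speech(auto_conditioning, text_tokens, do_sample=True, top_p=top_p,\n",
"#                              temperature=temperature, num_return_sequences=autoregressive_batch_size,\n",
"#                              length_penalty=length_penalty, repetition_penalty=repetition_penalty,\n",
"#                              max_generate_length=max_mel_tokens)"
]
},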
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# vqvae = load_model('vqvae', MODELS['vqvae.pth'], 'vqvae/config.json', device)\n",
"# mel = vqvae.decode(codes[:,:-1])[0]\n",
"# codes_gt = vqvae.get_codebook_indices(cond_mel)\n",
"# print(codes_gt)\n",
"# mel.shape"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# from PIL import Image\n",
"# import numpy as np\n",
"# from IPython.display import display\n",
"# from ttts.utils.utils import plot_spectrogram_to_numpy\n",
"# img = plot_spectrogram_to_numpy(mel[0, :, :].detach().unsqueeze(-1).cpu())\n",
"# image = Image.fromarray(np.uint8(img))\n",
"# display(image)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from ttts.vocoder.vocos import Vocos\n",
"\n",
"\n",
"vocos = Vocos.from_pretrained('pretrained_models/pytorch_model.bin','vocoder/config.yaml').to(device)\n",
"# audio = vocos.decode(mel)\n",
"# print(audio.shape)\n",
"# torchaudio.save('gen.wav',audio.detach().cpu(), 24000)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([1, 1024, 66])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"latent = gpt(auto_conditioning, text_tokens,\n",
" torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,\n",
" torch.tensor([codes.shape[-1]*gpt.mel_length_compression], device=text_tokens.device),\n",
" return_latent=True, clip_inputs=False).transpose(1,2)\n",
"latent.shape"
]
},
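{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The expected-length argument above is codes.shape[-1] * gpt.mel_length_compression. If, as in\n",
"# TorToiSe, mel_length_compression is the number of waveform samples per mel code (an assumption\n",
"# worth checking in ttts/gpt), the generated duration can be estimated directly:\n",
"est_samples = codes.shape[-1] * gpt.mel_length_compression\n",
"print(f'~{est_samples / 24000:.2f} s expected at 24 kHz')"
]
},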
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-01-13 15:46:00.320275: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2024-01-13 15:46:00.324447: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-12.1/lib64:/usr/local/cuda-12.1/lib64:\n",
"2024-01-13 15:46:00.324459: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n",
"Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n",
"base model params: 46144712\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 50/50 [00:05<00:00, 9.14it/s]\n"
]
}
],
"source": [
"from ttts.diffusion.train import do_spectrogram_diffusion\n",
"from ttts.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule\n",
"from ttts.diffusion.aa_model import denormalize_tacotron_mel, normalize_tacotron_mel\n",
"# print(device)\n",
"diffusion = load_model('diffusion',MODELS['diffusion.pth'],'diffusion/config.yaml',device)\n",
"diffuser = SpacedDiffusion(use_timesteps=space_timesteps(1000, [50]), model_mean_type='epsilon',\n",
" model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 1000),\n",
" conditioning_free=True, conditioning_free_k=2., sampler='dpm++2m')\n",
"diffusion_conditioning = normalize_tacotron_mel(cond_mel)\n",
"mel = do_spectrogram_diffusion(diffusion, diffuser, latent, diffusion_conditioning, temperature=1.0)\n",
"wav = vocos.decode(mel)"
]
},
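{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# space_timesteps(1000, [50]) subsamples the 1000-step schedule down to the 50 sampling steps\n",
"# visible in the progress bar above. Fewer steps trade quality for speed; a sketch of a faster\n",
"# diffuser with otherwise identical settings (uncomment to try):\n",
"# fast_diffuser = SpacedDiffusion(use_timesteps=space_timesteps(1000, [25]), model_mean_type='epsilon',\n",
"#                                 model_var_type='learned_range', loss_type='mse',\n",
"#                                 betas=get_named_beta_schedule('linear', 1000),\n",
"#                                 conditioning_free=True, conditioning_free_k=2., sampler='dpm++2m')\n",
"# mel = do_spectrogram_diffusion(diffusion, fast_diffuser, latent, diffusion_conditioning, temperature=1.0)"
]
},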
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from IPython.display import Audio\n",
"wav = wav.detach().cpu()\n",
"torchaudio.save('gen.wav',wav.detach().cpu(), 24000)\n",
"Audio(wav,rate=24000)"
]
},
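{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional post-processing sketch: peak-normalize before saving to avoid clipping if the vocoder\n",
"# output exceeds [-1, 1] (a common safeguard, not part of the original pipeline).\n",
"# peak = wav.abs().max()\n",
"# if peak > 1.0:\n",
"#     torchaudio.save('gen_normalized.wav', wav / peak, 24000)"
]
},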
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# from ttts.vocoder.vocos import Vocos\n",
"# from IPython.display import Audio\n",
"# vocoder = Vocos.from_pretrained(\"~/tortoise_plus_zh/ttts/pretrained_models/pytorch_model.bin\",\"~/tortoise_plus_zh/ttts/vocoder/config.yaml\")\n",
"# audio = vocoder.decode(mel.cpu())\n",
"# torchaudio.save('gen_0.wav',audio,24000)\n",
"# Audio('gen_0.wav')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "vocos",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}