{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from pypinyin import lazy_pinyin, Style\n", "import torch\n", "\n", "MODELS = {\n", " 'vqvae.pth':'/home/hyc/tortoise_plus_zh/ttts/vqvae/logs/2023-11-24-01-21-25/model-30.pt',\n", " 'gpt.pth': '/home/hyc/tortoise_plus_zh/ttts/gpt/logs/2023-12-24-14-22-14/model-70.pt',\n", " # 'gpt.pth': '/home/hyc/tortoise_plus_zh/ttts/gpt/logs/2024-01-12-10-20-25/model-7.pt',\n", " 'clvp2.pth': '',\n", " 'diffusion.pth': '/home/hyc/tortoise_plus_zh/ttts/diffusion/logs/2024-01-09-17-44-36/model-855.pt',\n", " 'vocoder.pth': '~/tortoise_plus_zh/ttts/pretrained_models/pytorch_model.bin',\n", " 'rlg_auto.pth': '',\n", " 'rlg_diffuser.pth': '',\n", "}" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "da4 jia1 hao3 , jin1 tian1 lai2 dian3 da4 jia1 xiang3 kan4 de5 dong1 xi1 。\n", "tensor([[161, 2, 155, 2, 16, 87, 2, 43, 2, 224, 2, 171, 71, 2,\n", " 182, 2, 188, 2, 161, 2, 155, 2, 62, 92, 2, 19, 63, 2,\n", " 65, 2, 12, 84, 2, 228, 2, 39, 0]], device='cuda:2',\n", " dtype=torch.int32)\n" ] } ], "source": [ "from ttts.gpt.voice_tokenizer import VoiceBpeTokenizer\n", "import torch.nn.functional as F\n", "device = 'cuda:2'\n", "text = \"大家好,今天来点大家想看的东西。\"\n", "# text = \"霞浦县衙城镇乌旗瓦窑村水位猛涨。\"\n", "# text = '高德官方网站,拥有全面、精准的地点信息,公交驾车路线规划,特色语音导航,商家团购、优惠信息。'\n", "# text = '四是四,十是十,十四是十四,四十是四十。'\n", "# text = '八百标兵奔北坡,炮兵并排北边跑。炮兵怕把标兵碰,标兵怕碰炮兵炮。'\n", "# text = '黑化肥发灰,灰化肥发黑。黑化肥挥发会发灰;灰化肥挥发会发黑。'\n", "# text = '先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。然侍卫之臣不懈于内,忠志之士忘身于外者,盖追先帝之殊遇,欲报之于陛下也。诚宜开张圣听,以光先帝遗德,恢弘志士之气,不宜妄自菲薄,引喻失义,以塞忠谏之路也。'\n", "pinyin = ' '.join(lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True))\n", "tokenizer = VoiceBpeTokenizer('gpt/gpt_tts_tokenizer.json')\n", "text_tokens = torch.IntTensor(tokenizer.encode(pinyin)).unsqueeze(0).to(device)\n", "text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary.\n", "text_tokens = text_tokens.to(device)\n", "print(pinyin)\n", "print(text_tokens)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/hyc/miniconda3/envs/vocos/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "/home/hyc/miniconda3/envs/vocos/lib/python3.10/site-packages/transformers/configuration_utils.py:380: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([1, 100, 400])\n" ] } ], "source": [ "from ttts.utils.infer_utils import load_model\n", "from ttts.vocoder.feature_extractors import MelSpectrogramFeatures\n", "import torchaudio\n", "# device = 'gpu:0'\n", "gpt = load_model('gpt',MODELS['gpt.pth'],'gpt/config.json',device)\n", "gpt.post_init_gpt2_config(use_deepspeed=False, kv_cache=False, half=False)\n", "# diffusion = load_model('diffusion',MODELS['diffusion.pth'],'ttts/diffusion/config.json',device)\n", "cond_audio = '3.wav'\n", "audio,sr = torchaudio.load(cond_audio)\n", "if audio.shape[0]>1:\n", " audio = audio[0].unsqueeze(0)\n", "audio = torchaudio.transforms.Resample(sr,24000)(audio)\n", "cond_mel = MelSpectrogramFeatures()(audio).to(device)\n", "print(cond_mel.shape)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([1, 100, 400])\n", "torch.Size([1, 37])\n", "torch.Size([1, 37])\n", "tensor([[2867, 2218, 7537, 7678, 6390, 5131, 4404, 5513, 3286, 1552, 3988, 2081,\n", " 2288, 89, 63, 5695, 4186, 2972, 7915, 2843, 5904, 7735, 1031, 1603,\n", " 6780, 6281, 2746, 1217, 3853, 1815, 4043, 6257, 2936, 5986, 5248, 5129,\n", " 5711, 170, 1626, 3157, 764, 7297, 4627, 2371, 2423, 769, 8168, 7773,\n", " 4877, 5046, 6025, 7115, 2737, 2065, 3957, 3077, 4193, 1197, 5800, 4149,\n", " 4599, 3384, 2931, 6760, 7167, 8193]], device='cuda:2')\n" ] } ], "source": [ "auto_conditioning = cond_mel\n", "settings = {'temperature': .8, 'length_penalty': 1.0, 'repetition_penalty': 2.0,\n", " 'top_p': .8,\n", " 'cond_free_k': 2.0, 'diffusion_temperature': 1.0}\n", "top_p = .8\n", "temperature = .8\n", "autoregressive_batch_size = 1\n", "length_penalty = 1.0\n", "repetition_penalty = 2.0\n", "max_mel_tokens = 600\n", "print(auto_conditioning.shape)\n", "print(text_tokens.shape)\n", "# text_tokens = F.pad(text_tokens,(0,400-text_tokens.shape[1]),value=0)\n", "print(text_tokens.shape)\n", "codes = gpt.inference_speech(auto_conditioning, text_tokens,\n", " do_sample=True,\n", " top_p=top_p,\n", " temperature=temperature,\n", " num_return_sequences=autoregressive_batch_size,\n", " length_penalty=length_penalty,\n", " repetition_penalty=repetition_penalty,\n", " max_generate_length=max_mel_tokens)\n", "print(codes)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# vqvae = load_model('vqvae', MODELS['vqvae.pth'], 'vqvae/config.json', device)\n", "# mel = vqvae.decode(codes[:,:-1])[0]\n", "# codes_gt = vqvae.get_codebook_indices(cond_mel)\n", "# print(codes_gt)\n", "# mel.shape" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# from PIL import Image\n", "# import numpy as np\n", "# from IPython.display import display\n", "# from ttts.utils.utils import plot_spectrogram_to_numpy\n", "# img = plot_spectrogram_to_numpy(mel[0, :, :].detach().unsqueeze(-1).cpu())\n", "# image = Image.fromarray(np.uint8(img))\n", "# display(image)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "from ttts.vocoder.vocos import Vocos\n", "\n", "\n", "vocos = Vocos.from_pretrained('pretrained_models/pytorch_model.bin','vocoder/config.yaml').to(device)\n", "# audio = vocos.decode(mel)\n", "# print(audio.shape)\n", "# torchaudio.save('gen.wav',audio.detach().cpu(), 24000)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([1, 1024, 66])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "latent = gpt(auto_conditioning, text_tokens,\n", " torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,\n", " torch.tensor([codes.shape[-1]*gpt.mel_length_compression], device=text_tokens.device),\n", " return_latent=True, clip_inputs=False).transpose(1,2)\n", "latent.shape" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-01-13 15:46:00.320275: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", "2024-01-13 15:46:00.324447: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-12.1/lib64:/usr/local/cuda-12.1/lib64:\n", "2024-01-13 15:46:00.324459: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is None and using 8 heads.\n", "Setting up MemoryEfficientCrossAttention. Query dim is 512, context_dim is 512 and using 8 heads.\n", "base model params: 46144712\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 50/50 [00:05<00:00, 9.14it/s]\n" ] } ], "source": [ "from ttts.diffusion.train import do_spectrogram_diffusion\n", "from ttts.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule\n", "from ttts.diffusion.aa_model import denormalize_tacotron_mel, normalize_tacotron_mel\n", "# print(device)\n", "diffusion = load_model('diffusion',MODELS['diffusion.pth'],'diffusion/config.yaml',device)\n", "diffuser = SpacedDiffusion(use_timesteps=space_timesteps(1000, [50]), model_mean_type='epsilon',\n", " model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', 1000),\n", " conditioning_free=True, conditioning_free_k=2., sampler='dpm++2m')\n", "diffusion_conditioning = normalize_tacotron_mel(cond_mel)\n", "mel = do_spectrogram_diffusion(diffusion, diffuser, latent, diffusion_conditioning, temperature=1.0)\n", "wav = vocos.decode(mel)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from IPython.display import Audio\n", "wav = wav.detach().cpu()\n", "torchaudio.save('gen.wav',wav.detach().cpu(), 24000)\n", "Audio(wav,rate=24000)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# from ttts.vocoder.vocos import Vocos\n", "# from IPython.display import Audio\n", "# vocoder = Vocos.from_pretrained(\"~/tortoise_plus_zh/ttts/pretrained_models/pytorch_model.bin\",\"~/tortoise_plus_zh/ttts/vocoder/config.yaml\")\n", "# audio = vocoder.decode(mel.cpu())\n", "# torchaudio.save('gen_0.wav',audio,24000)\n", "# Audio('gen_0.wav')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "vocos", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }