{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/user/lojban/jboselvoha/jboselvoha_env/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "import IPython.display as ipd\n", "\n", "import os\n", "import json\n", "import math\n", "import torch\n", "from torch import nn\n", "from torch.nn import functional as F\n", "from torch.utils.data import DataLoader\n", "\n", "import commons\n", "import utils\n", "from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate\n", "from models import SynthesizerTrn\n", "from text.symbols import symbols\n", "from text import text_to_sequence\n", "\n", "from scipy.io.wavfile import write\n", "\n", "\n", "def get_text(text, hps):\n", " text_norm = text_to_sequence(text, hps.data.text_cleaners)\n", " if hps.data.add_blank:\n", " text_norm = commons.intersperse(text_norm, 0)\n", " text_norm = torch.LongTensor(text_norm)\n", " return text_norm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## LJ Speech" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "hps = utils.get_hparams_from_file(\"./configs/ljs_base.json\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "net_g = SynthesizerTrn(\n", " len(symbols),\n", " hps.data.filter_length // 2 + 1,\n", " hps.train.segment_size // hps.data.hop_length,\n", " **hps.model).cuda()\n", "_ = net_g.eval()\n", "\n", "_ = utils.load_checkpoint(\"pretrained_ljs.pth\", net_g, None)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stn_tst = get_text(\"I have no idea.\", hps)\n", "with torch.no_grad():\n", " x_tst = stn_tst.cuda().unsqueeze(0)\n", " x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()\n", " audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()\n", "ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## VCTK" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "hps = utils.get_hparams_from_file(\"./configs/vctk_base.json\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "net_g = SynthesizerTrn(\n", " len(symbols),\n", " hps.data.filter_length // 2 + 1,\n", " hps.train.segment_size // hps.data.hop_length,\n", " n_speakers=hps.data.n_speakers,\n", " **hps.model).cuda()\n", "_ = net_g.eval()\n", "\n", "_ = utils.load_checkpoint(\"pretrained_vctk.pth\", net_g, None)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stn_tst = get_text(\"This is the course on the lawzhban logical language.\", hps)\n", "with torch.no_grad():\n", " x_tst = stn_tst.cuda().unsqueeze(0)\n", " x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()\n", " sid = torch.LongTensor([4]).cuda()\n", " audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()\n", "ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Voice Conversion" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "ename": "FileNotFoundError", "evalue": "[Errno 2] No such file or directory: 'DUMMY2/p234/p234_071.wav'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn [15], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m dataset \u001b[39m=\u001b[39m TextAudioSpeakerLoader(hps\u001b[39m.\u001b[39;49mdata\u001b[39m.\u001b[39;49mvalidation_files, hps\u001b[39m.\u001b[39;49mdata)\n\u001b[1;32m 2\u001b[0m collate_fn \u001b[39m=\u001b[39m TextAudioSpeakerCollate()\n\u001b[1;32m 3\u001b[0m loader \u001b[39m=\u001b[39m DataLoader(dataset, num_workers\u001b[39m=\u001b[39m\u001b[39m8\u001b[39m, shuffle\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 4\u001b[0m batch_size\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m, pin_memory\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 5\u001b[0m drop_last\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m, collate_fn\u001b[39m=\u001b[39mcollate_fn)\n", "File \u001b[0;32m~/lojban/jboselvoha/vits/data_utils.py:176\u001b[0m, in \u001b[0;36mTextAudioSpeakerLoader.__init__\u001b[0;34m(self, audiopaths_sid_text, hparams)\u001b[0m\n\u001b[1;32m 174\u001b[0m random\u001b[39m.\u001b[39mseed(\u001b[39m1234\u001b[39m)\n\u001b[1;32m 175\u001b[0m random\u001b[39m.\u001b[39mshuffle(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39maudiopaths_sid_text)\n\u001b[0;32m--> 176\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_filter()\n", "File \u001b[0;32m~/lojban/jboselvoha/vits/data_utils.py:191\u001b[0m, in \u001b[0;36mTextAudioSpeakerLoader._filter\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmin_text_len \u001b[39m<\u001b[39m\u001b[39m=\u001b[39m \u001b[39mlen\u001b[39m(text) \u001b[39mand\u001b[39;00m \u001b[39mlen\u001b[39m(text) \u001b[39m<\u001b[39m\u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmax_text_len:\n\u001b[1;32m 190\u001b[0m audiopaths_sid_text_new\u001b[39m.\u001b[39mappend([audiopath, sid, text])\n\u001b[0;32m--> 191\u001b[0m lengths\u001b[39m.\u001b[39mappend(os\u001b[39m.\u001b[39;49mpath\u001b[39m.\u001b[39;49mgetsize(audiopath) \u001b[39m/\u001b[39m\u001b[39m/\u001b[39m (\u001b[39m2\u001b[39m \u001b[39m*\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhop_length))\n\u001b[1;32m 192\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39maudiopaths_sid_text \u001b[39m=\u001b[39m audiopaths_sid_text_new\n\u001b[1;32m 193\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlengths \u001b[39m=\u001b[39m lengths\n", "File \u001b[0;32m~/lojban/jboselvoha/jboselvoha_env/lib/python3.8/genericpath.py:50\u001b[0m, in \u001b[0;36mgetsize\u001b[0;34m(filename)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mgetsize\u001b[39m(filename):\n\u001b[1;32m 49\u001b[0m \u001b[39m\"\"\"Return the size of a file, reported by os.stat().\"\"\"\u001b[39;00m\n\u001b[0;32m---> 50\u001b[0m \u001b[39mreturn\u001b[39;00m os\u001b[39m.\u001b[39;49mstat(filename)\u001b[39m.\u001b[39mst_size\n", "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'DUMMY2/p234/p234_071.wav'" ] } ], "source": [ "dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)\n", "collate_fn = TextAudioSpeakerCollate()\n", "loader = DataLoader(dataset, num_workers=8, shuffle=False,\n", " batch_size=1, pin_memory=True,\n", " drop_last=True, collate_fn=collate_fn)\n", "data_list = list(loader)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'data_list' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn [16], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mwith\u001b[39;00m torch\u001b[39m.\u001b[39mno_grad():\n\u001b[0;32m----> 2\u001b[0m x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src \u001b[39m=\u001b[39m [x\u001b[39m.\u001b[39mcuda() \u001b[39mfor\u001b[39;00m x \u001b[39min\u001b[39;00m data_list[\u001b[39m0\u001b[39m]]\n\u001b[1;32m 3\u001b[0m sid_tgt1 \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mLongTensor([\u001b[39m1\u001b[39m])\u001b[39m.\u001b[39mcuda()\n\u001b[1;32m 4\u001b[0m sid_tgt2 \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mLongTensor([\u001b[39m2\u001b[39m])\u001b[39m.\u001b[39mcuda()\n", "\u001b[0;31mNameError\u001b[0m: name 'data_list' is not defined" ] } ], "source": [ "with torch.no_grad():\n", " x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda() for x in data_list[0]]\n", " sid_tgt1 = torch.LongTensor([1]).cuda()\n", " sid_tgt2 = torch.LongTensor([2]).cuda()\n", " sid_tgt3 = torch.LongTensor([4]).cuda()\n", " audio1 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data.cpu().float().numpy()\n", " audio2 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt2)[0][0,0].data.cpu().float().numpy()\n", " audio3 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt3)[0][0,0].data.cpu().float().numpy()\n", "print(\"Original SID: %d\" % sid_src.item())\n", "ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sampling_rate, normalize=False))\n", "print(\"Converted SID: %d\" % sid_tgt1.item())\n", "ipd.display(ipd.Audio(audio1, rate=hps.data.sampling_rate, normalize=False))\n", "print(\"Converted SID: %d\" % sid_tgt2.item())\n", "ipd.display(ipd.Audio(audio2, rate=hps.data.sampling_rate, normalize=False))\n", "print(\"Converted SID: %d\" % sid_tgt3.item())\n", "ipd.display(ipd.Audio(audio3, rate=hps.data.sampling_rate, normalize=False))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.10.8 (conda)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.15" }, "vscode": { "interpreter": { "hash": "e83f87adbbd22850962ce64ca64909ac520dc4a639578a07f0a0cdcfce9beb18" } } }, "nbformat": 4, "nbformat_minor": 4 }