diff --git "a/tts.ipynb" "b/tts.ipynb"
--- "a/tts.ipynb"
+++ "b/tts.ipynb"
@@ -9,52 +9,70 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/homebrew/Caskroom/miniconda/base/envs/llm/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
    "source": [
     "import os\n",
     "\n",
     "import torch\n",
     "import gradio as gr\n",
     "from TTS.api import TTS\n",
-    "os.environ[\"COQUI_TOS_AGREED\"] = \"1\""
+    "os.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n",
+    "# os.environ[\"PYTORCH_ENABLE_MPS_FALLBACK\"] = \"1\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
     "from collections import namedtuple\n",
     "\n",
-    "Voice = namedtuple('voice', ['name', 'neutral','sad','angry','happy'])\n"
+    "Voice = namedtuple('voice', ['name', 'neutral','angry'])\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 84,
    "metadata": {},
    "outputs": [],
    "source": [
     "voices = [\n",
-    "    Voice('Rick', neutral='audio/rick/angry.mp3', sad=None, angry=None, happy=None),\n",
+    "    Voice('Attenborough', neutral='audio/attenborough/neutral.wav', angry=None),\n",
+    "    Voice('Rick', neutral='audio/rick/neutral.wav', angry=None),\n",
+    "    Voice('Freeman', neutral='audio/freeman/neutral.wav', angry='audio/freeman/angry.wav'),\n",
+    "    Voice('Walken', neutral='audio/walken/neutral.wav', angry=None),\n",
+    "    Voice('Darth Vader', neutral='audio/darth/neutral.wav', angry=None),\n",
     "]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[voice(name='Rick', neutral='audio/rick/angry.mp3', sad=None, angry=None, happy=None)]"
+       "[voice(name='Attenborough', neutral='audio/attenborough/neutral.wav', angry=None),\n",
+       " voice(name='Rick', neutral='audio/rick/neutral.wav', angry=None),\n",
+       " voice(name='Freeman', neutral='audio/freeman/neutral.wav', angry='audio/freeman/angry.wav'),\n",
+       " voice(name='Walken', neutral='audio/walken/neutral.wav', angry=None),\n",
+       " voice(name='Darth Vader', neutral='audio/darth/neutral.wav', angry=None)]"
       ]
      },
-     "execution_count": 14,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -65,14 +83,32 @@
   },
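+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check (illustrative): confirm every reference clip listed\n",
+    "# in `voices` actually exists on disk before trying to clone from it.\n",
+    "for v in voices:\n",
+    "    for path in (v.neutral, v.angry):\n",
+    "        if path:\n",
+    "            print(path, os.path.exists(path))"
+   ]
+  },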
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      " > tts_models/multilingual/multi-dataset/xtts_v1.1 is already downloaded.\n",
+      " > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/homebrew/Caskroom/miniconda/base/envs/llm/lib/python3.11/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
+      "  _torch_pytree._register_pytree_node(\n",
+      "/opt/homebrew/Caskroom/miniconda/base/envs/llm/lib/python3.11/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
+      "  _torch_pytree._register_pytree_node(\n",
+      "/opt/homebrew/Caskroom/miniconda/base/envs/llm/lib/python3.11/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
+      "  _torch_pytree._register_pytree_node(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
       " > Using model: xtts\n"
      ]
     }
    ],
@@ -80,12 +116,58 @@
    "source": [
     "#load model for text to speech\n",
     "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
-    "tts_pipelins = TTS(\"tts_models/multilingual/multi-dataset/xtts_v1.1\").to(device)"
+    "# device = \"mps\"\n",
+    "tts_pipeline = TTS(\"tts_models/multilingual/multi-dataset/xtts_v2\").to(device)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import IPython\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "speaker_embedding_cache = {}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_speaker_embedding(voice_path: str, config, pipeline, cache):\n",
+    "    # Compute the XTTS conditioning latents once per reference clip and cache\n",
+    "    # them, so repeated requests for the same voice skip re-encoding the audio.\n",
+    "    if voice_path not in cache:\n",
+    "        cache[voice_path] = pipeline.synthesizer.tts_model.get_conditioning_latents(\n",
+    "            audio_path=voice_path,\n",
+    "            gpt_cond_len=config.gpt_cond_len,\n",
+    "            gpt_cond_chunk_len=config.gpt_cond_chunk_len,\n",
+    "            max_ref_length=config.max_ref_len,\n",
+    "            sound_norm_refs=config.sound_norm_refs,\n",
+    "        )\n",
+    "    return cache[voice_path]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out = compute_speaker_embedding(voices[0].neutral, tts_pipeline.synthesizer.tts_config, tts_pipeline, speaker_embedding_cache)"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -93,29 +175,297 @@
      "output_type": "stream",
      "text": [
       " > Text splitted to sentences.\n",
-      "[\"Hello, I am Rick, pickle rick, you took a wrong turn and now you're stuck in a parallel universe\"]\n",
-      " > Processing time: 0.7903299331665039\n",
-      " > Real-time factor: 0.11176741294459602\n"
+      "['Hey Petra, so you are hungry?', 'and you like me to prepare some strawberries for you?', 'do you like strawberries?']\n",
+      " > Processing time: 15.77448582649231\n",
+      " > Real-time factor: 1.7459813091024587\n"
      ]
-    },
+    }
+   ],
+   "source": [
+    "out = tts_pipeline.tts(\n",
+    "    \"Hello, I am Rick, pickle rick, you took a wrong turn and now you're stuck in a parallel universe\",\n",
+    "    speaker_wav=\"audio/freeman/neutral.wav\",\n",
+    "    language=\"en\",\n",
+    "    # file_path=\"out.wav\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import List\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ref_audio_path = \"audio/freeman/neutral.wav\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config.max_ref_len = 360"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config = tts_pipeline.synthesizer.tts_config\n",
+    "(gpt_cond_latent, speaker_embedding) = tts_pipeline.synthesizer.tts_model.get_conditioning_latents(\n",
+    "    audio_path=ref_audio_path,\n",
+    "    gpt_cond_len=config.gpt_cond_len,\n",
+    "    gpt_cond_chunk_len=config.gpt_cond_chunk_len,\n",
+    "    max_ref_length=config.max_ref_len,\n",
+    "    sound_norm_refs=config.sound_norm_refs,\n",
+    ")"
+   ]
+  },
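+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Quick shape check (illustrative): both conditioning outputs are torch\n",
+    "# tensors; the exact shapes depend on the XTTS checkpoint.\n",
+    "print(gpt_cond_latent.shape, speaker_embedding.shape)"
+   ]
+  },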
" audio_path=ref_audio_path,\n", + " gpt_cond_len=config.gpt_cond_len,\n", + " gpt_cond_chunk_len=config.gpt_cond_chunk_len,\n", + " max_ref_length=config.max_ref_len,\n", + " sound_norm_refs=config.sound_norm_refs,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [], + "source": [ + "(gpt_cond_latent, speaker_embedding) = compute_speaker_embedding(voices[0].neutral, tts_pipelins.synthesizer.tts_config, tts_pipelins, speaker_embedding_cache)" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [ { "data": { "text/plain": [ - "'out.wav'" + "(205872,)" ] }, - "execution_count": 27, + "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tts_pipelins.tts_to_file(\"Hello, I am Rick, pickle rick, you took a wrong turn and now you're stuck in a parallel universe\", speaker_wav=\"audio/rick/neutral.wav\", emotion='neutral', language='en', file_path='out.wav')" + "np.array(out)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "205872" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(out)" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Text splitted to sentences.\n", + "['Something is up!']\n", + " > Processing time: 2.9515581130981445\n", + " > Real-time factor: 1.588292083019672\n" + ] + } + ], + "source": [ + "out = tts(\n", + " tts_pipelins.synthesizer,\n", + " \"Something is up!\",\n", + " # speaker_wav=ref_audio_path,\n", + " language_name=\"en\",\n", + " speaker=None,\n", + " gpt_cond_latent=gpt_cond_latent,\n", + " speaker_embedding=speaker_embedding,\n", + " speed=1.1,\n", + " # file_path=\"out.wav\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 129, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "IPython.display.Audio(out, rate=22050)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input\n", + "\n", + "def tts(\n", + " self,\n", + " text: str = \"\",\n", + " language_name: str = \"\",\n", + " reference_wav=None,\n", + " gpt_cond_latent=None,\n", + " speaker_embedding=None,\n", + " split_sentences: bool = True,\n", + " **kwargs,\n", + ") -> List[int]:\n", + " \"\"\"🐸 TTS magic. Run all the models and generate speech.\n", + "\n", + " Args:\n", + " text (str): input text.\n", + " speaker_name (str, optional): speaker id for multi-speaker models. Defaults to \"\".\n", + " language_name (str, optional): language id for multi-language models. Defaults to \"\".\n", + " speaker_wav (Union[str, List[str]], optional): path to the speaker wav for voice cloning. Defaults to None.\n", + " style_wav ([type], optional): style waveform for GST. Defaults to None.\n", + " style_text ([type], optional): transcription of style_wav for Capacitron. 
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# trim_silence import mirrors Coqui's own Synthesizer module (assumed path).\n",
+    "from TTS.tts.utils.synthesis import trim_silence\n",
+    "from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input\n",
+    "\n",
+    "def tts(\n",
+    "    self,\n",
+    "    text: str = \"\",\n",
+    "    language_name: str = \"\",\n",
+    "    reference_wav=None,\n",
+    "    gpt_cond_latent=None,\n",
+    "    speaker_embedding=None,\n",
+    "    split_sentences: bool = True,\n",
+    "    **kwargs,\n",
+    ") -> List[float]:\n",
+    "    \"\"\"🐸 TTS magic. Run all the models and generate speech.\n",
+    "\n",
+    "    Args:\n",
+    "        text (str): input text.\n",
+    "        language_name (str, optional): language id for multi-language models. Defaults to \"\".\n",
+    "        reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.\n",
+    "        gpt_cond_latent (torch.Tensor, optional): precomputed GPT conditioning latent for XTTS. Defaults to None.\n",
+    "        speaker_embedding (torch.Tensor, optional): precomputed speaker embedding for XTTS. Defaults to None.\n",
+    "        split_sentences (bool, optional): split the input text into sentences. Defaults to True.\n",
+    "        **kwargs: additional arguments to pass to the TTS model.\n",
+    "    Returns:\n",
+    "        List[float]: the synthesized waveform samples.\n",
+    "    \"\"\"\n",
+    "    start_time = time.time()\n",
+    "    wavs = []\n",
+    "    # Mirrors Synthesizer.tts: Griffin-Lim is only used when no separate vocoder is loaded.\n",
+    "    use_gl = self.vocoder_model is None\n",
+    "\n",
+    "    if not text and not reference_wav:\n",
+    "        raise ValueError(\n",
+    "            \"You need to define either `text` (for synthesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API.\"\n",
+    "        )\n",
+    "\n",
+    "    if text:\n",
+    "        sens = [text]\n",
+    "        if split_sentences:\n",
+    "            print(\" > Text splitted to sentences.\")\n",
+    "            sens = self.split_into_sentences(text)\n",
+    "        print(sens)\n",
+    "\n",
+    "    if not reference_wav:  # not voice conversion\n",
+    "        for sen in sens:\n",
+    "            outputs = self.tts_model.inference(\n",
+    "                sen,\n",
+    "                language_name,\n",
+    "                gpt_cond_latent,\n",
+    "                speaker_embedding,\n",
+    "                # GPT inference\n",
+    "                temperature=0.75,\n",
+    "                length_penalty=1.0,\n",
+    "                repetition_penalty=10.0,\n",
+    "                top_k=50,\n",
+    "                top_p=0.85,\n",
+    "                do_sample=True,\n",
+    "                **kwargs,\n",
+    "            )\n",
+    "            waveform = outputs[\"wav\"]\n",
+    "            if torch.is_tensor(waveform) and waveform.device != torch.device(\"cpu\") and not use_gl:\n",
+    "                waveform = waveform.cpu()\n",
+    "            if not use_gl:\n",
+    "                waveform = waveform.numpy()\n",
+    "            waveform = waveform.squeeze()\n",
+    "\n",
+    "            # trim silence\n",
+    "            if \"do_trim_silence\" in self.tts_config.audio and self.tts_config.audio[\"do_trim_silence\"]:\n",
+    "                waveform = trim_silence(waveform, self.tts_model.ap)\n",
+    "\n",
+    "            wavs += list(waveform)\n",
+    "            wavs += [0] * 10000\n",
+    "\n",
+    "    # compute stats\n",
+    "    process_time = time.time() - start_time\n",
+    "    audio_time = len(wavs) / self.tts_config.audio[\"sample_rate\"]\n",
+    "    print(f\" > Processing time: {process_time}\")\n",
+    "    print(f\" > Real-time factor: {process_time / audio_time}\")\n",
+    "    return wavs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(tts_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "IPython.display.Audio(out, rate=22050)"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -141,34 +491,99 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 90,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "voice_options = []\n",
+    "for voice in voices:\n",
+    "    if voice.neutral:\n",
+    "        voice_options.append(f\"{voice.name} - Neutral\")\n",
+    "    if voice.angry:\n",
+    "        voice_options.append(f\"{voice.name} - Angry\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 101,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def voice_from_text(voice):\n",
+    "    for v in voices:\n",
+    "        if voice == f\"{v.name} - Neutral\":\n",
+    "            return v.neutral\n",
+    "        if voice == f\"{v.name} - Angry\":\n",
+    "            return v.angry"
+   ]
+  },
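+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example lookup (illustrative): a radio-button label maps back to its\n",
+    "# reference clip, here 'audio/freeman/angry.wav'.\n",
+    "voice_from_text('Freeman - Angry')"
+   ]
+  },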
+  {
+   "cell_type": "code",
+   "execution_count": 121,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tts_gradio(text, voice, state):\n",
+    "    print(text, voice, state)\n",
+    "    voice_path = voice_from_text(voice)\n",
+    "    (gpt_cond_latent, speaker_embedding) = compute_speaker_embedding(voice_path, tts_pipeline.synthesizer.tts_config, tts_pipeline, speaker_embedding_cache)\n",
+    "    out = tts(\n",
+    "        tts_pipeline.synthesizer,\n",
+    "        text,\n",
+    "        language_name=\"en\",\n",
+    "        speaker=None,\n",
+    "        gpt_cond_latent=gpt_cond_latent,\n",
+    "        speaker_embedding=speaker_embedding,\n",
+    "        speed=1.1,\n",
+    "        # file_path=\"out.wav\",\n",
+    "    )\n",
+    "    return (22050, np.array(out)), dict(text=text, voice=voice)"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 122,
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/conda/lib/python3.10/site-packages/gradio/utils.py:924: UserWarning: Expected 1 arguments for function , received 3.\n",
-      "  warnings.warn(\n",
-      "/opt/conda/lib/python3.10/site-packages/gradio/utils.py:932: UserWarning: Expected maximum 1 arguments for function , received 3.\n",
-      "  warnings.warn(\n"
-     ]
-    },
+     "data": {
+      "text/plain": [
+       "dict_keys(['audio/attenborough/neutral.wav'])"
+      ]
+     },
+     "execution_count": 122,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "speaker_embedding_cache.keys()"
+   ]
+  },
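+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional smoke test (illustrative): call the Gradio handler directly before\n",
+    "# wiring up the UI; it returns (sample_rate, waveform) plus the new state.\n",
+    "(sr, wav), state = tts_gradio(\"Testing, one two.\", voice_options[0], None)\n",
+    "print(sr, wav.shape, state)"
+   ]
+  },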
+  {
+   "cell_type": "code",
+   "execution_count": 127,
+   "metadata": {},
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "\n",
+     "This is going to be fun, let's enjoy ourselves\n",
+     "Closing server running on port: 7860\n",
+     "Closing server running on port: 7860\n",
+     "Closing server running on port: 7860\n",
+     "Closing server running on port: 7860\n",
+     "Closing server running on port: 7860\n",
+     "Closing server running on port: 7860\n",
+     "Closing server running on port: 7860\n",
+     "Closing server running on port: 7860\n",
+     "Closing server running on port: 7860\n",
+     "Closing server running on port: 7860\n",
+     "Closing server running on port: 7860\n",
      "Running on local URL: http://0.0.0.0:7860\n",
      "\n",
-     "Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.\n"
-    ]
-   },
-   {
-    "name": "stderr",
-    "output_type": "stream",
-    "text": [
-     "2024/04/18 13:48:05 [W] [service.go:132] login to server failed: dial tcp 44.237.78.176:7000: i/o timeout\n"
+     "To create a public link, set `share=True` in `launch()`.\n"
     ]
    },
@@ -183,43 +598,42 @@
    "metadata": {},
    "output_type": "display_data"
   },
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "This is going to be fun, let's enjoy ourselves Darth Vader - Neutral None\n",
+     " > Text splitted to sentences.\n",
+     "[\"This is going to be fun, let's enjoy ourselves\"]\n",
+     " > Processing time: 9.152068138122559\n",
+     " > Real-time factor: 1.8119083325456329\n"
+    ]
+   },
   {
    "name": "stderr",
    "output_type": "stream",
    "text": [
-    "Traceback (most recent call last):\n",
-    "  File \"/opt/conda/lib/python3.10/site-packages/gradio/queueing.py\", line 527, in process_events\n",
-    "    response = await route_utils.call_process_api(\n",
-    "  File \"/opt/conda/lib/python3.10/site-packages/gradio/route_utils.py\", line 261, in call_process_api\n",
-    "    output = await app.get_blocks().process_api(\n",
-    "  File \"/opt/conda/lib/python3.10/site-packages/gradio/blocks.py\", line 1786, in process_api\n",
-    "    result = await self.call_function(\n",
-    "  File \"/opt/conda/lib/python3.10/site-packages/gradio/blocks.py\", line 1338, in call_function\n",
-    "    prediction = await anyio.to_thread.run_sync(\n",
-    "  File \"/opt/conda/lib/python3.10/site-packages/anyio/to_thread.py\", line 56, in run_sync\n",
-    "    return await get_async_backend().run_sync_in_worker_thread(\n",
-    "  File \"/opt/conda/lib/python3.10/site-packages/anyio/_backends/_asyncio.py\", line 2144, in run_sync_in_worker_thread\n",
-    "    return await future\n",
-    "  File \"/opt/conda/lib/python3.10/site-packages/anyio/_backends/_asyncio.py\", line 851, in run\n",
-    "    result = context.run(func, *args)\n",
-    "  File \"/opt/conda/lib/python3.10/site-packages/gradio/utils.py\", line 759, in wrapper\n",
-    "    response = f(*args, **kwargs)\n",
-    "TypeError: tts() takes 1 positional argument but 3 were given\n"
+    "/opt/homebrew/Caskroom/miniconda/base/envs/llm/lib/python3.11/site-packages/gradio/processing_utils.py:390: UserWarning: Trying to convert audio automatically from float64 to 16-bit int format.\n",
+    "  warnings.warn(warning.format(data.dtype))\n"
    ]
   },
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
-    "Keyboard interruption in main thread... closing server.\n",
-    "Killing tunnel 0.0.0.0:7860 <> None\n"
+    "This is going to be fun, let's enjoy ourselves Darth Vader - Neutral {'text': \"This is going to be fun, let's enjoy ourselves\", 'voice': 'Darth Vader - Neutral'}\n",
+    " > Text splitted to sentences.\n",
+    "[\"This is going to be fun, let's enjoy ourselves\"]\n",
+    " > Processing time: 7.824646234512329\n",
+    " > Real-time factor: 1.8261372721316347\n",
+    "Keyboard interruption in main thread... closing server.\n"
    ]
   },
   {
    "data": {
     "text/plain": []
    },
-   "execution_count": 8,
+   "execution_count": 127,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
@@ -232,28 +646,29 @@
   "\n",
   "\n",
   "model_answer= ''\n",
-  "general_context= ''\n",
+  "general_context= \"This is going to be fun, let's enjoy ourselves\"\n",
   "# Define the initial state with some initial context.\n",
   "print(general_context)\n",
   "initial_state = {'context': general_context}\n",
   "initial_context= initial_state['context']\n",
   "# Create the Gradio interface.\n",
   "iface = gr.Interface(\n",
-  "    fn=tts,\n",
+  "    fn=tts_gradio,\n",
   "    inputs=[\n",
-  "        gr.Textbox(value=initial_context, visible=True),\n",
-  "        gr.Radio(choices=[x.name for x in voices], label='Choose a voice', value=voices[0].name, show_label=True), # Radio button for voice selection\n",
+  "        gr.Textbox(value=initial_context, visible=True, label='Enter the text to be converted to speech', placeholder=\"This is going to be fun, let's enjoy ourselves\", lines=5),\n",
+  "        gr.Radio(choices=voice_options, label='Choose a voice', value=voice_options[0], show_label=True), # Radio button for voice selection\n",
   "        gr.State() # This will keep track of the context state across interactions.\n",
   "    ],\n",
   "    outputs=[\n",
-  "        gr.Audio(label='output audio'),\n",
+  "        gr.Audio(label='output audio', autoplay=True),\n",
   "        gr.State()\n",
-  "    ]\n",
+  "    ],\n",
+  "    flagging_options=['👎', '👍'],\n",
   ")\n",
   "#close all interfaces open to make the port available\n",
   "gr.close_all()\n",
   "# Launch the interface.\n",
-  "iface.launch(debug=True, share=True, server_name=\"0.0.0.0\", server_port=7860, ssl_verify=False)"
+  "iface.launch(debug=True, share=False, server_name=\"0.0.0.0\", server_port=7860, ssl_verify=False)"
  ]
 },
 {
@@ -280,7 +695,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.10.13"
+  "version": "3.11.8"
  }
 },
 "nbformat": 4,