diff --git "a/inference_tts.ipynb" "b/inference_tts.ipynb" new file mode 100644--- /dev/null +++ "b/inference_tts.ipynb" @@ -0,0 +1,169 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "1e2248a6-99e4-4780-aa34-c149d2af7372", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['This is Generated Audio, India, a land of ancient wisdom and boundless', 'potential, stands at the cusp of a new era. Our youth, the vibrant heartbeat of', 'our nation, hold the key to unlocking this potential. They are the digital', 'natives, the innovators, the dreamers who will shape the India of tomorrow.', \"Knowledge is the most powerful weapon in today's world. It's not just about\", 'education, but about the ability to think critically, to adapt, and to innovate.', 'Our youth, with their thirst for knowledge and access to technology, have the', 'potential to become global leaders. The power of India lies in its diversity. It', 'is our diversity that makes us unique, that fuels our creativity, and that', 'drives our progress. Our youth, with their understanding of different cultures', 'and perspectives, can bridge divides and foster unity. Technology is the', 'catalyst for change. It has the power to transform lives, to create', 'opportunities, and to address challenges. Our youth, with their expertise in', 'technology, can develop solutions that benefit society as a whole. I believe in', \"the potential of India's youth. I believe in their ability to build a nation\", 'that is prosperous, inclusive, and sustainable. Let us empower them, support', 'their dreams, and provide them with the resources they need to succeed.', 'Together, we can create an India that is a beacon of hope for the world. 
This is', 'Generated Audio,']\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan\n", + "from speechbrain.pretrained import EncoderClassifier\n", + "from IPython.display import Audio\n", + "from datasets import load_dataset\n", + "import noisereduce as nr\n", + "import soundfile as sf\n", + "import os, torchaudio\n", + "import numpy as np\n", + "import torch\n", + "\n", + "\n", + "processor = SpeechT5Processor.from_pretrained(\"checkpoint-60000\")#Replace with the model folder \n", + "processor.tokenizer.split_special_tokens = True\n", + "model = SpeechT5ForTextToSpeech.from_pretrained(\"checkpoint-60000\")#Replace with the model folder \n", + "vocoder = SpeechT5HifiGan.from_pretrained(\"microsoft/speecht5_hifigan\")\n", + "embeddings_dataset = load_dataset(\"Matthijs/cmu-arctic-xvectors\", split=\"validation\")\n", + "speaker_embeddings = torch.tensor(embeddings_dataset[7306][\"xvector\"]).unsqueeze(0)\n", + "\n", + "\n", + "spk_model_name = \"speechbrain/spkrec-xvect-voxceleb\"\n", + "\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "speaker_model = EncoderClassifier.from_hparams(\n", + " source=spk_model_name,\n", + " run_opts={\"device\": device},\n", + " savedir=os.path.join(\"/tmp\", spk_model_name),\n", + ")\n", + "signal, fs =torchaudio.load('wavs/converted_ratan_tata_tts_200.wav')\n", + "# Ensure to detach and clone before converting to tensor if needed\n", + "speaker_embeddings = speaker_model.encode_batch(signal) # Directly passing signal as a tensor, no need to wrap in torch.tensor\n", + "speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2) # Normalize the embeddings\n", + "speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy() # Squeeze and convert to numpy array\n", + "speaker_embeddings = torch.tensor(np.array([speaker_embeddings])) # Convert back to tensor if necessary\n", + "\n", + "\n", + "input_text=''' This is Generated Audio,\n", + "India, a land of ancient wisdom and boundless potential, stands at the cusp of a new era. Our youth, the vibrant heartbeat of our nation, hold the key to unlocking this potential. They are the digital natives, the innovators, the dreamers who will shape the India of tomorrow.\n", + "\n", + "Knowledge is the most powerful weapon in today's world. It's not just about education, but about the ability to think critically, to adapt, and to innovate. Our youth, with their thirst for knowledge and access to technology, have the potential to become global leaders.\n", + "\n", + "The power of India lies in its diversity. It is our diversity that makes us unique, that fuels our creativity, and that drives our progress. Our youth, with their understanding of different cultures and perspectives, can bridge divides and foster unity.\n", + "\n", + "Technology is the catalyst for change. It has the power to transform lives, to create opportunities, and to address challenges. Our youth, with their expertise in technology, can develop solutions that benefit society as a whole.\n", + "\n", + "I believe in the potential of India's youth. I believe in their ability to build a nation that is prosperous, inclusive, and sustainable. Let us empower them, support their dreams, and provide them with the resources they need to succeed. 
Together, we can create an India that is a beacon of hope for the world.\n", + "This is Generated Audio,\n", + " '''\n", + "\n", + "\n", + "def split_text_by_length(text, max_length=60):#from the paper speech_t5 max char length 120 char \"max_length=60\"\n", + " # Splits the text into chunks of max_length, preserving words\n", + " words = text.split()\n", + " result = []\n", + " current_line = []\n", + "\n", + " for word in words:\n", + " # Check if adding the next word exceeds the maximum length\n", + " if len(' '.join(current_line + [word])) > max_length:\n", + " result.append(' '.join(current_line))\n", + " current_line = [word]\n", + " else:\n", + " current_line.append(word)\n", + " \n", + " # Add the last remaining part\n", + " if current_line:\n", + " result.append(' '.join(current_line))\n", + " \n", + " return result\n", + "\n", + "\n", + "\n", + "splited_text=split_text_by_length(input_text,max_length=80)\n", + "print(splited_text)\n", + "\n", + "all_speech = []\n", + "\n", + "# Assuming splited_text is already defined\n", + "for i in splited_text:\n", + "\n", + " inputs = processor(text=i, return_tensors=\"pt\")\n", + " speech_chunk = model.generate_speech(inputs[\"input_ids\"], speaker_embeddings, vocoder=vocoder) \n", + " if isinstance(speech_chunk, torch.Tensor):\n", + " speech_chunk = speech_chunk.cpu().numpy()\n", + "\n", + " # Apply noise reduction to each speech chunk\n", + " reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=16000) # assuming 16kHz sample rate\n", + "\n", + " all_speech.append(reduced_noise_chunk)\n", + "\n", + "# Concatenate the noise-reduced speech chunks\n", + "concatenated_speech = np.concatenate(all_speech)\n", + "\n", + "# Display the final audio with noise reduced\n", + "Audio(concatenated_speech, rate=16000)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93c19b2c-0479-4a39-8632-4fc29f67abaa", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}
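Follow-up note: the notebook imports soundfile as sf but never calls it, and the concatenated audio is only played inline. If the result should also be written to disk, a minimal sketch for an additional cell could look like the lines below; the path output/generated_speech.wav is an illustrative assumption, and 16000 matches the 16 kHz rate used throughout the notebook.

import os
import soundfile as sf

out_path = "output/generated_speech.wav"  # hypothetical output location, adjust as needed
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# concatenated_speech is the NumPy array built in the main cell above
sf.write(out_path, concatenated_speech, 16000)

sf.write infers the WAV container from the file extension, so no further format arguments are needed.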