In [2]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
from IPython.display import Audio
from datasets import load_dataset
import noisereduce as nr
import soundfile as sf
import os, torchaudio
import numpy as np
import torch


# Folder (or Hub id) of the fine-tuned SpeechT5 checkpoint. The processor and
# the model are both loaded from it, so the path is defined once here to keep
# the two in sync -- replace it with your own model folder.
CHECKPOINT_DIR = "checkpoint-60000"

processor = SpeechT5Processor.from_pretrained(CHECKPOINT_DIR)
# Split special-token strings that occur in the input text instead of keeping
# each one as a single special token.
processor.tokenizer.split_special_tokens = True
model = SpeechT5ForTextToSpeech.from_pretrained(CHECKPOINT_DIR)
# Vocoder handed to model.generate_speech() during synthesis.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Default speaker x-vector taken from the CMU-Arctic embeddings dataset.
# NOTE(review): in the visible part of this file `speaker_embeddings` is
# reassigned from a reference recording before it is used, so this value acts
# only as a fallback -- confirm whether the dataset load is still needed.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
# Sample rate the x-vector encoder is fed with.
# NOTE(review): 16 kHz mono is taken from the encoder's model card -- confirm
# if a different speaker model is substituted.
SPEAKER_MODEL_SAMPLE_RATE = 16000

device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)

# Reference recording of the target voice; torchaudio returns a
# (channels, frames) tensor together with the file's sample rate.
signal, fs = torchaudio.load('wavs/converted_ratan_tata_tts_200.wav')

# encode_batch() treats the first dimension as the batch, so a stereo file
# would otherwise produce one embedding per channel. Downmix to mono first.
reference_wave = signal
if reference_wave.shape[0] > 1:
    reference_wave = reference_wave.mean(dim=0, keepdim=True)
# Bring the recording to the rate the encoder expects instead of silently
# feeding it audio at whatever rate the file happens to have.
if fs != SPEAKER_MODEL_SAMPLE_RATE:
    reference_wave = torchaudio.functional.resample(
        reference_wave, orig_freq=fs, new_freq=SPEAKER_MODEL_SAMPLE_RATE
    )

# Inference only: no autograd graph is needed for the embedding.
with torch.no_grad():
    speaker_embeddings = speaker_model.encode_batch(reference_wave)
    # L2-normalize along the embedding dimension.
    speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)

# Drop the singleton dimensions and restore a leading batch dimension, giving
# a CPU tensor of shape (1, embedding_dim).
speaker_embeddings = speaker_embeddings.squeeze().cpu().unsqueeze(0)


# Text to synthesize. It is split on whitespace into short chunks before being
# passed to the model, so the line breaks and blank lines inside this literal
# do not survive into the chunks. The phrase "This is Generated Audio," is
# placed at both the start and the end of the text.
input_text=''' This is Generated Audio,
India, a land of ancient wisdom and boundless potential, stands at the cusp of a new era. Our youth, the vibrant heartbeat of our nation, hold the key to unlocking this potential. They are the digital natives, the innovators, the dreamers who will shape the India of tomorrow.

Knowledge is the most powerful weapon in today's world. It's not just about education, but about the ability to think critically, to adapt, and to innovate. Our youth, with their thirst for knowledge and access to technology, have the potential to become global leaders.

The power of India lies in its diversity. It is our diversity that makes us unique, that fuels our creativity, and that drives our progress. Our youth, with their understanding of different cultures and perspectives, can bridge divides and foster unity.

Technology is the catalyst for change. It has the power to transform lives, to create opportunities, and to address challenges. Our youth, with their expertise in technology, can develop solutions that benefit society as a whole.

I believe in the potential of India's youth. I believe in their ability to build a nation that is prosperous, inclusive, and sustainable. Let us empower them, support their dreams, and provide them with the resources they need to succeed. Together, we can create an India that is a beacon of hope for the world.
This is Generated Audio,
 '''


def split_text_by_length(text: str, max_length: int = 60) -> list:
    """Split *text* into chunks of at most *max_length* characters.

    The text is split on whitespace and words are greedily packed into
    chunks joined by single spaces; words are never broken. A single word
    longer than *max_length* therefore becomes a chunk of its own that
    exceeds the limit.

    Args:
        text: Text to split. Any run of whitespace (including newlines)
            counts as one separator.
        max_length: Maximum number of characters per chunk.
            NOTE(review): the original author's comment ties the default
            of 60 to a SpeechT5 input limit of about 120 characters --
            confirm against the model actually in use.

    Returns:
        The chunks in order. Empty or whitespace-only input gives an
        empty list; no chunk is ever an empty string.
    """
    result = []
    current_line = []
    # Length of ' '.join(current_line), maintained incrementally so the
    # candidate line is not re-joined for every word.
    current_length = 0

    for word in text.split():
        # One extra character for the joining space unless the line is empty.
        candidate_length = current_length + len(word) + (1 if current_line else 0)
        # Only flush a non-empty line: an over-long first word must not
        # emit an empty chunk ahead of itself.
        if current_line and candidate_length > max_length:
            result.append(' '.join(current_line))
            current_line = [word]
            current_length = len(word)
        else:
            current_line.append(word)
            current_length = candidate_length

    # Add the last remaining part.
    if current_line:
        result.append(' '.join(current_line))

    return result



# Sample rate used for noise reduction and playback.
# NOTE(review): 16 kHz is assumed to be the vocoder's output rate, as in the
# original comment -- confirm if a different vocoder is used.
SAMPLE_RATE = 16000

# Synthesize in short chunks rather than feeding the whole text at once.
splited_text = split_text_by_length(input_text, max_length=80)
print(splited_text)

all_speech = []

for text_chunk in splited_text:
    # Never send an empty chunk to the model.
    if not text_chunk.strip():
        continue

    inputs = processor(text=text_chunk, return_tensors="pt")
    speech_chunk = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )
    if isinstance(speech_chunk, torch.Tensor):
        speech_chunk = speech_chunk.cpu().numpy()

    # Apply noise reduction to each speech chunk.
    reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=SAMPLE_RATE)
    all_speech.append(reduced_noise_chunk)

# np.concatenate() rejects an empty list with a generic message; fail with the
# same exception type but say what actually went wrong.
if not all_speech:
    raise ValueError("input_text produced no non-empty chunks to synthesize")

# Concatenate the noise-reduced speech chunks.
concatenated_speech = np.concatenate(all_speech)

# Display the final audio (must remain the last expression of the cell).
Audio(concatenated_speech, rate=SAMPLE_RATE)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['This is Generated Audio, India, a land of ancient wisdom and boundless', 'potential, stands at the cusp of a new era. Our youth, the vibrant heartbeat of', 'our nation, hold the key to unlocking this potential. They are the digital', 'natives, the innovators, the dreamers who will shape the India of tomorrow.', "Knowledge is the most powerful weapon in today's world. It's not just about", 'education, but about the ability to think critically, to adapt, and to innovate.', 'Our youth, with their thirst for knowledge and access to technology, have the', 'potential to become global leaders. The power of India lies in its diversity. It', 'is our diversity that makes us unique, that fuels our creativity, and that', 'drives our progress. Our youth, with their understanding of different cultures', 'and perspectives, can bridge divides and foster unity. Technology is the', 'catalyst for change. It has the power to transform lives, to create', 'opportunities, and to address challenges. Our you