krishna195's picture
Create app.py
10bf035 verified
raw
history blame
1.78 kB
# Install necessary libraries (if not installed)
# !pip install gradio transformers soundfile torch
import tempfile

import gradio as gr
import soundfile as sf
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan
# Hub identifiers of the three pretrained components used by this app.
_TTS_CHECKPOINT = "krishna195/speecht5_krishna_finatuned"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"
_PROCESSOR_CHECKPOINT = "microsoft/speecht5_tts"

# Width of the speaker-embedding vector passed to the model.
_SPEAKER_EMBEDDING_DIM = 512

# Load everything once at import time so each request only pays for inference.
processor = SpeechT5Processor.from_pretrained(_PROCESSOR_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)

# Placeholder speaker embedding: unseeded random values of shape (1, 512), so
# the vector differs on every process start.
# NOTE(review): replace with a real speaker embedding if a specific, stable
# voice is required.
speaker_embeddings = torch.randn(1, _SPEAKER_EMBEDDING_DIM)
# Function to generate speech from input text
def text_to_speech(input_text):
    """Synthesize speech for ``input_text`` and return the path to a WAV file.

    Args:
        input_text: Text to speak. ``None``, empty, or whitespace-only text
            produces no audio.

    Returns:
        Path to a newly created WAV file written at 16000 Hz, or ``None``
        when there is nothing to synthesize.
    """
    # Guard clause: do not run the model on blank input.
    if input_text is None or not input_text.strip():
        return None

    # Tokenize the text into model input ids.
    inputs = processor(text=input_text, return_tensors="pt")
    input_ids = inputs["input_ids"]

    # Bound the sequence length when the config advertises a limit.
    # NOTE(review): assumes `max_text_positions` is the text encoder's
    # position capacity, as in the SpeechT5 config docs -- confirm.
    max_positions = getattr(model.config, "max_text_positions", None)
    if max_positions is not None:
        input_ids = input_ids[..., :max_positions]

    # Inference only: make sure no autograd graph is recorded.
    with torch.no_grad():
        speech = model.generate_speech(
            input_ids, speaker_embeddings, vocoder=vocoder
        )

    # Each call gets its own file. A single fixed filename would let
    # overlapping requests overwrite each other's audio. The file is kept
    # (delete=False) so Gradio can read it after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_file = tmp.name

    # detach()/cpu() make the conversion safe whatever device or grad state
    # the waveform tensor has.
    sf.write(output_file, speech.detach().cpu().numpy(), 16000)

    # Gradio's audio output plays the file at the returned path.
    return output_file
# Build the Gradio UI: a text box feeding text_to_speech, with an audio
# player showing the result.
iface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title="Text to Speech Generator",
    description=(
        "Enter the text you want to convert to speech, and the model will "
        "generate the corresponding speech."
    ),
    # One-click sample prompts shown under the input box.
    examples=[
        ["Hello, how are you doing today?"],
        ["The CUDA programming model allows parallel computing on GPUs."],
        ["TensorFlow and PyTorch are popular machine learning frameworks."],
    ],
)

# Start the web server only when executed as a script, so importing this
# module (e.g. to reuse `iface` or `text_to_speech`) does not block on a
# running server.
if __name__ == "__main__":
    iface.launch()