# AidenTTS / app.py
# Last change: "Update app.py" (commit 59df2b8, verified) - arnavmehta7
# File size: 1.65 kB
import gradio as gr
import torch
import librosa
from pathlib import Path
import tempfile, torchaudio
# Load the MARS5 model once, at import time, so every synthesize() call
# reuses the same instance. torch.hub.load resolves the 'mars5_english'
# entry point of the 'Camb-ai/mars5-tts' hub repo and returns two objects:
# the model itself and the class used to build its inference configuration.
# NOTE: trust_repo=True skips torch.hub's interactive trust prompt, so code
# from that repo is executed without confirmation.
mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
# Default reference audio and transcript
# default_audio_path = "example.wav"
# default_transcript = "We actually haven't managed to meet demand."
# Function to process the text and audio input and generate the synthesized output
def synthesize(text, audio_file, transcript):
    """Clone the voice in ``audio_file`` and use it to speak ``text``.

    Args:
        text: The text to synthesize.
        audio_file: Filesystem path of the reference recording to clone
            from. Gradio passes ``None`` when the user submits without
            providing audio.
        transcript: Transcript of the reference recording; it is handed to
            the model together with the reference waveform.

    Returns:
        The path (as ``str``) of a newly created temporary ``.wav`` file
        containing the synthesized audio.

    Raises:
        gr.Error: If no reference audio was provided.
    """
    # Fail with a readable UI message instead of an opaque loader error.
    if not audio_file:
        raise gr.Error("Please provide a reference audio file to clone from.")

    # Load the reference audio, resampled to the model's rate and downmixed
    # to mono; the returned sample rate equals mars5.sr, so it is discarded.
    wav, _ = librosa.load(audio_file, sr=mars5.sr, mono=True)
    wav = torch.from_numpy(wav)

    # Inference configuration for the TTS model (deep clone uses the
    # transcript passed below).
    cfg = config_class(
        deep_clone=True,
        rep_penalty_window=100,
        top_k=100,
        temperature=0.7,
        freq_penalty=3,
    )

    # tts() returns a pair; only the waveform (second item) is needed here.
    _, wav_out = mars5.tts(text, wav, transcript, cfg=cfg)

    # Create the output file atomically. tempfile.mktemp() only picks a name
    # (deprecated, race-prone); NamedTemporaryFile actually creates the file.
    # delete=False keeps it on disk so Gradio can serve it after we return.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name

    # The handle is closed above, so torchaudio can reopen the path on any OS.
    torchaudio.save(output_path, wav_out.unsqueeze(0), mars5.sr)
    return output_path
# Build the demo UI. The three inputs are listed in the same order as the
# parameters of synthesize(): text, reference audio path, reference transcript.
interface = gr.Interface(
    synthesize,
    [
        gr.Textbox(label="Text to synthesize"),
        gr.Audio(label="Audio file to clone from", type="filepath"),
        gr.Textbox(label="Uploaded audio file transcript"),
    ],
    gr.Audio(label="Synthesized Audio"),
    description=(
        "Enter text and upload an audio file to clone the voice "
        "and generate synthesized speech using MARS5 TTS."
    ),
    title="MARS5 TTS Demo",
)

# Start serving the app.
interface.launch()