"""Gradio demo for MARS5 text-to-speech voice cloning.

Loads the MARS5 English model through ``torch.hub`` and serves a small web UI
in which the user supplies the text to speak, a reference audio clip, and
(optionally) the transcript of that clip.
"""

import tempfile
from pathlib import Path

import gradio as gr
import librosa
import torch
import torchaudio

# Load the MARS5 model once at start-up so every request reuses it.
mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)

# Default reference audio and transcript (currently unused example values)
# default_audio_path = "example.wav"
# default_transcript = "We actually haven't managed to meet demand."


def synthesize(text, audio_file, transcript):
    """Generate speech for *text* in the voice of the reference clip.

    Args:
        text: The sentence(s) to synthesize.
        audio_file: Filesystem path of the reference audio, as supplied by the
            ``gr.Audio(type="filepath")`` input; ``None`` when nothing was
            uploaded.
        transcript: Transcript of the reference audio. When blank, the model
            is run with ``deep_clone=False`` instead of a deep clone.

    Returns:
        Path (as ``str``) of a temporary ``.wav`` file with the synthesized
        audio.

    Raises:
        gr.Error: If *text* is blank or no reference audio was provided.
    """
    # Fail early with a message the UI can display instead of an opaque
    # traceback from librosa or the model.
    if not text or not text.strip():
        raise gr.Error("Please enter the text to synthesize.")
    if not audio_file:
        raise gr.Error("Please upload a reference audio file to clone from.")

    # Load the reference audio as mono, resampled to the model's sample rate.
    ref_wav, _ = librosa.load(audio_file, sr=mars5.sr, mono=True)
    ref_wav = torch.from_numpy(ref_wav)

    # Deep cloning relies on the reference transcript; per the MARS5 README,
    # deep_clone should be False when the prompt transcript is unknown.
    ref_transcript = (transcript or "").strip()
    deep_clone = bool(ref_transcript)
    cfg = config_class(
        deep_clone=deep_clone,
        rep_penalty_window=100,
        top_k=100,
        temperature=0.7,
        freq_penalty=3,
    )

    # Generate the synthesized audio; only the waveform is needed here.
    _, wav_out = mars5.tts(text, ref_wav, ref_transcript, cfg=cfg)

    # tempfile.mktemp() is deprecated and race-prone: it only returns a name,
    # which another process could claim first. NamedTemporaryFile creates the
    # file atomically; delete=False keeps it on disk so Gradio can serve it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = Path(tmp.name)

    # The handle is closed before writing so the save also works on platforms
    # that forbid opening the same file twice.
    torchaudio.save(str(output_path), wav_out.unsqueeze(0), mars5.sr)

    return str(output_path)


# Create the Gradio interface
interface = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Textbox(label="Text to synthesize"),
        gr.Audio(label="Audio file to clone from", type="filepath"),
        gr.Textbox(label="Uploaded audio file transcript"),
    ],
    outputs=gr.Audio(label="Synthesized Audio"),
    title="MARS5 TTS Demo",
    description="Enter text and upload an audio file to clone the voice and generate synthesized speech using MARS5 TTS."
)

# Launch the Gradio app
interface.launch()