|
|
|
import gradio as gr |
|
import torch |
|
import librosa |
|
from pathlib import Path |
|
import tempfile, torchaudio |
|
|
|
|
|
|
|
# Fetch the English MARS5 model together with its inference-config class via
# torch.hub. This runs once at import time, so every request reuses the same
# loaded model.
# trust_repo=True loads the repository without torch.hub's interactive trust
# confirmation prompt.
mars5, config_class = torch.hub.load(
    'Camb-ai/mars5-tts',
    'mars5_english',
    trust_repo=True,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
def synthesize(text, audio_file, transcript):
    """Clone the voice in ``audio_file`` and use it to speak ``text``.

    Args:
        text: The text to synthesize.
        audio_file: Filesystem path of the reference recording whose voice
            is cloned.
        transcript: Transcript of the reference recording; it is forwarded
            to ``mars5.tts`` together with the reference waveform.

    Returns:
        The path (as a string) of a newly created ``.wav`` file containing
        the synthesized audio.

    Raises:
        gr.Error: If ``text`` is empty/blank or no reference audio was
            supplied, so the UI shows a readable message instead of an
            opaque failure from deep inside the audio loader or the model.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter the text to synthesize.")
    if not audio_file:
        raise gr.Error("Please upload an audio file to clone the voice from.")

    # Load the reference clip as mono, resampled to the model's sample rate.
    ref_wav, _ = librosa.load(audio_file, sr=mars5.sr, mono=True)
    ref_wav = torch.from_numpy(ref_wav)

    cfg = config_class(
        deep_clone=True,
        rep_penalty_window=100,
        top_k=100,
        temperature=0.7,
        freq_penalty=3,
    )

    # Only the waveform is needed; the first returned value is discarded.
    _, wav_out = mars5.tts(text, ref_wav, transcript, cfg=cfg)

    # tempfile.mktemp() only returns a name and is deprecated because another
    # process can claim that name before it is used. NamedTemporaryFile
    # creates the file atomically; delete=False keeps it on disk after the
    # handle is closed so the caller can read it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = Path(tmp.name)

    # Write after the handle is closed so the file is not held open twice.
    torchaudio.save(str(output_path), wav_out.unsqueeze(0), mars5.sr)

    return str(output_path)
|
|
|
|
|
# Build the demo UI: the three inputs are passed to `synthesize` in the order
# listed (text, reference audio path, reference transcript), and the path it
# returns is rendered by the output audio component.
interface = gr.Interface(
    title="MARS5 TTS Demo",
    description="Enter text and upload an audio file to clone the voice and generate synthesized speech using MARS5 TTS.",
    fn=synthesize,
    inputs=[
        gr.Textbox(label="Text to synthesize"),
        # type="filepath" hands the callback a path on disk rather than raw samples.
        gr.Audio(label="Audio file to clone from", type="filepath"),
        gr.Textbox(label="Uploaded audio file transcript"),
    ],
    outputs=gr.Audio(label="Synthesized Audio"),
)

# Start the Gradio app.
interface.launch()