import tempfile
from pathlib import Path

import gradio as gr
import librosa
import torch
import torchaudio
from transformers import pipeline
# Load the MARS5 model
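# torch.hub.load returns the MARS5 model instance and the class used to build its inference config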
mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
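# Whisper ASR model, used to auto-transcribe the reference clip when no transcript is provided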
asr_model = pipeline(
"automatic-speech-recognition",
model="openai/whisper-tiny",
chunk_length_s=30,
device=torch.device("cuda:0"),
)
def transcribe_file(f: str) -> str:
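    """Transcribe an audio file with Whisper and return the joined chunk texts."""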
predictions = asr_model(f, return_timestamps=True)["chunks"]
print(f">>>>>. predictions: {predictions}")
return " ".join([prediction["text"] for prediction in predictions])
# Function to process the text and audio input and generate the synthesized output
def synthesize(text, audio_file, transcript, kwargs_dict):
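    """Clone the voice in audio_file and synthesize text with MARS5, using kwargs_dict as inference settings."""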
print(f">>>>>>> Kwargs dict: {kwargs_dict}")
print(f">>>>> synthesizing! audio_file: {audio_file}")
if not transcript:
transcript = transcribe_file(audio_file)
# Load the reference audio
wav, sr = librosa.load(audio_file, sr=mars5.sr, mono=True)
wav = torch.from_numpy(wav)
# Define the configuration for the TTS model
cfg = config_class(**kwargs_dict)
# Generate the synthesized audio
ar_codes, wav_out = mars5.tts(text, wav, transcript.strip(), cfg=cfg)
    # Save the synthesized audio to a temporary file (NamedTemporaryFile avoids the deprecated tempfile.mktemp)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = Path(tmp.name)
    torchaudio.save(str(output_path), wav_out.unsqueeze(0), mars5.sr)
    return str(output_path)
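# Default MARS5 inference settings, surfaced as the "Advanced Settings" controls below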
defaults = {
'temperature': 0.8,
'top_k': -1,
'top_p': 0.2,
'typical_p': 1.0,
'freq_penalty': 2.6,
'presence_penalty': 0.4,
'rep_penalty_window': 100,
'max_prompt_phones': 360,
'deep_clone': True,
'nar_guidance_w': 3
}
with gr.Blocks() as demo:
gr.Markdown("## MARS5 TTS Demo\nEnter text and upload an audio file to clone the voice and generate synthesized speech using MARS5 TTS.")
text = gr.Textbox(label="Text to synthesize")
audio_file = gr.Audio(label="Audio file to clone from", type="filepath")
generate_btn = gr.Button("Generate Synthesized Audio")
with gr.Accordion("Advanced Settings", open=False):
gr.Markdown("additional inference settings\nWARNING: changing these incorrectly may degrade quality.")
prompt_text = gr.Textbox(label="Transcript of voice reference")
temperature = gr.Slider(minimum=0.01, maximum=3, step=0.01, label="temperature", value=defaults['temperature'])
top_k = gr.Slider(minimum=-1, maximum=2000, step=1, label="top_k", value=defaults['top_k'])
top_p = gr.Slider(minimum=0.01, maximum=1.0, step=0.01, label="top_p", value=defaults['top_p'])
typical_p = gr.Slider(minimum=0.01, maximum=1, step=0.01, label="typical_p", value=defaults['typical_p'])
freq_penalty = gr.Slider(minimum=0, maximum=5, step=0.05, label="freq_penalty", value=defaults['freq_penalty'])
presence_penalty = gr.Slider(minimum=0, maximum=5, step=0.05, label="presence_penalty", value=defaults['presence_penalty'])
rep_penalty_window = gr.Slider(minimum=1, maximum=500, step=1, label="rep_penalty_window", value=defaults['rep_penalty_window'])
nar_guidance_w = gr.Slider(minimum=1, maximum=8, step=0.1, label="nar_guidance_w", value=defaults['nar_guidance_w'])
deep_clone = gr.Checkbox(value=defaults['deep_clone'], label='deep_clone')
output = gr.Audio(label="Synthesized Audio", type="filepath")
def on_click(
text,
audio_file,
prompt_text,
temperature,
top_k,
top_p,
typical_p,
freq_penalty,
presence_penalty,
rep_penalty_window,
nar_guidance_w,
deep_clone
):
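        """Collect the UI values into a settings dict and run synthesis."""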
print(f">>>> transcript: {prompt_text}; audio_file = {audio_file}")
of = synthesize(
text,
audio_file,
prompt_text,
{
'temperature': temperature,
'top_k': top_k,
'top_p': top_p,
'typical_p': typical_p,
'freq_penalty': freq_penalty,
'presence_penalty': presence_penalty,
'rep_penalty_window': rep_penalty_window,
'nar_guidance_w': nar_guidance_w,
'deep_clone': deep_clone
}
)
print(f">>>> output file: {of}")
return of
generate_btn.click(
on_click,
inputs=[
text,
audio_file,
prompt_text,
temperature,
top_k,
top_p,
typical_p,
freq_penalty,
presence_penalty,
rep_penalty_window,
nar_guidance_w,
deep_clone
],
outputs=[output]
)
    # Add examples, reusing the default settings in slider order (avoids rebinding `defaults` to a list)
    example_defaults = [
        defaults[k] for k in (
            'temperature', 'top_k', 'top_p', 'typical_p', 'freq_penalty',
            'presence_penalty', 'rep_penalty_window', 'nar_guidance_w', 'deep_clone'
        )
    ]
    examples = [
        ["Today is a wonderful day!", "female_speaker_1.flac", "People look, but no one ever finds it.", *example_defaults],
        ["You guys need to figure this out.", "male_speaker_1.flac", "Ask her to bring these things with her from the store.", *example_defaults]
    ]
gr.Examples(
examples=examples,
inputs=[text, audio_file, prompt_text, temperature, top_k, top_p, typical_p, freq_penalty, presence_penalty, rep_penalty_window, nar_guidance_w, deep_clone],
outputs=[output],
cache_examples=False,
fn=on_click
)
demo.launch(share=False)