import os
import re
import tempfile
from collections import OrderedDict

import click
import gradio as gr
import nltk
import numpy as np
import soundfile as sf
import torchaudio
from huggingface_hub import hf_hub_download

from sentence_analyzer import SentenceAnalyzer

from f5_tts.model import DiT
from f5_tts.infer.utils_infer import (
load_vocoder,
load_model,
preprocess_ref_audio_text,
infer_process,
remove_silence_for_generated_wav,
save_spectrogram,
)
try:
import spaces
USING_SPACES = True
except ImportError:
USING_SPACES = False
# Download the punkt_tab sentence-tokenizer data used for sentence splitting
nltk.download("punkt_tab")
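
# Wrap GPU-bound functions with spaces.GPU when running on Hugging Face Spaces;
# locally this is a no-op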
def gpu_decorator(func):
if USING_SPACES:
return spaces.GPU(func)
else:
return func
# Load the vocoder
vocoder = load_vocoder()
def load_f5tts():
    # Read the model repository id and checkpoint filename from environment variables.
    # Note: the subfolder belongs in the filename, not in the repo id.
    repo_id = os.getenv("MODEL_REPO_ID", "SWivid/F5-TTS")
    filename = os.getenv("MODEL_FILENAME", "F5TTS_Base/model_1200000.safetensors")
    token = os.getenv("HUGGINGFACE_TOKEN")

    # Fail early if the token is missing (required for private repositories)
    if not token:
        raise ValueError("The 'HUGGINGFACE_TOKEN' environment variable is not set.")

    # Download the checkpoint ('use_auth_token' is deprecated in huggingface_hub; use 'token')
    ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename, token=token)

    F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
    return load_model(DiT, F5TTS_model_cfg, ckpt_path)
# Load the F5-TTS model
F5TTS_ema_model = load_f5tts()
@gpu_decorator
def infer(
ref_audio_orig, ref_text, gen_text, remove_silence, cross_fade_duration=0.15, speed=1, show_info=gr.Info
):
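    """Synthesize gen_text in the voice of ref_audio_orig.

    Returns ((sample_rate, waveform), spectrogram_path, ref_text), where ref_text
    is the reference text actually used (auto-transcribed when left blank).
    """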
ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
ema_model = F5TTS_ema_model
final_wave, final_sample_rate, combined_spectrogram = infer_process(
ref_audio,
ref_text.lower().strip(),
gen_text.lower().strip(),
ema_model,
vocoder,
cross_fade_duration=cross_fade_duration,
speed=speed,
show_info=show_info,
progress=gr.Progress(),
)
    # Optionally remove silences from the generated audio
if remove_silence:
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
sf.write(f.name, final_wave, final_sample_rate)
remove_silence_for_generated_wav(f.name)
final_wave, _ = torchaudio.load(f.name)
final_wave = final_wave.squeeze().cpu().numpy()
    # Save the spectrogram to a temporary PNG
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
spectrogram_path = tmp_spectrogram.name
save_spectrogram(combined_spectrogram, spectrogram_path)
return (final_sample_rate, final_wave), spectrogram_path, ref_text
# CSS styles
custom_css = """
#sentences-container {
border: 1px solid #ddd;
border-radius: 4px;
padding: 10px;
margin-bottom: 10px;
}
.sentence-box {
border: 1px solid #eee;
padding: 5px;
margin-bottom: 5px;
border-radius: 4px;
background-color: #f9f9f9;
}
"""
with gr.Blocks(css=custom_css) as app:
with gr.Tabs():
with gr.Tab("TTS Básico"):
gr.Markdown("# TTS Básico com F5-TTS")
ref_audio_input = gr.Audio(label="Áudio de Referência", type="filepath")
gen_text_input = gr.Textbox(label="Texto para Gerar", lines=10)
generate_btn = gr.Button("Sintetizar", variant="primary")
# Container para as sentenças
with gr.Row(elem_id="sentences-container"):
sentences_output = gr.Textbox(label="Sentenças", lines=10, interactive=False)
with gr.Accordion("Configurações Avançadas", open=False):
ref_text_input = gr.Textbox(
label="Texto de Referência",
info="Deixe em branco para transcrever automaticamente o áudio de referência. Se você inserir texto, ele substituirá a transcrição automática.",
lines=2,
)
remove_silence = gr.Checkbox(
label="Remover Silêncios",
info="O modelo tende a produzir silêncios, especialmente em áudios mais longos. Podemos remover manualmente os silêncios, se necessário. Isso também aumentará o tempo de geração.",
value=False,
)
speed_slider = gr.Slider(
label="Velocidade",
minimum=0.3,
maximum=2.0,
value=1.0,
step=0.1,
info="Ajuste a velocidade do áudio.",
)
cross_fade_duration_slider = gr.Slider(
label="Duração do Cross-fade (s)",
minimum=0.0,
maximum=1.0,
value=0.15,
step=0.01,
info="Defina a duração do cross-fade entre os clipes de áudio.",
)
audio_output = gr.Audio(label="Áudio Sintetizado")
spectrogram_output = gr.Image(label="Espectrograma")
analyzer = SentenceAnalyzer()
@gpu_decorator
def basic_tts(
ref_audio_input,
ref_text_input,
gen_text_input,
remove_silence,
cross_fade_duration_slider,
speed_slider,
):
                # Split the text into sentences
                sentences = analyzer.split_into_sentences(gen_text_input)

                # Format the sentences as .sentence-box divs; the markup is returned
                # below so Gradio updates the sentences_output component
                formatted_sentences = "".join(
                    f'<div class="sentence-box">{sentence}</div>' for sentence in sentences
                )
                # Generate audio for each sentence individually
                audio_segments = []
                spectrogram_path = None
                ref_text_out = ref_text_input
                for sentence in sentences:
                    audio_out, spectrogram_path, ref_text_out = infer(
                        ref_audio_input,
                        ref_text_out,  # reuse the transcription from the first pass
                        sentence,  # generate audio for the current sentence
                        remove_silence,
                        cross_fade_duration_slider,
                        speed_slider,
                    )
                    sr, audio_data = audio_out
                    audio_segments.append(audio_data)
                # Concatenate the per-sentence audio segments
                if audio_segments:
                    final_audio_data = np.concatenate(audio_segments)
                    return (sr, final_audio_data), spectrogram_path, gr.update(value=ref_text_out), formatted_sentences
                else:
                    gr.Warning("No audio was generated.")
                    return None, None, gr.update(value=ref_text_out), formatted_sentences
generate_btn.click(
basic_tts,
inputs=[
ref_audio_input,
ref_text_input,
gen_text_input,
remove_silence,
cross_fade_duration_slider,
speed_slider,
],
                outputs=[audio_output, spectrogram_output, ref_text_input, sentences_output],
)
with gr.Tab("Multi-Speech"):
gr.Markdown("# Geração Multi-Speech com F5-TTS")
# Regular speech type (mandatory)
with gr.Row():
with gr.Column():
regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
regular_insert = gr.Button("Insert Label", variant="secondary")
regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
            # Additional speech types (up to max_speech_types total, including Regular)
            max_speech_types = 100
            speech_type_rows = []  # 99 hidden rows for the additional types
            speech_type_names = [regular_name]  # 100 name textboxes
            speech_type_audios = [regular_audio]  # 100 audio inputs
            speech_type_ref_texts = [regular_ref_text]  # 100 reference-text boxes
            speech_type_delete_btns = []  # 99 delete buttons (Regular cannot be deleted)
            speech_type_insert_btns = [regular_insert]  # 100 insert buttons
# Additional speech types (99 more)
for i in range(max_speech_types - 1):
with gr.Row(visible=False) as row:
with gr.Column():
name_input = gr.Textbox(label="Speech Type Name")
delete_btn = gr.Button("Delete Type", variant="secondary")
insert_btn = gr.Button("Insert Label", variant="secondary")
audio_input = gr.Audio(label="Reference Audio", type="filepath")
ref_text_input = gr.Textbox(label="Reference Text", lines=2)
speech_type_rows.append(row)
speech_type_names.append(name_input)
speech_type_audios.append(audio_input)
speech_type_ref_texts.append(ref_text_input)
speech_type_delete_btns.append(delete_btn)
speech_type_insert_btns.append(insert_btn)
# Button to add speech type
add_speech_type_btn = gr.Button("Add Speech Type")
# Keep track of current number of speech types
speech_type_count = gr.State(value=1)
# Function to add a speech type
def add_speech_type_fn(speech_type_count):
if speech_type_count < max_speech_types:
speech_type_count += 1
# Prepare updates for the rows
row_updates = []
for i in range(1, max_speech_types):
if i < speech_type_count:
row_updates.append(gr.update(visible=True))
else:
row_updates.append(gr.update())
                else:
                    # Show a warning when the limit is reached
                    gr.Warning("Maximum number of speech types reached.")
                    row_updates = [gr.update() for _ in range(1, max_speech_types)]
return [speech_type_count] + row_updates
add_speech_type_btn.click(
add_speech_type_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows
)
            # Function to delete a speech type
            def make_delete_speech_type_fn(index):
                def delete_speech_type_fn(speech_type_count):
                    # Prepare updates: hide the deleted row, leave the rest unchanged
                    row_updates = []
                    for i in range(1, max_speech_types):
                        if i == index:
                            row_updates.append(gr.update(visible=False))
                        else:
                            row_updates.append(gr.update())
                    # Decrement the count, but never below the mandatory Regular type
                    speech_type_count = max(1, speech_type_count - 1)
                    return [speech_type_count] + row_updates

                return delete_speech_type_fn
            # Wire up the delete buttons. Button i belongs to additional row i, which is
            # index i + 1 in the update loop above (index 0 is the mandatory Regular type).
            for i, delete_btn in enumerate(speech_type_delete_btns):
                delete_fn = make_delete_speech_type_fn(i + 1)
                delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
# Text input for the prompt
gen_text_input_multistyle = gr.Textbox(
label="Text to Generate",
lines=10,
placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
)
def make_insert_speech_type_fn(index):
def insert_speech_type_fn(current_text, speech_type_name):
current_text = current_text or ""
speech_type_name = speech_type_name or "None"
updated_text = current_text + f"{{{speech_type_name}}} "
return gr.update(value=updated_text)
return insert_speech_type_fn
for i, insert_btn in enumerate(speech_type_insert_btns):
insert_fn = make_insert_speech_type_fn(i)
insert_btn.click(
insert_fn,
inputs=[gen_text_input_multistyle, speech_type_names[i]],
outputs=gen_text_input_multistyle,
)
with gr.Accordion("Advanced Settings", open=False):
remove_silence_multistyle = gr.Checkbox(
label="Remove Silences",
value=True,
)
# Generate button
generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
# Output audio
audio_output_multistyle = gr.Audio(label="Synthesized Audio")
@gpu_decorator
def generate_multistyle_speech(
gen_text,
*args,
):
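                # args is the flat input list wired up in generate_multistyle_btn.click below:
                # 100 names, then 100 audios, then 100 reference texts, then the remove-silence flag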
speech_type_names_list = args[:max_speech_types]
speech_type_audios_list = args[max_speech_types : 2 * max_speech_types]
speech_type_ref_texts_list = args[2 * max_speech_types : 3 * max_speech_types]
remove_silence = args[3 * max_speech_types]
# Collect the speech types and their audios into a dict
speech_types = OrderedDict()
ref_text_idx = 0
for name_input, audio_input, ref_text_input in zip(
speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list
):
if name_input and audio_input:
speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text_input}
else:
speech_types[f"@{ref_text_idx}@"] = {"audio": "", "ref_text": ""}
ref_text_idx += 1
# Parse the gen_text into segments
segments = parse_speechtypes_text(gen_text)
# For each segment, generate speech
generated_audio_segments = []
current_style = "Regular"
for segment in segments:
style = segment["style"]
text = segment["text"]
if style in speech_types:
current_style = style
else:
# If style not available, default to Regular
current_style = "Regular"
ref_audio = speech_types[current_style]["audio"]
ref_text = speech_types[current_style].get("ref_text", "")
# Generate speech for this segment
audio_out, _, ref_text_out = infer(
ref_audio, ref_text, text, remove_silence, 0, show_info=print
                )  # show_info=print keeps Gradio from scrolling the page to the top while generating
sr, audio_data = audio_out
generated_audio_segments.append(audio_data)
speech_types[current_style]["ref_text"] = ref_text_out
# Concatenate all audio segments
if generated_audio_segments:
final_audio_data = np.concatenate(generated_audio_segments)
return [(sr, final_audio_data)] + [
gr.update(value=speech_types[style]["ref_text"]) for style in speech_types
]
else:
gr.Warning("No audio generated.")
return [None] + [gr.update(value=speech_types[style]["ref_text"]) for style in speech_types]
generate_multistyle_btn.click(
generate_multistyle_speech,
inputs=[
gen_text_input_multistyle,
]
+ speech_type_names
+ speech_type_audios
+ speech_type_ref_texts
+ [
remove_silence_multistyle,
],
outputs=[audio_output_multistyle] + speech_type_ref_texts,
)
# Validation function to disable Generate button if speech types are missing
def validate_speech_types(gen_text, regular_name, *args):
speech_type_names_list = args[:max_speech_types]
# Collect the speech types names
speech_types_available = set()
if regular_name:
speech_types_available.add(regular_name)
for name_input in speech_type_names_list:
if name_input:
speech_types_available.add(name_input)
# Parse the gen_text to get the speech types used
segments = parse_speechtypes_text(gen_text)
speech_types_in_text = set(segment["style"] for segment in segments)
# Check if all speech types in text are available
missing_speech_types = speech_types_in_text - speech_types_available
if missing_speech_types:
# Disable the generate button
return gr.update(interactive=False)
else:
# Enable the generate button
return gr.update(interactive=True)
gen_text_input_multistyle.change(
validate_speech_types,
inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
outputs=generate_multistyle_btn,
)
@click.command()
@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
@click.option("--host", "-H", default=None, help="Host to run the app on")
@click.option(
"--share",
"-s",
default=False,
is_flag=True,
help="Share the app via Gradio share link",
)
@click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
def main(port, host, share, api):
global app
print("Starting app...")
app.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
if __name__ == "__main__":
if not USING_SPACES:
main()
else:
app.queue().launch()