import os
import re
import tempfile
from collections import OrderedDict

import click
import gradio as gr
import nltk
import numpy as np
import soundfile as sf
import torchaudio
from huggingface_hub import hf_hub_download

from sentence_analyzer import SentenceAnalyzer

from f5_tts.model import DiT
from f5_tts.infer.utils_infer import (
load_vocoder,
load_model,
preprocess_ref_audio_text,
infer_process,
remove_silence_for_generated_wav,
save_spectrogram,
)
try:
import spaces
USING_SPACES = True
except ImportError:
USING_SPACES = False
# Download the punkt_tab sentence-tokenizer data used for sentence splitting
nltk.download("punkt_tab")
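
# Wrap GPU-bound functions with spaces.GPU when running on Hugging Face Spaces;
# locally this is a no-op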
def gpu_decorator(func):
if USING_SPACES:
return spaces.GPU(func)
else:
return func
# Load the vocoder
vocoder = load_vocoder()
def load_f5tts():
    # Read the model repository id and checkpoint filename from environment variables.
    # Note: the subfolder belongs in the filename, not in the repo id.
    repo_id = os.getenv("MODEL_REPO_ID", "SWivid/F5-TTS")
    filename = os.getenv("MODEL_FILENAME", "F5TTS_Base/model_1200000.safetensors")
    token = os.getenv("HUGGINGFACE_TOKEN")

    # Fail early if the token is missing (required for private repositories)
    if not token:
        raise ValueError("The 'HUGGINGFACE_TOKEN' environment variable is not set.")

    # Download the checkpoint ('use_auth_token' is deprecated in huggingface_hub; use 'token')
    ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename, token=token)

    F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
    return load_model(DiT, F5TTS_model_cfg, ckpt_path)
# Load the F5-TTS model
F5TTS_ema_model = load_f5tts()
@gpu_decorator
def infer(
ref_audio_orig, ref_text, gen_text, remove_silence, cross_fade_duration=0.15, speed=1, show_info=gr.Info
):
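    """Synthesize gen_text in the voice of ref_audio_orig.

    Returns ((sample_rate, waveform), spectrogram_path, ref_text), where ref_text
    is the reference text actually used (auto-transcribed when left blank).
    """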
ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
ema_model = F5TTS_ema_model
final_wave, final_sample_rate, combined_spectrogram = infer_process(
ref_audio,
ref_text.lower().strip(),
gen_text.lower().strip(),
ema_model,
vocoder,
cross_fade_duration=cross_fade_duration,
speed=speed,
show_info=show_info,
progress=gr.Progress(),
)
    # Optionally remove silences from the generated audio
if remove_silence:
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
sf.write(f.name, final_wave, final_sample_rate)
remove_silence_for_generated_wav(f.name)
final_wave, _ = torchaudio.load(f.name)
final_wave = final_wave.squeeze().cpu().numpy()
    # Save the spectrogram to a temporary PNG
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
spectrogram_path = tmp_spectrogram.name
save_spectrogram(combined_spectrogram, spectrogram_path)
return (final_sample_rate, final_wave), spectrogram_path, ref_text
# CSS styles
custom_css = """
#sentences-container {
border: 1px solid #ddd;
border-radius: 4px;
padding: 10px;
margin-bottom: 10px;
}
.sentence-box {
border: 1px solid #eee;
padding: 5px;
margin-bottom: 5px;
border-radius: 4px;
background-color: #f9f9f9;
}
"""
with gr.Blocks(css=custom_css) as app:
with gr.Tabs():
with gr.Tab("TTS Básico"):
gr.Markdown("# TTS Básico com F5-TTS")
ref_audio_input = gr.Audio(label="Áudio de Referência", type="filepath")
gen_text_input = gr.Textbox(label="Texto para Gerar", lines=10)
generate_btn = gr.Button("Sintetizar", variant="primary")
# Container para as sentenças
with gr.Row(elem_id="sentences-container"):
sentences_output = gr.Textbox(label="Sentenças", lines=10, interactive=False)
with gr.Accordion("Configurações Avançadas", open=False):
ref_text_input = gr.Textbox(
label="Texto de Referência",
info="Deixe em branco para transcrever automaticamente o áudio de referência. Se você inserir texto, ele substituirá a transcrição automática.",
lines=2,
)
remove_silence = gr.Checkbox(
label="Remover Silêncios",
info="O modelo tende a produzir silêncios, especialmente em áudios mais longos. Podemos remover manualmente os silêncios, se necessário. Isso também aumentará o tempo de geração.",
value=False,
)
speed_slider = gr.Slider(
label="Velocidade",
minimum=0.3,
maximum=2.0,
value=1.0,
step=0.1,
info="Ajuste a velocidade do áudio.",
)
cross_fade_duration_slider = gr.Slider(
label="Duração do Cross-fade (s)",
minimum=0.0,
maximum=1.0,
value=0.15,
step=0.01,
info="Defina a duração do cross-fade entre os clipes de áudio.",
)
audio_output = gr.Audio(label="Áudio Sintetizado")
spectrogram_output = gr.Image(label="Espectrograma")
analyzer = SentenceAnalyzer()
@gpu_decorator
def basic_tts(
ref_audio_input,
ref_text_input,
gen_text_input,
remove_silence,
cross_fade_duration_slider,
speed_slider,
):
                # Split the text into sentences
                sentences = analyzer.split_into_sentences(gen_text_input)

                # Format the sentences as .sentence-box divs; the markup is returned
                # below so Gradio updates the sentences_output component
                formatted_sentences = "".join(
                    f'<div class="sentence-box">{sentence}</div>' for sentence in sentences
                )
                # Generate audio for each sentence individually
                audio_segments = []
                spectrogram_path = None
                ref_text_out = ref_text_input
                for sentence in sentences:
                    audio_out, spectrogram_path, ref_text_out = infer(
                        ref_audio_input,
                        ref_text_out,  # reuse the transcription from the first pass
                        sentence,  # generate audio for the current sentence
                        remove_silence,
                        cross_fade_duration_slider,
                        speed_slider,
                    )
                    sr, audio_data = audio_out
                    audio_segments.append(audio_data)
                # Concatenate the per-sentence audio segments
                if audio_segments:
                    final_audio_data = np.concatenate(audio_segments)
                    return (sr, final_audio_data), spectrogram_path, gr.update(value=ref_text_out), formatted_sentences
                else:
                    gr.Warning("No audio was generated.")
                    return None, None, gr.update(value=ref_text_out), formatted_sentences
generate_btn.click(
basic_tts,
inputs=[
ref_audio_input,
ref_text_input,
gen_text_input,
remove_silence,
cross_fade_duration_slider,
speed_slider,
],
                outputs=[audio_output, spectrogram_output, ref_text_input, sentences_output],
)
with gr.Tab("Multi-Speech"):
gr.Markdown("# Geração Multi-Speech com F5-TTS")
# Regular speech type (mandatory)
with gr.Row():
with gr.Column():
regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
regular_insert = gr.Button("Insert Label", variant="secondary")
regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
            # Additional speech types (up to max_speech_types total, including Regular)
            max_speech_types = 100
            speech_type_rows = []  # 99 hidden rows for the additional types
            speech_type_names = [regular_name]  # 100 name textboxes
            speech_type_audios = [regular_audio]  # 100 audio inputs
            speech_type_ref_texts = [regular_ref_text]  # 100 reference-text boxes
            speech_type_delete_btns = []  # 99 delete buttons (Regular cannot be deleted)
            speech_type_insert_btns = [regular_insert]  # 100 insert buttons
# Additional speech types (99 more)
for i in range(max_speech_types - 1):
with gr.Row(visible=False) as row:
with gr.Column():
name_input = gr.Textbox(label="Speech Type Name")
delete_btn = gr.Button("Delete Type", variant="secondary")
insert_btn = gr.Button("Insert Label", variant="secondary")
audio_input = gr.Audio(label="Reference Audio", type="filepath")
ref_text_input = gr.Textbox(label="Reference Text", lines=2)
speech_type_rows.append(row)
speech_type_names.append(name_input)
speech_type_audios.append(audio_input)
speech_type_ref_texts.append(ref_text_input)
speech_type_delete_btns.append(delete_btn)
speech_type_insert_btns.append(insert_btn)
# Button to add speech type
add_speech_type_btn = gr.Button("Add Speech Type")
# Keep track of current number of speech types
speech_type_count = gr.State(value=1)
# Function to add a speech type
def add_speech_type_fn(speech_type_count):
if speech_type_count < max_speech_types:
speech_type_count += 1
# Prepare updates for the rows
row_updates = []
for i in range(1, max_speech_types):
if i < speech_type_count:
row_updates.append(gr.update(visible=True))
else:
row_updates.append(gr.update())
                else:
                    # Show a warning when the limit is reached
                    gr.Warning("Maximum number of speech types reached.")
                    row_updates = [gr.update() for _ in range(1, max_speech_types)]
return [speech_type_count] + row_updates
add_speech_type_btn.click(
add_speech_type_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows
)
            # Function to delete a speech type
            def make_delete_speech_type_fn(index):
                def delete_speech_type_fn(speech_type_count):
                    # Prepare updates: hide the deleted row, leave the rest unchanged
                    row_updates = []
                    for i in range(1, max_speech_types):
                        if i == index:
                            row_updates.append(gr.update(visible=False))
                        else:
                            row_updates.append(gr.update())
                    # Decrement the count, but never below the mandatory Regular type
                    speech_type_count = max(1, speech_type_count - 1)
                    return [speech_type_count] + row_updates

                return delete_speech_type_fn
            # Wire up the delete buttons. Button i belongs to additional row i, which is
            # index i + 1 in the update loop above (index 0 is the mandatory Regular type).
            for i, delete_btn in enumerate(speech_type_delete_btns):
                delete_fn = make_delete_speech_type_fn(i + 1)
                delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
# Text input for the prompt
gen_text_input_multistyle = gr.Textbox(
label="Text to Generate",
lines=10,
placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
)
def make_insert_speech_type_fn(index):
def insert_speech_type_fn(current_text, speech_type_name):
current_text = current_text or ""
speech_type_name = speech_type_name or "None"
updated_text = current_text + f"{{{speech_type_name}}} "
return gr.update(value=updated_text)
return insert_speech_type_fn
for i, insert_btn in enumerate(speech_type_insert_btns):
insert_fn = make_insert_speech_type_fn(i)
insert_btn.click(
insert_fn,
inputs=[gen_text_input_multistyle, speech_type_names[i]],
outputs=gen_text_input_multistyle,
)
with gr.Accordion("Advanced Settings", open=False):
remove_silence_multistyle = gr.Checkbox(
label="Remove Silences",
value=True,
)
# Generate button
generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
# Output audio
audio_output_multistyle = gr.Audio(label="Synthesized Audio")
@gpu_decorator
def generate_multistyle_speech(
gen_text,
*args,
):
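                # args is the flat input list wired up in generate_multistyle_btn.click below:
                # 100 names, then 100 audios, then 100 reference texts, then the remove-silence flag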
speech_type_names_list = args[:max_speech_types]
speech_type_audios_list = args[max_speech_types : 2 * max_speech_types]
speech_type_ref_texts_list = args[2 * max_speech_types : 3 * max_speech_types]
remove_silence = args[3 * max_speech_types]
# Collect the speech types and their audios into a dict
speech_types = OrderedDict()
ref_text_idx = 0
for name_input, audio_input, ref_text_input in zip(
speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list
):
if name_input and audio_input:
speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text_input}
else:
speech_types[f"@{ref_text_idx}@"] = {"audio": "", "ref_text": ""}
ref_text_idx += 1
# Parse the gen_text into segments
segments = parse_speechtypes_text(gen_text)
# For each segment, generate speech
generated_audio_segments = []
current_style = "Regular"
for segment in segments:
style = segment["style"]
text = segment["text"]
if style in speech_types:
current_style = style
else:
# If style not available, default to Regular
current_style = "Regular"
ref_audio = speech_types[current_style]["audio"]
ref_text = speech_types[current_style].get("ref_text", "")
# Generate speech for this segment
audio_out, _, ref_text_out = infer(
ref_audio, ref_text, text, remove_silence, 0, show_info=print
                )  # show_info=print keeps Gradio from scrolling the page to the top while generating
sr, audio_data = audio_out
generated_audio_segments.append(audio_data)
speech_types[current_style]["ref_text"] = ref_text_out
# Concatenate all audio segments
if generated_audio_segments:
final_audio_data = np.concatenate(generated_audio_segments)
return [(sr, final_audio_data)] + [
gr.update(value=speech_types[style]["ref_text"]) for style in speech_types
]
else:
gr.Warning("No audio generated.")
return [None] + [gr.update(value=speech_types[style]["ref_text"]) for style in speech_types]
generate_multistyle_btn.click(
generate_multistyle_speech,
inputs=[
gen_text_input_multistyle,
]
+ speech_type_names
+ speech_type_audios
+ speech_type_ref_texts
+ [
remove_silence_multistyle,
],
outputs=[audio_output_multistyle] + speech_type_ref_texts,
)
# Validation function to disable Generate button if speech types are missing
def validate_speech_types(gen_text, regular_name, *args):
speech_type_names_list = args[:max_speech_types]
# Collect the speech types names
speech_types_available = set()
if regular_name:
speech_types_available.add(regular_name)
for name_input in speech_type_names_list:
if name_input:
speech_types_available.add(name_input)
# Parse the gen_text to get the speech types used
segments = parse_speechtypes_text(gen_text)
speech_types_in_text = set(segment["style"] for segment in segments)
# Check if all speech types in text are available
missing_speech_types = speech_types_in_text - speech_types_available
if missing_speech_types:
# Disable the generate button
return gr.update(interactive=False)
else:
# Enable the generate button
return gr.update(interactive=True)
gen_text_input_multistyle.change(
validate_speech_types,
inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
outputs=generate_multistyle_btn,
)
@click.command()
@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
@click.option("--host", "-H", default=None, help="Host to run the app on")
@click.option(
"--share",
"-s",
default=False,
is_flag=True,
help="Share the app via Gradio share link",
)
@click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
def main(port, host, share, api):
global app
print("Starting app...")
app.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
if __name__ == "__main__":
if not USING_SPACES:
main()
else:
app.queue().launch()