import os
import sys
import time

import gradio as gr
import librosa
import torch
import torchaudio
import transformers
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor

# Config
model_name = "Yehor/w2v-bert-2.0-uk-v2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Half precision is only used on CUDA. On CPU fall back to float32: float16
# kernels there are slow and, depending on the operator and PyTorch version,
# may not be implemented at all.
torch_dtype = torch.float16 if device.type == "cuda" else torch.float32

# Accepted audio length in seconds (enforced in `inference`)
min_duration = 0.5
max_duration = 60

# Passed as `concurrency_limit` to the Recognize button's click handler
concurrency_limit = 1
# When True, the loaded model is wrapped with `torch.compile`
use_torch_compile = False

# Load the processor and the CTC model, then place the model on the target device
processor = Wav2Vec2BertProcessor.from_pretrained(model_name)

asr_model = AutoModelForCTC.from_pretrained(model_name, torch_dtype=torch_dtype)
asr_model = asr_model.to(device)

# Optional compilation, controlled by the `use_torch_compile` switch
if use_torch_compile:
    asr_model = torch.compile(asr_model)

# Elements
# Sample recordings offered by the "Choose an example" widget
examples = [f"example_{index}.wav" for index in range(1, 7)]

# Markdown table rendered below the examples widget: one row per example file
# with the text associated with it (presumably its transcription - confirm).
# The literal is runtime text; keep it verbatim.
examples_table = '''
| File  | Text |
| ------------- | ------------- |
| `example_1.wav`  | тема про яку не люблять говорити офіційні джерела у генштабі і міноборони це хімічна зброя окупанти вже тривалий час використовують хімічну зброю заборонену |
| `example_2.wav`  | всіма конвенціями якщо спочатку це були гранати з дронів то тепер фіксують випадки застосування  |
| `example_3.wav`  | хімічних снарядів причому склад отруйної речовони різний а отже й наслідки для наших військових теж різні  |
| `example_4.wav`  | використовує на фронті все що має і хімічна зброя не нийняток тож з чим маємо справу розбиралася марія моганисян |
| `example_5.wav`  | двох тисяч випадків застосування росіянами боєприпасів споряджених небезпечними хімічними речовинами |
| `example_6.wav`  | на всі писані норми марія моганисян олександр моторний спецкор марафон єдині новини |
'''.strip()

# "Authors" section (Markdown) that is interpolated at the end of `description_foot`.
# Markdown table tool: https://www.tablesgenerator.com/markdown_tables
authors_table = '''
## Authors

Follow them in social networks and **contact** if you need any help or have any questions:

| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
|-------------------------------------------------------------------------------------------------|
| https://t.me/smlkw in Telegram                                                                  |
| https://x.com/yehor_smoliakov at X                                                              |
| https://github.com/egorsmkv at GitHub                                                           |
| https://huggingface.co/Yehor at Hugging Face                                                    |
| or use egorsmkv@gmail.com                                                                       |
'''.strip()

# Page header (Markdown): title, overview and the demo duration limit
description_head = "\n".join(
    [
        "# Speech-to-Text for Ukrainian v2",
        "",
        "## Overview",
        "",
        "This space uses https://huggingface.co/Yehor/w2v-bert-2.0-uk-v2 model to recognize audio files.",
        "",
        f"> For demo, audio duration **must not** exceed **{max_duration}** seconds.",
    ]
).strip()

# Page footer (Markdown): community links, related models and the authors section
description_foot = "\n".join(
    [
        "## Community",
        "",
        "- Join our Discord server where we talk about AI/ML/DL: https://discord.gg/yVAjkBgmt4",
        "- Join our Speech Recognition group in Telegram: https://t.me/speech_recognition_uk",
        "",
        "## More",
        "",
        "Check out other ASR models: https://github.com/egorsmkv/speech-recognition-uk",
        "",
        authors_table,
    ]
).strip()

# Placeholder shown in the transcription panel before the first recognition
transcription_value = "\n\n".join(
    (
        "Recognized text will appear here.",
        "Choose **an example file** below the Recognize button, upload **your audio file**, or use **the microphone** to record something.",
    )
)

# Runtime environment summary (Markdown) displayed at the bottom of the page
tech_env = "\n".join(
    [
        "#### Environment",
        "",
        f"- Python: {sys.version}",
        f"- Torch device: {device}",
        f"- Torch dtype: {torch_dtype}",
        f"- Use torch.compile: {use_torch_compile}",
    ]
).strip()

# Library versions (Markdown) displayed at the bottom of the page.
# The Transformers row reads the version from the `transformers` package itself
# (it previously repeated the PyTorch version).
tech_libraries = f"""
#### Libraries

- PyTorch: {torch.__version__}
- Transformers: {transformers.__version__}
- Librosa: {librosa.version.version}
- Gradio: {gr.__version__}
""".strip()


def inference(audio_path, progress=gr.Progress()):
    """Transcribe one audio file and return a Markdown report.

    Args:
        audio_path: Path of the audio file to recognize (the ``gr.Audio``
            input is configured with ``type="filepath"``).
        progress: Gradio progress tracker. The ``gr.Progress()`` default is
            intentional: this is how a handler asks Gradio for a tracker.

    Returns:
        A Markdown string containing the file name, the recognized text, the
        audio duration in seconds and the real-time factor (processing time
        divided by audio duration).

    Raises:
        gr.Error: If no file was provided, or if the file duration is below
            ``min_duration`` or above ``max_duration``.
    """
    if not audio_path:
        raise gr.Error("Please upload an audio file.")

    gr.Info("Starting recognition", duration=2)

    progress(0, desc="Recognizing")

    # Validate the length before the audio is loaded and fed to the model.
    duration = librosa.get_duration(path=audio_path)
    if duration < min_duration:
        raise gr.Error(f"The duration of the file is less than {min_duration} seconds, it is {round(duration, 2)} seconds.")
    if duration > max_duration:
        raise gr.Error(f"The duration of the file exceeds {max_duration} seconds.")

    # Sampling rate used both for decoding the audio and for feature extraction.
    sampling_rate = 16_000

    paths = [
        audio_path,
    ]

    results = []

    for path in progress.tqdm(paths, desc="Recognizing...", unit="file"):
        # perf_counter is a monotonic clock meant for measuring intervals.
        started_at = time.perf_counter()

        audio_input, _ = librosa.load(path, mono=True, sr=sampling_rate)
        # Derive the duration from the decoded samples instead of probing the
        # file a second time.
        audio_duration = len(audio_input) / sampling_rate

        features = processor([audio_input], sampling_rate=sampling_rate).input_features
        # Match the model's device and dtype in one step; unlike a float16-only
        # special case this stays correct if `torch_dtype` is changed.
        features = torch.tensor(features).to(device=device, dtype=torch_dtype)

        with torch.inference_mode():
            logits = asr_model(features).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        predictions = processor.batch_decode(predicted_ids)

        # `batch_decode` yields one string per input, so "nothing recognized"
        # shows up as a blank string rather than an empty list.
        text = "\n".join(predictions)
        if not text.strip():
            text = "-"

        elapsed_time = time.perf_counter() - started_at

        results.append(
            {
                "path": os.path.basename(path),
                "transcription": text,
                "audio_duration": round(audio_duration, 2),
                # Computed from the unrounded elapsed time to keep precision.
                "rtf": round(elapsed_time / audio_duration, 4),
            }
        )

    gr.Info("Finished!", duration=2)

    result_texts = []

    for result in results:
        result_texts.extend(
            [
                f'**{result["path"]}**',
                "\n\n",
                f'> {result["transcription"]}',
                "\n\n",
                f'**Audio duration**: {result["audio_duration"]}',
                "\n",
                f'**Real-Time Factor**: {result["rtf"]}',
            ]
        )

    return "\n".join(result_texts)


# Page layout: header, demo widgets, examples, footer and technical details
with gr.Blocks(
    title="Speech-to-Text for Ukrainian",
    analytics_enabled=False,
) as demo:
    gr.Markdown(description_head)

    gr.Markdown("## Demo")

    # Audio input on the left, recognized text on the right
    with gr.Row():
        audio_file = gr.Audio(label="Audio file", type="filepath")
        transcription = gr.Markdown(
            label="Transcription",
            value=transcription_value,
        )

    # Clicking the button runs `inference` on the selected file
    recognize_button = gr.Button("Recognize")
    recognize_button.click(
        inference,
        inputs=audio_file,
        outputs=transcription,
        concurrency_limit=concurrency_limit,
    )

    with gr.Row():
        gr.Examples(label="Choose an example", inputs=audio_file, examples=examples)

    gr.Markdown(examples_table)

    gr.Markdown(description_foot)

    gr.Markdown('### Gradio app uses the following technologies:')
    with gr.Row():
        gr.Markdown(tech_env)
        gr.Markdown(tech_libraries)

# Script entry point: enable the request queue, then start the web server
if __name__ == "__main__":
    demo.queue().launch()