import os

import gradio as gr
import librosa
import numpy as np
import torch
from huggingface_hub import snapshot_download
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM

# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def parse_transcription_with_lm(logits):
    # Beam-search decoding with the bundled language model; batch_decode expects
    # the raw logits as a numpy array and returns an output object whose .text
    # attribute is a list of transcriptions.
    result = processor_with_LM.batch_decode(logits.cpu().numpy())
    return result.text[0]


def parse_transcription(logits):
    # Greedy CTC decoding: pick the most likely token at each time step.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)


# Download the private model snapshot; the TOKEN environment variable must hold
# a Hugging Face access token with read access to the repository.
file_path = snapshot_download(repo_id="shizukanabasho/North3", token=os.environ["TOKEN"])
processor = Wav2Vec2Processor.from_pretrained(file_path)
processor_with_LM = Wav2Vec2ProcessorWithLM.from_pretrained(file_path)
model = Wav2Vec2ForCTC.from_pretrained(file_path).to(device)


def process(audio, applyLM):
    try:
        if isinstance(audio, str):
            # Microphone input arrives as a file path; load it at 16 kHz mono.
            arr, _ = librosa.load(audio, sr=16000, mono=True)
        elif (
            isinstance(audio, tuple)
            and len(audio) == 2
            and isinstance(audio[0], int)
            and isinstance(audio[1], np.ndarray)
        ):
            # Uploaded audio arrives as a (sample_rate, samples) tuple; convert
            # the samples to float so librosa can resample them.
            arr = audio[1].astype(np.float32)
            if arr.ndim > 1:
                arr = np.mean(arr, axis=1)  # Down-mix stereo to mono.
            arr = librosa.resample(arr, orig_sr=audio[0], target_sr=16000)
        else:
            return "Invalid audio source"

        input_values = processor(arr, sampling_rate=16000, return_tensors="pt", padding=True)
        input_values = input_values.to(device)
        with torch.no_grad():
            logits = model(**input_values).logits

        if applyLM:
            return parse_transcription_with_lm(logits)
        return parse_transcription(logits)
    except Exception as e:
        return str(e)
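
# Optional smoke test of process() outside the Gradio UI. This is a minimal
# sketch, not part of the original app: the SMOKE_TEST environment variable and
# the example file path are assumptions for illustration. It exercises both
# input shapes the interfaces below produce: a file path (type="filepath") and
# a (sample_rate, samples) tuple (type="numpy").
if os.environ.get("SMOKE_TEST"):
    example = os.path.join(os.path.dirname(__file__), "examples/ไปตวย.mp3")
    print(process(example, applyLM=True))  # Microphone route: plain file path.
    samples, sr = librosa.load(example, sr=16000, mono=True)
    pcm16 = (samples * 32767).astype(np.int16)  # Mimic integer PCM from an upload.
    print(process((sr, pcm16), applyLM=False))  # Upload route: (rate, array) tuple.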

# Shared example rows; each row supplies values for both inputs (audio, Apply LM).
examples = [
    [os.path.join(os.path.dirname(__file__), "examples/บ่ต้องไปส่ง.mp3"), True],
    [os.path.join(os.path.dirname(__file__), "examples/ไปตวย.mp3"), True],
]

microphone_interface = gr.Interface(
    fn=process,
    inputs=[
        gr.Audio(label="Microphone", sources=["microphone"], type="filepath"),
        gr.Checkbox(label="Apply LM", value=True),
    ],
    outputs=gr.Textbox(label="Transcription:", lines=3),
    live=False,
    analytics_enabled=False,
    examples=examples,
    cache_examples=False,
)

upload_interface = gr.Interface(
    fn=process,
    inputs=[
        gr.Audio(label="Upload Speech", sources=["upload"], type="numpy"),
        gr.Checkbox(label="Apply LM", value=True),
    ],
    outputs=gr.Textbox(label="Transcription:", lines=3),
    analytics_enabled=False,
    examples=examples,
    cache_examples=False,
)

with gr.Blocks() as datasets:
    gr.Markdown("""
# Dataset Details
This research project develops a Northern-Central Thai parallel speech dataset composed of 80,000 audio files. The files are divided into two corpora, each containing 40,000 files.

- **Corpus 1**: Speech data from dialogues of daily life among northern Thai people.
- **Corpus 2**: Speech data from dialogues in the tourism domain between customers and service providers who are northern Thai people.

## Accessing the Dataset
If you are interested in accessing this dataset, please visit this link and contact us via email at ds.sci.cmu@gmail.com or phimphaka.t@cmu.ac.th for the password.

This research has received funding support from the NSRF via the Program Management Unit for Human Resources & Institutional Development, Research and Innovation [grant number B04G640073].
""")
    with gr.Row():
        gr.Image(
            value=os.path.join(os.path.dirname(__file__), "examples/y2_train_5row.png"),
            type="pil",
            label="Dataset Example",
        )
# Header shown above the tabs, rendered through gr.HTML; wrapped in HTML tags so
# the title and paragraphs do not collapse into a single line.
Instructions = """
<h1>Northern Thai Dialect Speech Recognition System</h1>
<p>Instructions: record speech in the Microphone tab or upload an audio file in
the Upload tab. Tick "Apply LM" to decode with the language model; the
transcription appears in the text box.</p>
<p>This research has received funding support from the NSRF via the Program
Management Unit for Human Resources &amp; Institutional Development, Research
and Innovation [grant number B04G640073].</p>
""" if __name__ == "__main__": with gr.Blocks(title="Northern Thai ASR") as demo: gr.HTML(Instructions) tabbed_interface = gr.TabbedInterface( [microphone_interface, upload_interface, datasets], tab_names=["Microphone", "Upload", "Dataset"], ) demo.launch()