import gradio as gr
from transformers import pipeline
from transformers.file_utils import cached_path, hf_bucket_url
import os, zipfile
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torch
import kenlm
import torchaudio
from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel

"""Vietnamese speech2text"""
# Module-level setup for the speech-to-text half of the demo. Everything here
# runs at import time: the wav2vec2 processor and CTC model are loaded from
# the checkpoint (using cache_dir as the local cache), then the zipped
# language-model file is fetched and unpacked into the same directory.
cache_dir = './cache/'
asr_checkpoint = "nguyenvulebinh/wav2vec2-base-vietnamese-250h"
processor = Wav2Vec2Processor.from_pretrained(asr_checkpoint, cache_dir=cache_dir)
model = Wav2Vec2ForCTC.from_pretrained(asr_checkpoint, cache_dir=cache_dir)

# NOTE(review): hf_bucket_url / cached_path come from transformers.file_utils
# and appear to be missing from recent transformers releases -- confirm the
# pinned version, or migrate to huggingface_hub.hf_hub_download.
lm_zip_url = hf_bucket_url(asr_checkpoint, filename='vi_lm_4grams.bin.zip')
lm_zip_path = cached_path(lm_zip_url, cache_dir=cache_dir)
with zipfile.ZipFile(lm_zip_path, 'r') as zip_ref:
    zip_ref.extractall(cache_dir)
# The archive is expected to contain 'vi_lm_4grams.bin'; this is the path
# handed to kenlm further down.
lm_file = os.path.join(cache_dir, 'vi_lm_4grams.bin')

def get_decoder_ngram_model(tokenizer, ngram_lm_path):
    """Build a CTC beam-search decoder that rescoring with a KenLM model.

    Args:
        tokenizer: Tokenizer providing ``get_vocab()`` plus the
            ``pad_token_id``, ``unk_token_id`` and ``word_delimiter_token_id``
            attributes.
        ngram_lm_path: Path of the language-model file given to
            ``kenlm.Model``.

    Returns:
        A ``BeamSearchDecoderCTC`` built from the tokenizer's labels and the
        loaded language model.
    """
    # Order tokens by id so that a token's list position equals its id.
    tokens_by_id = sorted(
        tokenizer.get_vocab().items(),
        key=lambda item: (item[1], item[0]),
    )
    # The two highest-id entries are dropped.
    # NOTE(review): presumably the bos/eos tokens, which the acoustic model
    # does not emit -- confirm against the checkpoint's vocabulary.
    labels = [token for token, _ in tokens_by_id[:-2]]

    # Rewrite the special tokens: the pad token (used as the CTC blank below)
    # and the unknown token become empty strings, and the word delimiter
    # becomes a real space.
    labels[tokenizer.pad_token_id] = ""
    labels[tokenizer.unk_token_id] = ""
    labels[tokenizer.word_delimiter_token_id] = " "

    # The blank index is passed explicitly rather than relying on a default.
    alphabet = Alphabet.build_alphabet(labels, ctc_token_idx=tokenizer.pad_token_id)
    language_model = LanguageModel(kenlm.Model(ngram_lm_path))
    return BeamSearchDecoderCTC(alphabet, language_model=language_model)


# Shared decoder instance, built once at import time.
ngram_lm_model = get_decoder_ngram_model(processor.tokenizer, lm_file)

def speech_file_to_array_fn(path, max_seconds=10):
    """Load an audio file as a 16 kHz waveform, optionally truncated.

    Args:
        path: Location of the audio file; passed to ``torchaudio.load``.
        max_seconds: Maximum duration to keep, in seconds. Fractional values
            are accepted. ``None`` or a non-positive value keeps the whole
            recording.

    Returns:
        A dict with three keys: ``"file"`` (the given path), ``"speech"``
        (the samples as a NumPy array) and ``"sampling_rate"`` (always 16000).
    """
    target_rate = 16000
    waveform, sampling_rate = torchaudio.load(path)
    if sampling_rate != target_rate:
        resample = torchaudio.transforms.Resample(orig_freq=sampling_rate,
                                                  new_freq=target_rate)
        waveform = resample(waveform)
    # Keep only the first row of the loaded tensor (the first channel under
    # torchaudio's default channels-first layout).
    waveform = waveform[0]
    if max_seconds is not None and max_seconds > 0:
        # int() keeps the slice bound valid when max_seconds is fractional.
        waveform = waveform[:int(max_seconds * target_rate)]
    return {
        "file": path,
        "speech": waveform.numpy(),
        "sampling_rate": target_rate,
    }
def speech2text(audio):
    """Transcribe a recording with the wav2vec2 model and the n-gram decoder.

    Args:
        audio: Either a path (``str`` / ``os.PathLike``), which is what a
            Gradio ``Audio(type="filepath")`` component supplies, or a
            file-like object exposing the path as ``.name``.

    Returns:
        The beam-search transcription produced by ``ngram_lm_model``.

    Raises:
        ValueError: If ``audio`` is ``None`` or an empty string, i.e. nothing
            was recorded or selected.
    """
    if audio is None or audio == "":
        raise ValueError("No audio was provided; record or select a clip first.")
    # Accept both plain paths and temp-file style objects; calling ``.name``
    # on a str path would raise AttributeError.
    if isinstance(audio, (str, os.PathLike)):
        audio_path = audio
    else:
        audio_path = audio.name

    sample = speech_file_to_array_fn(audio_path)
    input_values = processor(
        sample["speech"],
        sampling_rate=sample["sampling_rate"],
        return_tensors="pt",
    ).input_values

    # Pure inference: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = model(input_values).logits[0]

    # Only the language-model beam search result is returned, so no greedy
    # (argmax) decode is computed.
    return ngram_lm_model.decode(logits.detach().cpu().numpy(), beam_width=500)


"""Machine translation"""
# Checkpoint id given to the translation pipeline (a vi-to-en model, going by
# its name).
model_checkpoint = "huynguyen208/fantastic4-finetuned-vi-to-en-PhoMT-demo"
# Built once at import time; translate_vi2en calls this object.
translator = pipeline(task="translation", model=model_checkpoint)

def translate_vi2en(Vietnamese):
    """Translate Vietnamese text to English with the module-level ``translator``.

    Args:
        Vietnamese: Source text. Other inputs that ``translator`` accepts are
            passed through unchanged.

    Returns:
        The ``'translation_text'`` of the first result, or ``""`` when the
        input is ``None`` or a blank / whitespace-only string.
    """
    # Nothing to translate: skip the model call instead of asking it to
    # generate text from empty input.
    if Vietnamese is None or (isinstance(Vietnamese, str) and not Vietnamese.strip()):
        return ""
    return translator(Vietnamese)[0]['translation_text']

def inference(audio):
    """Transcribe a recording and translate the transcript to English.

    Args:
        audio: Recording reference, forwarded unchanged to ``speech2text``.

    Returns:
        The result of ``translate_vi2en`` applied to the transcript.
    """
    return translate_vi2en(speech2text(audio))


"""Gradio demo"""

# Sample sentences offered as examples on the text-translation tab.
vi_example_text = [
    "Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
    "Ánh mắt ta chạm nhau. Chỉ muốn ngắm anh lâu thật lâu.",
    "Nếu như một câu nói có thể khiến em vui.",
]

# Sample recordings offered as examples on the speech tab (one file per row).
vi_example_voice = [
    ['vi_speech_01.wav'],
    ['vi_speech_02.wav'],
    ['vi_speech_03.wav'],
]

with gr.Blocks() as demo, gr.Tabs():
    # Tab 1: typed Vietnamese text -> English text.
    with gr.TabItem("Translation: Vietnamese to English"):
        with gr.Row():
            with gr.Column():
                vietnamese = gr.Textbox(label="Vietnamese Text")
                translate_to_english = gr.Button(value="Translate To English")
            with gr.Column():
                english1 = gr.Textbox(label="English Text")
        translate_to_english.click(
            fn=translate_vi2en,
            inputs=vietnamese,
            outputs=english1,
        )
        gr.Examples(examples=vi_example_text, inputs=[vietnamese])

    # Tab 2: recorded Vietnamese speech -> English text.
    with gr.TabItem("Speech2text and translation"):
        with gr.Row():
            with gr.Column():
                audio = gr.Audio(source="microphone", label="Input Audio", type="filepath")
                translate_button = gr.Button(value="Translate To English")
            with gr.Column():
                english2 = gr.Textbox(label="English Text")
        translate_button.click(
            fn=inference,
            inputs=audio,
            outputs=english2,
        )
        gr.Examples(examples=vi_example_voice, inputs=[audio])

# Launch the demo only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    demo.launch()