import os

import emoji
import gradio as gr
import numpy as np
import ollama
import torch
from huggingface_hub import login
from transformers import pipeline

from TTS_models import *

login(token=os.getenv('HF_TOKEN'))

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Speech-to-text checkpoint (Whisper handles both transcription and translation)
STT_model_id = "openai/whisper-tiny"
# LLM served by the local Ollama instance
llm_model_id = "gemma2:2b"
# Coqui TTS model loaded through the XTTS wrapper from TTS_models
TTS_model_id = "tts_models/en/ljspeech/tacotron2-DDC_ph"

client = ollama.Client()

# Load the ASR pipeline once at start-up instead of re-creating it on every request.
asr_pipe = pipeline("automatic-speech-recognition", model=STT_model_id, device=device)

# Transformers fallback for when the Ollama server is unreachable.
# Note: transformers' pipeline() has no `stream` argument; streaming is only done via Ollama.
llmpipe = pipeline(
    "text-generation",
    model="google/gemma-2-2b-it",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device=device,
)


def translate(audio):
    """Translate spoken (French) audio to English text with Whisper."""
    outputs = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs={"task": "translate", "language": "fr"},
    )
    print(f'[translate] Translated {outputs} using {asr_pipe.model}')
    return outputs["text"]


def transcribe(audio):
    """Transcribe audio to text in its original language with Whisper."""
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"})
    print(f'[transcribe] Transcribed {outputs}')
    return outputs["text"]


def chatCompletion(text):
    """Ask the LLM for an answer, preferring Ollama and falling back to transformers."""
    messages = [
        {"role": "user", "content": "You are a helpful assistant. Answer in English only in text.\n\n" + text},
    ]
    try:
        # Probe the Ollama server; this raises if it is unreachable,
        # which routes us to the transformers fallback below.
        ollama.list()
        response = client.chat(
            model=llm_model_id,
            messages=messages,
            stream=True,
            options={
                'num_predict': 256,
                'temperature': 0.5,
                'low_vram': True,
            },
        )
        buffer = ""
        for chunk in response:
            buffer += chunk["message"]["content"]
        print(f'[chatCompletion] {buffer}')
        return buffer
    except Exception:
        outputs = llmpipe(messages, max_new_tokens=256)
        buffer = outputs[0]["generated_text"][-1]["content"].strip()
        print(f'[chatCompletion] {buffer}')
        return buffer


def synthesise(text):
    """Synthesise text to 16-bit PCM speech, replacing emojis the TTS model cannot voice."""
    text = emoji.replace_emoji(text, replace="!")
    synthesiser = XTTS(TTS_model_id)
    speech = synthesiser.synthesize(text)
    return (np.array(speech) * 32767).astype(np.int16)


def speech_to_speech_translation(audioMic, audioFile):
    """Translate the recorded or uploaded audio to English and voice the result."""
    audio = audioMic if audioMic is not None else audioFile
    translated_text = "Sorry, no audio was found."
    if audio is not None:
        translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    return (22050, synthesised_speech), translated_text


def speech_to_speech(audioMic, audioFile):
    """Transcribe the audio, ask the LLM for an answer, and voice that answer."""
    audio = audioMic if audioMic is not None else audioFile
    answer = "Sorry, no audio was found."
    if audio is not None:
        transcribed_text = transcribe(audio)
        answer = chatCompletion(transcribed_text)
        print(f'[speech_to_speech] Transcribed text {transcribed_text}')
        print(f'[speech_to_speech] LLM answer {answer}')
    synthesised_speech = synthesise(answer)
    return (22050, synthesised_speech), answer
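
# For reference, the end-to-end assistant flow chains the three helpers above
# (a sketch only; the file path is hypothetical):
#   text  = transcribe("question.wav")   # Whisper ASR
#   reply = chatCompletion(text)         # Ollama, or the transformers fallback
#   pcm   = synthesise(reply)            # Coqui TTS -> int16 PCM at 22050 Hz
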
with gr.Blocks() as demo:
    options = gr.WaveformOptions(sample_rate=22050)

    with gr.Tab("Instant Translation"):
        gr.Markdown(
            """
            # Translation of audio to audio

            The aim of this tab is to demonstrate the speech-to-speech translation capabilities of the
            [whisper-tiny](https://huggingface.co/openai/whisper-tiny) model. It uses:
            - [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to translate,
            - and a [Coqui TTS](https://github.com/coqui-ai/TTS) model (tacotron2-DDC) as a voice synthesizer.

            You can either record yourself or upload an audio file in the tabs below. This will translate to English.
            """)
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Tab("Record Audio"):
                    audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
                with gr.Tab("Upload Audio"):
                    audioFile = gr.Audio(sources="upload", waveform_options=options, type="filepath")
                transcribeBtn = gr.Button("Submit", size='lg')
            with gr.Column(scale=1):
                textOutput = gr.Textbox(label="Translated text")
                audioOutput = gr.Audio(waveform_options=options, type="numpy")

        transcribeBtn.click(
            fn=speech_to_speech_translation,
            inputs=[audioMic, audioFile],
            outputs=[audioOutput, textOutput],
            api_name="speech_translation",
        )

    with gr.Tab("Voice Assistant"):
        gr.Markdown(
            """
            # Voice Assistant

            This demo shows the possibilities for building your own voice assistant. It uses:
            - [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to transcribe,
            - the [ollama/gemma2:2b](https://ollama.com/library/gemma2:2b) model to generate the assistant's answer,
            - and a [Coqui TTS](https://github.com/coqui-ai/TTS) model (tacotron2-DDC) as a voice synthesizer.

            This means you need Ollama installed on your machine to use it (if the Ollama server is
            unreachable, the app falls back to running gemma-2-2b-it through transformers).
            You can either record yourself or upload an audio file in the tabs below.
            """)
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Tab("Record Audio"):
                    audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
                with gr.Tab("Upload Audio"):
                    audioFile = gr.Audio(sources="upload", waveform_options=options, type="filepath")
                transcribeBtn = gr.Button("Submit", size='lg')
            with gr.Column(scale=1):
                textOutput = gr.Textbox(label="Transcribed text")
                audioOutput = gr.Audio(waveform_options=options, type="numpy")

        transcribeBtn.click(
            fn=speech_to_speech,
            inputs=[audioMic, audioFile],
            outputs=[audioOutput, textOutput],
            api_name="voice_assistant",
        )

demo.launch(auth=("FM", "FlandersM4ke"))
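
# --- Usage notes (a sketch of the assumed setup, not executed code) ---
# The app expects:
#   * an HF_TOKEN environment variable for the Hugging Face login above,
#   * a local Ollama server with the model pulled:  ollama pull gemma2:2b
#   * the TTS_models module (the XTTS wrapper around Coqui TTS) on the PYTHONPATH.
# Then run the script directly (here assumed to be saved as app.py), and Gradio
# serves the UI behind the basic-auth credentials passed to demo.launch():
#   python app.py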