# https://raw.githubusercontent.com/rohan-paul/LLM-FineTuning-Large-Language-Models/refs/heads/main/Mixtral_Chatbot_with_Gradio/Mixtral_Chatbot_with_Gradio.py
from threading import Thread

import gradio as gr
import torch
import transformers
from transformers import AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer

# Run the entire app with `python run_mixtral.py`

"""
The messages list should be of the following format:

messages = [
    {"role": "user", "content": "User's first message"},
    {"role": "assistant", "content": "Assistant's first response"},
    {"role": "user", "content": "User's second message"},
    {"role": "assistant", "content": "Assistant's second response"},
    {"role": "user", "content": "User's third message"},
]
"""


def format_chat_history(history, tokenizer) -> str:
    """Format the dialogue history into a prompt for the Mixtral model.

    `history` is a list of [user_message, assistant_message] pairs as produced by
    `gr.Chatbot`; the assistant slot of the latest pair is still empty. The
    rendered prompt gives the model the full conversation context so it can
    generate an appropriate next response.
    """
    # Mixtral-8x7B-Instruct's chat template only accepts strictly alternating
    # user/assistant roles (it rejects a separate "system" role), so the system
    # instruction is folded into the first user message instead.
    system_prompt = "You are a helpful assistant."
    messages = []
    for i, (user_msg, assistant_msg) in enumerate(history):
        if user_msg:
            content = f"{system_prompt}\n\n{user_msg}" if i == 0 else user_msg
            messages.append({"role": "user", "content": content})
        if assistant_msg:  # The latest pair has no assistant message yet
            messages.append({"role": "assistant", "content": assistant_msg})
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )


def model_loading_pipeline():
    model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # skip_prompt=True so only newly generated tokens are streamed to the UI.
    # Raise the timeout if the first token takes longer on your hardware.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, timeout=5.0)

    pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        # Quantize to 4-bit; use BitsAndBytesConfig(load_in_8bit=True) instead
        # if you prefer 8-bit.
        model_kwargs={"quantization_config": BitsAndBytesConfig(load_in_4bit=True)},
        device_map="auto",  # Automatically determine the best device placement
        streamer=streamer,
    )
    return pipeline, streamer


def launch_gradio_app(pipeline, streamer):
    with gr.Blocks() as demo:
        chatbot = gr.Chatbot()
        msg = gr.Textbox()
        clear = gr.Button("Clear")

        def user(user_message, history):
            # Append the new user message with an empty assistant slot
            return "", history + [[user_message, None]]

        def bot(history):
            prompt = format_chat_history(history, pipeline.tokenizer)
            history[-1][1] = ""
            kwargs = dict(
                text_inputs=prompt,
                max_new_tokens=2048,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
            )
            # Run generation in a background thread and stream tokens as they arrive
            thread = Thread(target=pipeline, kwargs=kwargs)
            thread.start()

            for token in streamer:
                history[-1][1] += token
                yield history

        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
            bot, chatbot, chatbot
        )
        clear.click(lambda: None, None, chatbot, queue=False)

    demo.queue()
    demo.launch(share=True, debug=True)


if __name__ == "__main__":
    pipeline, streamer = model_loading_pipeline()
    launch_gradio_app(pipeline, streamer)
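
# For reference, with the Mixtral-8x7B-Instruct chat template a two-turn history
# such as
#   [["Hello!", "Hi, how can I help?"], ["Tell me a joke.", None]]
# should be rendered by format_chat_history into roughly the following prompt
# (exact spacing and special tokens depend on the tokenizer version):
#
#   <s>[INST] You are a helpful assistant.
#
#   Hello! [/INST]Hi, how can I help?</s>[INST] Tell me a joke. [/INST]
#
# A quick way to inspect the rendered prompt without loading the full model is
# to use only the tokenizer:
#
#   tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
#   print(format_chat_history([["Tell me a joke.", None]], tokenizer))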