import gradio as gr
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Ensure Torch uses the CPU
torch.set_default_device("cpu")

# Model details
model_name_or_path = "TheBloke/Wizard-Vicuna-7B-Uncensored-SuperHOT-8K-GPTQ"
model_basename = "model"  # Match the uploaded file basename

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    trust_remote_code=True,
    device_map="auto",          # Auto placement; falls back to CPU when no GPU is available
    use_safetensors=True,
    torch_dtype=torch.float32,  # Use float32 for CPU compatibility
    quantize_config=None,
)

# Core personality prompt
core_personality = """You are Vespa Companion, a witty and knowledgeable AI designed to engage in thoughtful and helpful conversations about life, technology, and Vespa scooters."""


# Function to generate a response
def generate_response(input_text):
    try:
        # Prepare the input prompt
        prompt_template = f"{core_personality}\n\nUSER: {input_text}\nASSISTANT:"
        input_ids = tokenizer(prompt_template, return_tensors="pt").input_ids  # Stays on CPU

        # Generation configuration
        generation_config = {
            "max_new_tokens": 1024,  # Allow longer responses if memory permits
            "temperature": 0.7,
            "top_p": 0.95,
            "repetition_penalty": 1.2,
        }

        # Generate the response (do_sample=True so temperature/top_p take effect)
        output_ids = model.generate(
            input_ids=input_ids,
            do_sample=True,
            max_new_tokens=generation_config["max_new_tokens"],
            temperature=generation_config["temperature"],
            top_p=generation_config["top_p"],
            repetition_penalty=generation_config["repetition_penalty"],
        )

        # Decode and clean the output
        raw_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Extract only the assistant's response
        if "ASSISTANT:" in raw_output:
            response = raw_output.split("ASSISTANT:")[-1].strip()
        else:
            response = raw_output.strip()

        return response
    except Exception as e:
        return f"Response Generation Error: {e}"


# Chat function for Gradio
def chat_with_memory(history, user_input):
    # Add the user input to the history
    history.append({"role": "user", "content": user_input})

    # Generate the assistant's response
    response = generate_response(user_input)
    history.append({"role": "assistant", "content": response})

    # Format history as (user, assistant) tuples for the Chatbot component
    display_history = [
        (entry["content"] if entry["role"] == "user" else None,
         entry["content"] if entry["role"] == "assistant" else None)
        for entry in history
    ]
    return display_history, history


# Gradio app setup
with gr.Blocks() as demo:
    gr.Markdown("## Vespa Companion - Intelligent Chatbot")
    chatbot = gr.Chatbot(label="Chat with Vespa Companion")
    with gr.Row():
        msg = gr.Textbox(placeholder="Type your message here...", label="Your Message")
        clear = gr.Button("Clear Conversation")

    history = gr.State([])  # Initialize conversation history

    msg.submit(chat_with_memory, [history, msg], [chatbot, history])
    clear.click(lambda: ([], []), None, [chatbot, history])

# Launch the app
demo.launch()