import gradio as gr
import os
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Force CPU-only execution
os.environ["LLAMA_CPP_USE_CUDA"] = "0"  # Ensure CUDA is disabled

# Application title and description
title = "Mistral-7B-Instruct-GGUF Run On CPU"
description = """
🔎 [Mistral AI's Mistral 7B Instruct v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) [GGUF format model](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF), a 4-bit quantized (balanced-quality) GGUF version, running on CPU. English only (other languages are supported, but with reduced quality). Using [GitHub - llama.cpp](https://github.com/ggerganov/llama.cpp).
"""

# Model setup
model_dir = "models"
model_name = "unsloth.Q4_K_M.gguf"
model_path = os.path.join(model_dir, model_name)

# Download the model if not already present
hf_hub_download(
    repo_id="vislupus/bulgarian-joke-master-gemma-2-2b-it-bnb-4bit-gguf",
    filename=model_name,
    local_dir=model_dir,
)

# Check that the model file exists
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model file not found at {model_path}")

# Load the model using llama_cpp
print("Loading the model...")
llm = Llama(model_path=model_path)
print("Model loaded successfully!")


# Define the function to generate responses
def generate_response(prompt, temperature=0.7, top_p=1.0, max_tokens=256):
    """
    Generate a response from the model.

    Args:
        prompt (str): The user's input prompt.
        temperature (float): Sampling temperature.
        top_p (float): Top-p sampling parameter.
        max_tokens (int): Maximum number of tokens to generate.

    Returns:
        str: The model's response.
    """
    try:
        response = llm(prompt, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
        return response["choices"][0]["text"].strip()
    except Exception as e:
        return f"Error generating response: {e}"


# Set up the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 Llama GGUF Model Chatbot")
    gr.Markdown(description)

    # Input box for the user prompt
    prompt_input = gr.Textbox(label="Your Prompt", placeholder="Type your message here...", lines=5)

    # Advanced settings
    with gr.Accordion("Advanced Settings", open=False):
        temperature = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=1.0, step=0.01, label="Top-p")
        max_tokens = gr.Slider(16, 512, value=256, step=16, label="Max Tokens")

    # Output box for the model's response
    response_output = gr.Textbox(label="Model Response", placeholder="The model's response will appear here...", lines=10)

    # Generate button
    generate_button = gr.Button("Generate Response")

    # Connect inputs and outputs
    generate_button.click(
        generate_response,
        inputs=[prompt_input, temperature, top_p, max_tokens],
        outputs=[response_output],
    )

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
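
# --- Optional CPU tuning (a minimal sketch, not part of the app above) ---
# llama-cpp-python exposes constructor parameters that often matter on
# CPU-only hosts. The values below are assumptions, not tuned for this model:
#
#     llm = Llama(
#         model_path=model_path,
#         n_ctx=2048,                 # context window; smaller values use less RAM
#         n_threads=os.cpu_count(),   # number of CPU threads used for generation
#         n_gpu_layers=0,             # keep all layers on the CPU
#     )
#
# For token-by-token output in the UI, calling llm(prompt, ..., stream=True)
# returns an iterator of chunks whose text is in chunk["choices"][0]["text"].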