import os

import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Force CPU-only inference for llama.cpp.
os.environ["LLAMA_CPP_USE_CUDA"] = "0"

title = "Gemma 2 2B - Bulgarian Joke Master - GGUF"
description = """
🔎 [Gemma 2 2B](https://huggingface.co/unsloth/gemma-2-2b-bnb-4bit) fine-tuned for Bulgarian jokes, running on CPU in GGUF format.

This model is fine-tuned for generating humorous content in Bulgarian, using the [llama.cpp library](https://github.com/ggerganov/llama.cpp). Even on CPU it can produce impressive results, although larger models may require more processing power.
"""

model_dir = "models"
model_name = "unsloth.Q4_K_M.gguf"
model_path = os.path.join(model_dir, model_name)

# Download the GGUF weights from the Hugging Face Hub into the local model directory.
hf_hub_download(
    repo_id="vislupus/bulgarian-joke-master-SmolLM2-135M-Instruct-bnb-4bit-gguf",
    filename=model_name,
    local_dir=model_dir,
)

if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model file not found at {model_path}")

print("Loading the model...")
llm = Llama(model_path=model_path)
print("Model loaded successfully!")


def generate_response(message, history, temperature=0.7, top_p=1.0, max_tokens=256):
    """Generate a response based on the user's message and the conversation history."""
    try:
        # Gradio's ChatInterface passes history as a list of (user, assistant) pairs;
        # rebuild it as a plain-text prompt with role markers.
        conversation = ""
        for user_msg, assistant_msg in history:
            conversation += f"user\n{user_msg}\n"
            if assistant_msg:
                conversation += f"assistant\n{assistant_msg}\n"
        conversation += f"user\n{message}\n"

        response = llm(conversation, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
        return response["choices"][0]["text"].strip()
    except Exception as e:
        return f"Error generating response: {e}"


if __name__ == "__main__":
    gguf_demo = gr.ChatInterface(
        generate_response,
        title=title,
        description=description,
    )
    gguf_demo.launch(share=True)

    # Release llama.cpp resources once the Gradio server stops.
    try:
        if llm:
            llm.close()
    except Exception as e:
        print(f"Error closing model: {e}")