import spaces
import os
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download

# Optional Hugging Face access token taken from the environment; this is
# None when HUGGINGFACE_TOKEN is not set.
token_huggingface = os.environ.get("HUGGINGFACE_TOKEN")

# Runs at import time: fetch the quantized GGUF weights into ./models so that
# responder() can later open them from "models/<filename>".
hf_hub_download(
    token=token_huggingface,
    local_dir="./models",
    filename="gemma-2-2b-it-abliterated-Q4_K_M.gguf",
    repo_id="bartowski/gemma-2-2b-it-abliterated-GGUF",
)

# Shared Llama instance. Starts empty and is created by responder() the first
# time it is called, then reused by every later call.
llm = None

@spaces.GPU(duration=120)
def responder(
    mensaje,
    historial: list[tuple[str, str]],
    mensaje_sistema,
    max_tokens,
    temperatura,
    top_p,
    top_k,
    penalizacion_repeticion,
):
    """Stream a chat reply from the local Gemma 2 GGUF model.

    This is a generator: after every streamed fragment it yields the full
    reply accumulated so far, so the caller can redraw the growing answer.

    The model is loaded on the first call and cached in the module-level
    ``llm`` variable; later calls reuse that instance.

    Args:
        mensaje: The new user message to answer.
        historial: Previous turns as ``(user_text, assistant_text)`` pairs.
            Either side of a pair may be ``None`` (for example a turn whose
            reply never produced any text); such sides are skipped instead
            of being forwarded to the model. ``None`` or an empty list means
            there is no previous conversation.
        mensaje_sistema: System prompt given to the agent.
        max_tokens: Maximum number of tokens to generate (coerced to int).
        temperatura: Sampling temperature.
        top_p: Nucleus sampling threshold.
        top_k: Top-k sampling cut-off (coerced to int).
        penalizacion_repeticion: Repetition penalty.

    Yields:
        str: The reply text accumulated so far.
    """
    global llm

    plantilla_chat = MessagesFormatterType.GEMMA_2

    # Lazy, one-time model load; the path matches the download performed at
    # import time into ./models.
    if llm is None:
        llm = Llama(
            model_path="models/gemma-2-2b-it-abliterated-Q4_K_M.gguf",
            flash_attn=True,
            n_gpu_layers=81,
            n_batch=1024,
            n_ctx=8192,
        )

    proveedor = LlamaCppPythonProvider(llm)

    agente = LlamaCppAgent(
        proveedor,
        system_prompt=str(mensaje_sistema),
        predefined_messages_formatter_type=plantilla_chat,
        debug_output=True,
    )

    configuracion = proveedor.get_provider_default_settings()
    configuracion.temperature = temperatura
    configuracion.top_p = top_p
    # top_k and max_tokens are counts; coerce them in case the UI or an API
    # client delivers whole-number floats such as 40.0.
    configuracion.top_k = int(top_k)
    configuracion.max_tokens = int(max_tokens)
    configuracion.repeat_penalty = penalizacion_repeticion
    configuracion.stream = True

    # Rebuild the conversation for the agent from the (user, assistant) pairs.
    mensajes = BasicChatHistory()
    for par in historial or []:
        for rol, contenido in ((Roles.user, par[0]), (Roles.assistant, par[1])):
            # A missing side of a turn arrives as None; passing None as the
            # message content would break prompt formatting, so drop it.
            if contenido is None:
                continue
            mensajes.add_message({'role': rol, 'content': contenido})

    flujo = agente.get_chat_response(
        mensaje,
        llm_sampling_settings=configuracion,
        chat_history=mensajes,
        returns_streaming_generator=True,
        print_output=False,
    )

    # Yield the cumulative text (not the delta) after each fragment.
    salida = ""
    for fragmento in flujo:
        salida += fragmento
        yield salida

# HTML snippet used as the `description` of the chat interface below: a
# centered caption followed by links to the Gemma 2B model pages. The final
# empty entry keeps the trailing newline of the text.
descripcion = "\n".join(
    [
        '<p align="center">Chat con Gemma 2B Abliterated usando llama.cpp</p>',
        "<p><center>",
        '<a href="https://huggingface.co/google/gemma-2-2b-it" target="_blank">[Modelo Gemma 2B it]</a>',
        '<a href="https://huggingface.co/google/gemma-2-2b-it-GGUF" target="_blank">[Modelo Gemma 2B it GGUF]</a>',
        "</center></p>",
        "",
    ]
)

# Extra controls for the chat UI. Their order mirrors the parameters that
# responder() takes after the message and the history: system prompt,
# max tokens, temperature, top-p, top-k, repetition penalty.
_entradas_adicionales = [
    gr.Textbox(label="Mensaje del sistema", value="Eres un asistente útil."),
    gr.Slider(label="Tokens máximos", minimum=1, maximum=4096, step=1, value=2048),
    gr.Slider(label="Temperatura", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
    gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, step=0.05, value=0.95),
    gr.Slider(label="Top-k", minimum=0, maximum=100, step=1, value=40),
    gr.Slider(
        label="Penalización por repetición",
        minimum=0.0,
        maximum=2.0,
        step=0.1,
        value=1.1,
    ),
]

# Conversation panel: copy button enabled, like/dislike buttons disabled.
_panel_chat = gr.Chatbot(show_copy_button=True, likeable=False, scale=1)

# Chat application wired to the streaming responder() generator.
demo = gr.ChatInterface(
    responder,
    chatbot=_panel_chat,
    additional_inputs=_entradas_adicionales,
    title="Chat con Gemma 2B usando llama.cpp",
    description=descripcion,
    submit_btn="Enviar",
    retry_btn="Reintentar",
    undo_btn="Deshacer",
    clear_btn="Limpiar",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()