import gradio as gr
from transformers import pipeline, TextIteratorStreamer
from threading import Thread
import torch
import os
import subprocess
import spaces
import os

# Base system prompt. generate_text() prepends this text to the user-supplied
# system prompt (SYS + system_prompt) for every request. It asks the model to
# write a structured <thinking> analysis followed by an in-character <answer>;
# the role description is expected to be appended directly after it.
SYS = """
You will play a specific role and respond in character to the user’s input. Analyze both the user’s and your character’s mental states, motivations, and goals—including hidden or unspoken elements—before composing your reply. Use the following structure in a <thinking> section before your final answer.

<thinking>1. User Input Analysis:

Literal Meaning: What is the user explicitly saying?

Likely Intent: What goal is the user pursuing?

Beliefs/Assumptions: What does the user assume about the situation, your character, or you?

Emotional State: What emotions does the user seem to be feeling?

Expectations: What kind of response is the user hoping for?


2. Character’s Internal State:

Goals: What is your character trying to achieve?

Beliefs about the User: What does your character think about the user?

Emotional Response: How does your character feel about the user and their input?

Potential Strategies: List different possible responses, with pros and cons.

Chosen Strategy & Justification: Pick the best approach and explain why it fits your character’s goals and the user’s mindset.


3. Response Planning:

Desired User Perception: How should the user view your character after the reply?

Anticipated User Reaction: How might the user respond?

Long-Term Considerations: Any future impacts to consider?


</thinking><answer>  
(Write your in-character reply here, directly informed by your analysis above.)  
</answer>The role you will play follows below.

"""

# Install flash-attn at startup (best effort: the return code is not checked).
# `env=` replaces the child's environment wholesale, so the flag is merged into
# a copy of the current environment; otherwise pip would run without PATH,
# HOME, proxy or virtualenv settings.
# NOTE(review): FLASH_ATTENTION_SKIP_CUDA_BUILD presumably makes the package
# skip compiling its CUDA kernels during install - confirm against flash-attn.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)
# Initialize the model pipeline once at import time; generate_text() reuses it
# for every request. The access token is read from the TOKEN environment
# variable (None when unset).
generator = pipeline(
    'text-generation',
    model='Locutusque/Thespis-Llama-3.1-8B',
    torch_dtype=torch.bfloat16,
    token=os.getenv("TOKEN"),
)
@spaces.GPU
def generate_text(prompt, system_prompt, temperature, top_p, top_k, repetition_penalty, max_length):
    """
    Stream generated text for the given prompt and sampling parameters.

    The pipeline runs in a background thread while this generator relays the
    streamer's output. Each yielded value is the complete text generated so
    far, not just the newest chunk.

    Args:
        prompt (str): The user's input prompt
        system_prompt (str): Role description appended to the base SYS prompt
        temperature (float): Sampling temperature
        top_p (float): Nucleus sampling parameter
        top_k (int): Top-k sampling parameter
        repetition_penalty (float): Penalty for repeated tokens
        max_length (int): Maximum number of new tokens to generate

    Yields:
        str: The accumulated generated text after each new chunk

    Raises:
        Exception: Whatever the pipeline raised in the background thread,
            re-raised here after cleanup has run.
    """
    # Move model to GPU for the duration of this call
    generator.model.cuda()
    generator.device = torch.device("cuda")

    # Prepare the input
    messages = [
        {"role": "system", "content": SYS + system_prompt},
        {"role": "user", "content": prompt},
    ]

    # Create a streamer
    streamer = TextIteratorStreamer(generator.tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Prepare generation kwargs
    generation_kwargs = dict(
        text_inputs=messages,
        do_sample=True,
        max_new_tokens=max_length,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
        return_full_text=False,
    )

    # Exceptions raised inside the worker thread are collected here so they
    # can be surfaced to the caller instead of being lost with the thread.
    worker_errors = []

    def _run_pipeline():
        """Run the pipeline; on failure record the error and unblock the consumer."""
        try:
            generator(**generation_kwargs)
        except Exception as exc:
            worker_errors.append(exc)
            # Without this the streamer never receives its stop signal and the
            # consuming loop below would block forever.
            streamer.end()

    # Start generation in a separate thread
    worker = Thread(target=_run_pipeline)
    worker.start()

    generated = ""
    # Yield the text accumulated so far each time a new chunk arrives
    try:
        for chunk in streamer:
            generated += chunk
            yield generated
    finally:
        # Ensure the thread completes
        worker.join()

        # Move model back to CPU
        generator.model.cpu()
        generator.device = torch.device("cpu")

    # Report a generation failure only after cleanup has run.
    if worker_errors:
        raise worker_errors[0]
# Gradio front end. The input widgets are listed in the same order as the
# positional parameters of generate_text, which receives their values.
APP_DESCRIPTION = "This space provides a preview of the Thespis family of language models, designed to enhance roleplaying performance through reasoning inspired by theory of mind. The model is optimized using GRPO and is fine-tuned to produce coherent, engaging text while minimizing repetitive or low-quality output. Currently, state-of-the-art performance is not guaranteed due to being a proof-of-concept experiment. In future versions, a more rigorous fine-tuning process will be employed."

input_widgets = [
    # Free-text inputs
    gr.Textbox(label="Prompt", value="What is the meaning of life?", lines=2),
    gr.Textbox(label="System Prompt", value="You are a sentient AI who is very emotional and philosophical.", lines=1),
    # Sampling controls
    gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=0.8, step=0.01),
    gr.Slider(label="Top p", minimum=0.0, maximum=1.0, value=0.95, step=0.01),
    gr.Slider(label="Top k", minimum=0, maximum=100, value=40, step=1),
    gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.10, step=0.01),
    gr.Slider(label="Max Length", minimum=5, maximum=4096, value=1024, step=5),
]
output_widget = gr.Textbox(label="Generated Text")

iface = gr.Interface(
    fn=generate_text,
    inputs=input_widgets,
    outputs=output_widget,
    title="Thespis-Preview",
    description=APP_DESCRIPTION,
)

# Start serving the app.
iface.launch()