"""Gradio demo for the Thespis role-play model with streamed text generation."""

import os
import subprocess
import sys
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import TextIteratorStreamer, pipeline

# Reasoning scaffold prepended to every user-supplied system prompt.
# The value (including its leading space and trailing " \n") is kept exactly as
# the model was served with it; only the source layout is split for readability.
SYS = (
    " You will play a specific role and respond in character to the user’s input. "
    "Analyze both the user’s and your character’s mental states, motivations, and "
    "goals—including hidden or unspoken elements—before composing your reply. "
    "Use the following structure in a section before your final answer. "
    "1. User Input Analysis: "
    "Literal Meaning: What is the user explicitly saying? "
    "Likely Intent: What goal is the user pursuing? "
    "Beliefs/Assumptions: What does the user assume about the situation, your character, or you? "
    "Emotional State: What emotions does the user seem to be feeling? "
    "Expectations: What kind of response is the user hoping for? "
    "2. Character’s Internal State: "
    "Goals: What is your character trying to achieve? "
    "Beliefs about the User: What does your character think about the user? "
    "Emotional Response: How does your character feel about the user and their input? "
    "Potential Strategies: List different possible responses, with pros and cons. "
    "Chosen Strategy & Justification: Pick the best approach and explain why it fits "
    "your character’s goals and the user’s mindset. "
    "3. Response Planning: "
    "Desired User Perception: How should the user view your character after the reply? "
    "Anticipated User Reaction: How might the user respond? "
    "Long-Term Considerations: Any future impacts to consider? "
    "(Write your in-character reply here, directly informed by your analysis above.) "
    "The role you will play follows below. \n"
)

# Install flash-attn (best effort: a failed install must not stop the app).
# The child environment extends os.environ instead of replacing it, so PATH,
# HOME, proxy settings, etc. remain visible to pip; sys.executable guarantees
# the package lands in the interpreter that is running this script.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,
)

# Initialize the model pipeline
generator = pipeline(
    "text-generation",
    model="Locutusque/Thespis-Llama-3.1-8B",
    torch_dtype=torch.bfloat16,
    token=os.getenv("TOKEN"),
)


@spaces.GPU
def generate_text(prompt, system_prompt, temperature, top_p, top_k, repetition_penalty, max_length):
    """
    Stream text generated for the given prompt and sampling parameters.

    Args:
        prompt (str): The user's input prompt
        system_prompt (str): Role description appended to the built-in SYS scaffold
        temperature (float): Sampling temperature
        top_p (float): Nucleus sampling parameter
        top_k (int): Top-k sampling parameter
        repetition_penalty (float): Penalty for repeated tokens
        max_length (int): Maximum number of newly generated tokens

    Yields:
        str: The full text generated so far (cumulative, not just the newest chunk)

    Raises:
        Exception: Any error raised by the pipeline in the worker thread is
            re-raised here once cleanup has finished.
    """
    # Move model to GPU
    generator.model.cuda()
    generator.device = torch.device("cuda")

    worker = None
    failures = []
    try:
        # Prepare the input
        messages = [
            {"role": "system", "content": SYS + system_prompt},
            {"role": "user", "content": prompt},
        ]

        # Create a streamer
        streamer = TextIteratorStreamer(
            generator.tokenizer, skip_prompt=True, skip_special_tokens=True
        )

        # Prepare generation kwargs (sliders may deliver floats; these two must be ints)
        generation_kwargs = dict(
            text_inputs=messages,
            do_sample=True,
            max_new_tokens=int(max_length),
            temperature=temperature,
            top_p=top_p,
            top_k=int(top_k),
            repetition_penalty=repetition_penalty,
            streamer=streamer,
            return_full_text=False,
        )

        def _run_pipeline():
            # If generation fails, the streamer would never receive its stop
            # signal and the consumer loop below would block forever; record
            # the error and close the stream explicitly.
            try:
                generator(**generation_kwargs)
            except Exception as exc:  # re-raised in the caller's thread below
                failures.append(exc)
                streamer.end()

        # Start generation in a separate thread
        worker = Thread(target=_run_pipeline)
        worker.start()

        # Yield the accumulated text each time a new chunk arrives
        outputs = []
        for chunk in streamer:
            outputs.append(chunk)
            yield "".join(outputs)
    finally:
        # Ensure the thread completes before the model is moved
        if worker is not None:
            worker.join()
        # Move model back to CPU
        generator.model.cpu()
        generator.device = torch.device("cpu")

    # Surface a worker failure to the caller instead of silently truncating output
    if failures:
        raise failures[0]


# Create the Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Prompt", lines=2, value="What is the meaning of life?"),
        gr.Textbox(
            label="System Prompt",
            lines=1,
            value="You are a sentient AI who is very emotional and philosophical.",
        ),
        gr.Slider(minimum=0.1, maximum=2.0, step=0.01, value=0.8, label="Temperature"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.95, label="Top p"),
        gr.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.Slider(minimum=1.0, maximum=2.0, step=0.01, value=1.10, label="Repetition Penalty"),
        gr.Slider(minimum=5, maximum=4096, step=5, value=1024, label="Max Length"),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="Thespis-Preview",
    description=(
        "This space provides a preview of the Thespis family of language models, "
        "designed to enhance roleplaying performance through reasoning inspired by "
        "theory of mind. The model is optimized using GRPO and is fine-tuned to "
        "produce coherent, engaging text while minimizing repetitive or low-quality "
        "output. Currently, state-of-the-art performance is not guaranteed due to "
        "being a proof-of-concept experiment. In future versions, a more rigorous "
        "fine-tuning process will be employed."
    ),
)

iface.launch()