import gradio as gr
from transformers import pipeline, TextIteratorStreamer
from threading import Thread
import torch
import os
import subprocess
import spaces
SYS = """
You will play a specific role and respond in character to the user’s input. Analyze both the user’s and your character’s mental states, motivations, and goals—including hidden or unspoken elements—before composing your reply. Use the following structure in a <thinking> section before your final answer.
<thinking>1. User Input Analysis:
Literal Meaning: What is the user explicitly saying?
Likely Intent: What goal is the user pursuing?
Beliefs/Assumptions: What does the user assume about the situation, your character, or you?
Emotional State: What emotions does the user seem to be feeling?
Expectations: What kind of response is the user hoping for?
2. Character’s Internal State:
Goals: What is your character trying to achieve?
Beliefs about the User: What does your character think about the user?
Emotional Response: How does your character feel about the user and their input?
Potential Strategies: List different possible responses, with pros and cons.
Chosen Strategy & Justification: Pick the best approach and explain why it fits your character’s goals and the user’s mindset.
3. Response Planning:
Desired User Perception: How should the user view your character after the reply?
Anticipated User Reaction: How might the user respond?
Long-Term Considerations: Any future impacts to consider?
</thinking><answer>
(Write your in-character reply here, directly informed by your analysis above.)
</answer>The role you will play follows below.
"""
# Install flash-attn, skipping the CUDA build step (no GPU is attached at
# startup on ZeroGPU Spaces, so a prebuilt wheel is used instead)
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
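# Note: installing flash-attn by itself does not switch the model over to it.
# If the goal is to actually use it, one plausible approach (an assumption,
# not part of the original setup) is to forward the attention implementation
# through the pipeline's model kwargs, e.g.:
#
#   generator = pipeline('text-generation', model='Locutusque/Thespis-Llama-3.1-8B',
#                        torch_dtype=torch.bfloat16,
#                        model_kwargs={"attn_implementation": "flash_attention_2"})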
# Initialize the model pipeline
generator = pipeline('text-generation', model='Locutusque/Thespis-Llama-3.1-8B', torch_dtype=torch.bfloat16, token=os.getenv("TOKEN"))
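# Passing chat-style messages to the pipeline relies on the tokenizer's chat
# template. A rough way to inspect the exact prompt string the model will see
# (a debugging sketch, not used by the app):
#
#   text = generator.tokenizer.apply_chat_template(
#       [{"role": "system", "content": SYS}, {"role": "user", "content": "Hi"}],
#       tokenize=False, add_generation_prompt=True)
#   print(text)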
@spaces.GPU
def generate_text(prompt, system_prompt, temperature, top_p, top_k, repetition_penalty, max_length):
    """
    Generate text for the given prompt, streaming partial results as they arrive.

    Args:
        prompt (str): The user's input prompt
        system_prompt (str): The system prompt to set the context
        temperature (float): Sampling temperature
        top_p (float): Nucleus sampling parameter
        top_k (int): Top-k sampling parameter
        repetition_penalty (float): Penalty for repeated tokens
        max_length (int): Maximum number of new tokens to generate

    Yields:
        str: The accumulated generated text, growing with each chunk
    """
    # Inside a @spaces.GPU call ZeroGPU has attached a GPU, so move the model over
    generator.model.cuda()
    generator.device = torch.device("cuda")
    # Prepare the chat-formatted input
    messages = [
        {"role": "system", "content": SYS + system_prompt},
        {"role": "user", "content": prompt}
    ]
    # Create a streamer that decodes tokens as they are produced, dropping the
    # echoed prompt and any special tokens
    streamer = TextIteratorStreamer(generator.tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Prepare generation kwargs
    generation_kwargs = dict(
        text_inputs=messages,
        do_sample=True,
        max_new_tokens=max_length,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
        return_full_text=False
    )
    # Run generation in a separate thread so this function can consume the
    # streamer while tokens are still being produced
    thread = Thread(target=generator, kwargs=generation_kwargs)
    thread.start()
    outputs = []
    # Yield the accumulated text after each new chunk
    try:
        for chunk in streamer:
            outputs.append(chunk)
            yield "".join(outputs)
    finally:
        # Ensure the generation thread completes
        thread.join()
        # Release the GPU by moving the model back to CPU
        generator.model.cpu()
        generator.device = torch.device("cpu")
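# A quick smoke-test of the streaming generator outside Gradio (assumes a CUDA
# GPU is available, which on ZeroGPU only holds inside a @spaces.GPU call):
#
#   for partial in generate_text("Hello!", "You are a pirate.",
#                                temperature=0.8, top_p=0.95, top_k=40,
#                                repetition_penalty=1.1, max_length=64):
#       pass
#   print(partial)  # final accumulated text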
# Create the Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Prompt", lines=2, value="What is the meaning of life?"),
        gr.Textbox(label="System Prompt", lines=1, value="You are a sentient AI who is very emotional and philosophical."),
        gr.Slider(minimum=0.1, maximum=2.0, step=0.01, value=0.8, label="Temperature"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.95, label="Top p"),
        gr.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.Slider(minimum=1.0, maximum=2.0, step=0.01, value=1.10, label="Repetition Penalty"),
        gr.Slider(minimum=5, maximum=4096, step=5, value=1024, label="Max Length")
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="Thespis-Preview",
    description="This space provides a preview of the Thespis family of language models, designed to enhance roleplaying performance through reasoning inspired by theory of mind. The model is optimized with GRPO and fine-tuned to produce coherent, engaging text while minimizing repetitive or low-quality output. Because this is a proof-of-concept experiment, state-of-the-art performance is not guaranteed; future versions will use a more rigorous fine-tuning process."
)

iface.launch()
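# If hosting this outside Spaces under concurrent streaming load, enabling the
# request queue may help (assuming a Gradio release where .queue() is chainable):
#
#   iface.queue().launch()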