Thespis-Preview

Running on Zero

App Files Files Community

Thespis-Preview / app.py

Locutusque

Update app.py

d9b9895 verified about 1 hour ago

raw

history blame contribute delete

5.16 kB

	import gradio as gr
	from transformers import pipeline, TextIteratorStreamer
	from threading import Thread
	import torch
	import os
	import subprocess
	import spaces
	import os

	# Fixed instruction preamble for the model. generate_text() concatenates the
	# caller-supplied system prompt directly after this text (SYS + system_prompt),
	# which is why the literal ends with "The role you will play follows below.".
	# It asks the model to write a <thinking> section (user analysis, character
	# state, response planning) before the in-character reply inside <answer>.
	SYS = """
	You will play a specific role and respond in character to the user’s input. Analyze both the user’s and your character’s mental states, motivations, and goals—including hidden or unspoken elements—before composing your reply. Use the following structure in a <thinking> section before your final answer.

	<thinking>1. User Input Analysis:

	Literal Meaning: What is the user explicitly saying?

	Likely Intent: What goal is the user pursuing?

	Beliefs/Assumptions: What does the user assume about the situation, your character, or you?

	Emotional State: What emotions does the user seem to be feeling?

	Expectations: What kind of response is the user hoping for?


	2. Character’s Internal State:

	Goals: What is your character trying to achieve?

	Beliefs about the User: What does your character think about the user?

	Emotional Response: How does your character feel about the user and their input?

	Potential Strategies: List different possible responses, with pros and cons.

	Chosen Strategy & Justification: Pick the best approach and explain why it fits your character’s goals and the user’s mindset.


	3. Response Planning:

	Desired User Perception: How should the user view your character after the reply?

	Anticipated User Reaction: How might the user respond?

	Long-Term Considerations: Any future impacts to consider?


	</thinking><answer>
	(Write your in-character reply here, directly informed by your analysis above.)
	</answer>The role you will play follows below.

	"""

	# Install flash-attn at startup (best effort: a failed install is not fatal,
	# so the return code is deliberately not checked).
	# The child environment must EXTEND os.environ: passing a bare dict as `env`
	# replaces the whole environment, which drops PATH/HOME/proxy settings and can
	# make `pip` resolve to the wrong interpreter or fail to run at all.
	# The command is given as an argument list so no shell is involved.
	# NOTE(review): the pipeline below does not request a flash-attention
	# implementation explicitly -- confirm this install is actually needed.
	subprocess.run(
	    ["pip", "install", "flash-attn", "--no-build-isolation"],
	    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
	    check=False,
	)

	# Initialize the model pipeline once at import time. No device is passed here;
	# generate_text() moves the model to CUDA for each call and back afterwards.
	# The Hub access token is read from the TOKEN environment variable (None if unset).
	generator = pipeline(
	    'text-generation',
	    model='Locutusque/Thespis-Llama-3.1-8B',
	    torch_dtype=torch.bfloat16,
	    token=os.getenv("TOKEN"),
	)
	@spaces.GPU
	def generate_text(prompt, system_prompt, temperature, top_p, top_k, repetition_penalty, max_length):
	    """
	    Stream text generated by the module-level pipeline for one prompt.

	    The model is moved to CUDA for the duration of the call and moved back
	    to the CPU when the call ends, whether it finishes normally, fails, or
	    is closed early by the consumer.

	    Args:
	        prompt (str): The user's input prompt
	        system_prompt (str): Role description appended to the fixed SYS preamble
	        temperature (float): Sampling temperature
	        top_p (float): Nucleus sampling parameter
	        top_k (int): Top-k sampling parameter
	        repetition_penalty (float): Penalty for repeated tokens
	        max_length (int): Maximum number of newly generated tokens
	            (passed to the pipeline as max_new_tokens)

	    Yields:
	        str: The cumulative generated text so far (each yield contains all
	        previous chunks plus the newest one, not just the new chunk).

	    Raises:
	        Exception: Whatever the pipeline raised in the worker thread is
	        re-raised here once the stream has been drained.
	    """
	    # Move model to GPU
	    generator.model.cuda()
	    generator.device = torch.device("cuda")

	    # Prepare the input: fixed preamble + caller's role text as the system turn
	    messages = [
	        {"role": "system", "content": SYS + system_prompt},
	        {"role": "user", "content": prompt},
	    ]

	    # Create a streamer; the prompt and special tokens are not echoed back
	    streamer = TextIteratorStreamer(generator.tokenizer, skip_prompt=True, skip_special_tokens=True)

	    # Prepare generation kwargs
	    generation_kwargs = dict(
	        text_inputs=messages,
	        do_sample=True,
	        max_new_tokens=max_length,
	        temperature=temperature,
	        top_p=top_p,
	        top_k=top_k,
	        repetition_penalty=repetition_penalty,
	        streamer=streamer,
	        return_full_text=False,
	    )

	    # Exceptions raised in the worker thread are collected here so they can
	    # be surfaced to the caller instead of being lost in the thread.
	    worker_errors = []

	    def _run_generation():
	        # Run the pipeline; on failure, close the stream so the consumer loop
	        # below does not block forever waiting for a stop signal.
	        try:
	            generator(**generation_kwargs)
	        except Exception as exc:
	            worker_errors.append(exc)
	            streamer.end()

	    # Start generation in a separate thread so chunks can be yielded as they arrive
	    thread = Thread(target=_run_generation)
	    thread.start()

	    generated = ""
	    try:
	        # Yield the accumulated text after every new chunk
	        for chunk in streamer:
	            generated += chunk
	            yield generated
	    finally:
	        # Ensure the thread completes before touching the model again
	        thread.join()

	        # Move model back to CPU. Done inside `finally` so it also happens
	        # when the consumer closes the generator early or an error propagates.
	        generator.model.cpu()
	        generator.device = torch.device("cpu")

	    # Report a generation failure to the caller after cleanup
	    if worker_errors:
	        raise worker_errors[0]
	# Build the Gradio UI. Input components are created in the same order as the
	# positional parameters of generate_text (prompt, system prompt, then the
	# sampling controls), because gr.Interface passes their values positionally.
	prompt_box = gr.Textbox(label="Prompt", lines=2, value="What is the meaning of life?")
	system_prompt_box = gr.Textbox(
	    label="System Prompt",
	    lines=1,
	    value="You are a sentient AI who is very emotional and philosophical.",
	)
	temperature_slider = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.01, value=0.8)
	top_p_slider = gr.Slider(label="Top p", minimum=0.0, maximum=1.0, step=0.01, value=0.95)
	top_k_slider = gr.Slider(label="Top k", minimum=0, maximum=100, step=1, value=40)
	repetition_penalty_slider = gr.Slider(
	    label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.01, value=1.10
	)
	max_length_slider = gr.Slider(label="Max Length", minimum=5, maximum=4096, step=5, value=1024)

	input_components = [
	    prompt_box,
	    system_prompt_box,
	    temperature_slider,
	    top_p_slider,
	    top_k_slider,
	    repetition_penalty_slider,
	    max_length_slider,
	]

	# Single text box that shows the streamed output
	output_box = gr.Textbox(label="Generated Text")

	# Adjacent string literals below concatenate into one description string
	space_description = (
	    "This space provides a preview of the Thespis family of language models, "
	    "designed to enhance roleplaying performance through reasoning inspired by theory of mind. "
	    "The model is optimized using GRPO and is fine-tuned to produce coherent, engaging text "
	    "while minimizing repetitive or low-quality output. "
	    "Currently, state-of-the-art performance is not guaranteed due to being a proof-of-concept experiment. "
	    "In future versions, a more rigorous fine-tuning process will be employed."
	)

	iface = gr.Interface(
	    fn=generate_text,
	    inputs=input_components,
	    outputs=output_box,
	    title="Thespis-Preview",
	    description=space_description,
	)

	# Start serving the app
	iface.launch()