icn_v2_DEMO

Sleeping

App Files Files Community

icn_v2_DEMO / app.py

wop

Update app.py

03a5566 verified 3 months ago

raw

history blame

3.09 kB

	from transformers import pipeline
	import gradio as gr
	import json
	import time

	# Load the text-generation pipeline once, at import time, so that every
	# call to generate() reuses the same module-level `pipe` object.
	pipe = pipeline("text-generation", model="Blexus/Quble_test_model_v1_INSTRUCT_v2")

	def format_prompt(message, system, history):
	    """Flatten the chat state into the single text prompt used by this app.

	    The prompt starts with the system text closed by ``<|endofsystem|>``,
	    followed by one ``USER: ... <|endofuser|>`` / ``ASSISTANT: ...<|endoftext|>``
	    pair per past turn, and ends with the new user message and an open
	    ``ASSISTANT:`` tag for the model to complete.

	    Args:
	        message: The new user message to answer.
	        system: The system prompt text.
	        history: Iterable of ``(user_prompt, bot_response)`` pairs, or
	            ``None``/empty when there are no previous turns. Entries whose
	            length is not 2 are skipped.

	    Returns:
	        The formatted prompt string.
	    """
	    # The special markers are written with plain pipes: "\|" is not a valid
	    # string escape and would leave a literal backslash inside each marker.
	    parts = [f"SYSTEM: {system} <|endofsystem|>"]

	    for entry in history or []:
	        # Only complete (user, assistant) pairs can be rendered as a turn.
	        if len(entry) == 2:
	            user_prompt, bot_response = entry
	            parts.append(
	                f"USER: {user_prompt} <|endofuser|>\n"
	                f"ASSISTANT: {bot_response}<|endoftext|>\n"
	            )

	    parts.append(f"USER: {message}<|endofuser|>\nASSISTANT:")
	    return "".join(parts)

	def generate(prompt, system, history, temperature=0.9, max_new_tokens=4096, top_p=0.9, repetition_penalty=1.2):
	    """Generate a reply with the module-level pipeline and stream it back.

	    The model is called once (no real streaming); the finished reply is then
	    yielded as a growing prefix, one character at a time, with a short delay
	    between yields to simulate typing.

	    Args:
	        prompt: The new user message.
	        system: The system prompt text.
	        history: Previous turns, forwarded to ``format_prompt``.
	        temperature: Sampling temperature; values below 0.01 are raised to 0.01.
	        max_new_tokens: Upper bound on the number of generated tokens.
	        top_p: Nucleus-sampling probability mass.
	        repetition_penalty: Penalty applied to repeated tokens.

	    Yields:
	        The accumulated reply text so far. If generation fails or produces no
	        text, a single fallback apology message is yielded instead.
	    """
	    fallback_text = "We are sorry but Quble doesn't know how to answer."

	    # Clamp to a small positive floor: callers may pass 0.0, and sampling
	    # needs a strictly positive temperature.
	    temperature = max(float(temperature), 1e-2)
	    top_p = float(top_p)

	    formatted_prompt = format_prompt(prompt, system, history)

	    # Keep the try body limited to the model call so that only generation
	    # errors are caught here, not problems while streaming to the caller.
	    try:
	        outputs = pipe(
	            formatted_prompt,
	            # UI sliders may deliver floats; the token budget must be an int.
	            max_new_tokens=int(max_new_tokens),
	            # Without do_sample=True the temperature/top_p settings are ignored
	            # unless the model's own generation config already enables sampling.
	            do_sample=True,
	            temperature=temperature,
	            top_p=top_p,
	            repetition_penalty=float(repetition_penalty),
	        )
	        # The pipeline output includes the prompt; keep only the text after the
	        # last "ASSISTANT:" tag.
	        response_text = outputs[0]["generated_text"].split("ASSISTANT:")[-1].strip()
	    except Exception as e:
	        print(f"Error generating response: {e}")
	        response_text = ""

	    if not response_text:
	        # Always give the chat UI something to display.
	        yield fallback_text
	        return

	    # Simulate streaming by yielding ever-longer prefixes of the reply.
	    for end in range(1, len(response_text) + 1):
	        yield response_text[:end]
	        time.sleep(0.02)  # slight delay to simulate typing

	# Extra CSS for the page: sets a fixed height and flex-grow on the element
	# whose id is "component-7".
	# NOTE(review): "component-7" looks like an auto-numbered element id, so it
	# presumably depends on the order in which UI components are created —
	# confirm it still targets the intended element if the layout changes.
	customCSS = """
	#component-7 {
	height: 1600px;
	flex-grow: 4;
	}
	"""

	# Extra controls passed to the chat interface. They are created in the same
	# order as the extra parameters of generate(): system, temperature,
	# max_new_tokens, top_p, repetition_penalty.
	system_prompt_box = gr.Textbox(
	    label="System prompt",
	    info="System prompt",
	    value="You are a helpful intelligent assistant. Your name is Quble.",
	    interactive=True,
	)

	temperature_slider = gr.Slider(
	    label="Temperature",
	    info="Higher values produce more diverse outputs",
	    minimum=0.0,
	    maximum=1.0,
	    step=0.05,
	    value=0.9,
	    interactive=True,
	)

	max_new_tokens_slider = gr.Slider(
	    label="Max new tokens",
	    info="The maximum numbers of new tokens",
	    minimum=64,
	    maximum=4096,
	    step=64,
	    value=1024,
	    interactive=True,
	)

	top_p_slider = gr.Slider(
	    label="Top-p (nucleus sampling)",
	    info="Higher values sample more low-probability tokens",
	    minimum=0.0,
	    maximum=1,
	    step=0.05,
	    value=0.90,
	    interactive=True,
	)

	repetition_penalty_slider = gr.Slider(
	    label="Repetition penalty",
	    info="Penalize repeated tokens",
	    minimum=1.0,
	    maximum=2.0,
	    step=0.05,
	    value=1.2,
	    interactive=True,
	)

	additional_inputs = [
	    system_prompt_box,
	    temperature_slider,
	    max_new_tokens_slider,
	    top_p_slider,
	    repetition_penalty_slider,
	]

	# Build the page: a chat interface driven by generate(), with the extra
	# controls shown as additional inputs. The custom stylesheet is supplied
	# through the Blocks constructor, because Blocks exposes no set_css()
	# method to apply it afterwards.
	with gr.Blocks(theme=gr.themes.Soft(), css=customCSS) as demo:
	    gr.ChatInterface(
	        generate,
	        additional_inputs=additional_inputs,
	    )

	# Enable request queuing (needed for generator-based streaming replies) and
	# start the server; debug=True keeps the process attached and prints errors.
	demo.queue().launch(debug=True)