import threading

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, TextIteratorStreamer
# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-20M-checkpoint")
model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-20M-checkpoint", trust_remote_code=True)
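import torch

# Optional device placement (an assumption, not part of the original app): the 20M
# model is small, but generation is still faster on a GPU when one is available.
# Prompt tensors are moved to the same device inside generate_response() below.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")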
# Generation configuration
generation_config = GenerationConfig(
    max_new_tokens=100,
    use_cache=True,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    repetition_penalty=1.0,
)
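# A deterministic variant (an assumption about debugging needs; unused by the app
# below): greedy decoding ignores temperature and top_p, so outputs are
# reproducible when tweaking the prompt or chat template.
greedy_config = GenerationConfig(max_new_tokens=100, use_cache=True, do_sample=False)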
def generate_response(conversation):
    """
    Given a conversation (a list of dicts with roles "user"/"assistant" and their
    contents), prepare the prompt, start generation in a background thread, and
    yield the accumulated streamed output token by token.
    """
    # Prepare inputs using the tokenizer's chat template; add_generation_prompt=True
    # appends the template's assistant prefix so the model continues as the assistant
    inputs = tokenizer.apply_chat_template(
        conversation=conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)  # no-op on CPU, required if the model is on GPU

    # Create the streaming iterator. Note: skip_prompt=True omits the prompt from the stream.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Run generation in a separate thread so this function can consume the streamer
    thread = threading.Thread(
        target=model.generate,
        kwargs={
            "inputs": inputs,
            "tokenizer": tokenizer,
            "generation_config": generation_config,
            "streamer": streamer,
        },
    )
    thread.start()

    # Yield the growing response as tokens are generated
    full_response = ""
    for token in streamer:
        full_response += token
        yield full_response
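# Optional helper (an assumption, not part of the original app): drives
# generate_response() directly, which is handy for checking the model and chat
# template without launching the Gradio UI.
def smoke_test(prompt="Hello!"):
    conversation = [{"role": "user", "content": prompt}]
    for partial in generate_response(conversation):
        print(partial)  # prints the accumulated reply after each new chunk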
def chat(history):
    """
    Chat callback for Gradio.
    - `history` is a list of (user_message, assistant_response) pairs; the last
      pair is the one user() just appended, with an empty assistant slot.
    - We rebuild the full conversation (as a list of role/content dicts) from the
      completed turns, then append the latest user input.
    - We then call generate_response() to stream the model's reply, updating only
      the last history entry as tokens arrive.
    """
    # The textbox is cleared by user() before this runs, so the latest user
    # message is read from the history itself
    user_input = history[-1][0]

    # Rebuild the conversation from the completed turns for the model prompt
    conversation = []
    for user_msg, bot_msg in history[:-1]:
        conversation.append({"role": "user", "content": user_msg})
        conversation.append({"role": "assistant", "content": bot_msg})
    conversation.append({"role": "user", "content": user_input})

    # Stream the reply; only the last (user, assistant) pair changes on each yield
    for streamed_reply in generate_response(conversation):
        yield history[:-1] + [(user_input, streamed_reply)]
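# For reference: after one completed exchange plus a new message, the conversation
# passed to generate_response() looks like
#   [{"role": "user", "content": "Hi"},
#    {"role": "assistant", "content": "Hello! How can I help?"},
#    {"role": "user", "content": "Tell me a joke."}]
# (the message texts here are illustrative only).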
# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Chat with SmallDoge/Doge-20M-checkpoint")
    chatbot = gr.Chatbot()  # displays the conversation as a list of (user, assistant) pairs
    with gr.Row():
        msg = gr.Textbox(show_label=False, placeholder="Type your message here...")
        clear = gr.Button("Clear")

    # When the user submits a message, first clear the textbox and append the
    # message to the history with an empty assistant reply...
    def user(message, history):
        return "", history + [(message, "")]

    # ...then stream the model response using the chat() generator, which reads
    # the latest user message from the history (the textbox is already empty here)
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False) \
        .then(chat, chatbot, chatbot)
    clear.click(lambda: None, None, chatbot, queue=False)

# Enable the queue, which streaming responses require, and launch the app
demo.queue()
demo.launch()
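# To expose the app beyond localhost (an assumption about deployment needs),
# Gradio's launch() accepts, e.g.:
#   demo.launch(server_name="0.0.0.0", server_port=7860)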