yasserrmd committed
Commit 5c6e205 · verified · 1 Parent(s): 7acecf9

Create app.py

Files changed (1): app.py (+94, -0)
app.py ADDED
@@ -0,0 +1,94 @@
import gradio as gr
import threading
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, TextIteratorStreamer

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-20M-Instruct")
model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-20M-Instruct", trust_remote_code=True)

# Generation configuration
generation_config = GenerationConfig(
    max_new_tokens=100,
    use_cache=True,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    repetition_penalty=1.0
)
def generate_response(conversation):
    """
    Given a conversation (a list of dicts with roles "user"/"assistant" and their contents),
    this function prepares the prompt, starts generation in a separate thread, and yields
    the streamed output token by token.
    """
    # Prepare inputs using the chat template from the tokenizer;
    # add_generation_prompt=True appends the assistant prefix so the model knows to reply.
    inputs = tokenizer.apply_chat_template(
        conversation=conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    )
    # Create the streaming iterator. Note: skip_prompt=True omits the prompt from the stream.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Run generation in a separate thread so we can consume the streamer here
    thread = threading.Thread(
        target=model.generate,
        kwargs={
            "inputs": inputs,
            "tokenizer": tokenizer,
            "generation_config": generation_config,
            "streamer": streamer
        }
    )
    thread.start()

    # Yield the accumulated output as new tokens arrive
    full_response = ""
    for token in streamer:
        full_response += token
        yield full_response

def chat(history):
    """
    Chat callback for Gradio.

    - `history` is a list of (user_message, assistant_response) pairs; the last pair
      holds the just-submitted user message with an empty reply (added by `user` below).
    - We reassemble the full conversation (as a list of dicts) from the earlier turns,
      then append the latest user input.
    - We then call generate_response() to stream the model's reply.
    - As tokens stream in, we update only the last entry of the history.
    """
    user_input = history[-1][0]
    # Rebuild the conversation from the completed turns for the model prompt
    conversation = []
    for user_msg, bot_msg in history[:-1]:
        conversation.append({"role": "user", "content": user_msg})
        conversation.append({"role": "assistant", "content": bot_msg})
    conversation.append({"role": "user", "content": user_input})

    # Stream the reply, updating the last (user, assistant) pair as tokens arrive
    for streamed_reply in generate_response(conversation):
        yield history[:-1] + [(user_input, streamed_reply)]

# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Chat with SmallDoge/Doge-20M-Instruct")
    chatbot = gr.Chatbot()  # displays the conversation as a list of (user, assistant) pairs
    with gr.Row():
        msg = gr.Textbox(show_label=False, placeholder="Type your message here...")
        clear = gr.Button("Clear")

    # When the user submits a message, clear the textbox and append the message
    # to the history with an empty reply...
    def user(message, history):
        return "", history + [(message, "")]

    # ...then stream the model response into that last entry using our chat() generator.
    # chat() reads the pending message from the history, since the textbox is already cleared.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False) \
        .then(chat, chatbot, chatbot)

    clear.click(lambda: None, None, chatbot, queue=False)

# Enable the queue for streaming responses and launch the app
demo.queue()
demo.launch()
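
For context, here is a small standalone sketch of the history-to-conversation mapping that chat() performs before prompting the model. The message strings are made-up examples, not part of the committed file:

# Illustration only: how Gradio's (user, assistant) history pairs map onto the
# message dicts that tokenizer.apply_chat_template() expects.
history = [
    ("What is the capital of France?", "The capital of France is Paris."),
    ("And of Japan?", ""),  # pending turn appended by user() before chat() runs
]

conversation = []
for user_msg, bot_msg in history[:-1]:
    conversation.append({"role": "user", "content": user_msg})
    conversation.append({"role": "assistant", "content": bot_msg})
conversation.append({"role": "user", "content": history[-1][0]})

# conversation is now:
# [{"role": "user", "content": "What is the capital of France?"},
#  {"role": "assistant", "content": "The capital of France is Paris."},
#  {"role": "user", "content": "And of Japan?"}]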