savage1221 committed
Commit 602d649 · verified · 1 Parent(s): 2d2855d

Update app.py

Files changed (1): app.py (+363 -0)
app.py CHANGED
@@ -0,0 +1,363 @@
+ import os
+ import time
+ from threading import Thread
+ 
+ import gradio as gr
+ 
+ # OpenVINO-related imports are only needed for the commented-out export path below
+ from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
+ 
+ from transformers import (
+     AutoConfig,
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
+     StoppingCriteria,
+     StoppingCriteriaList,
+     GenerationConfig,
+ )
+ # model_name = "openai-community/gpt2-large"
+ # model_dir = "F:\\phi3\\openvinomodel\\phi3\\int4"
+ # model_name = "savage1221/lora-fine"
+ # save_name = model_name.split("/")[-1] + "_openvino"
+ # precision = "f32"
+ 
+ 
+ # quantization_config = OVWeightQuantizationConfig(
+ #     bits=4,
+ #     sym=False,
+ #     group_size=128,
+ #     ratio=0.6,
+ #     trust_remote_code=True,
+ # )
+ 
+ # ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}
+ 
+ # device = "gpu"
+ 
+ 
+ # load_kwargs = {
+ #     "device": device,
+ #     "ov_config": {
+ #         "PERFORMANCE_HINT": "LATENCY",
+ #         # "INFERENCE_PRECISION_HINT": precision,
+ #         "CACHE_DIR": os.path.join(save_name, "model_cache"),  # OpenVINO will use this directory as cache
+ #     },
+ #     "compile": False,
+ #     "quantization_config": quantization_config,
+ #     "trust_remote_code": True,
+ #     # ov_config = ov_config
+ # }
+ 
+ # # Check whether the model was already exported
+ # saved = os.path.exists(save_name)
+ 
+ # model = OVModelForCausalLM.from_pretrained(
+ #     model_name if not saved else save_name,
+ #     export=not saved,
+ #     **load_kwargs,
+ # )
+ # model = OVModelForCausalLM.from_pretrained(
+ #     model_name,
+ #     device='GPU.0',
+ #     ov_config=ov_config,
+ #     config=AutoConfig.from_pretrained(model_name, trust_remote_code=True),
+ #     trust_remote_code=True,
+ # )
+ 
+ # # Load tokenizer to be used with the model
+ # tokenizer = AutoTokenizer.from_pretrained(model_name if not saved else save_name)
+ # tokenizer = AutoTokenizer.from_pretrained(model_name)
+ 
+ # # Save the exported model locally
+ # if not saved:
+ #     model.save_pretrained(save_name)
+ #     tokenizer.save_pretrained(save_name)
+ 
+ # # TODO Optional: export to huggingface/hub
+ 
+ # model_size = os.stat(os.path.join(save_name, "openvino_model.bin")).st_size / 1024 ** 3
+ # print(f'Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB')
+ 
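+ # The commented-out block above is the original OpenVINO 4-bit export/load path (quantize,
+ # export, cache and reload the model for the iGPU). Re-enabling it means uncommenting the whole
+ # block, including the model_name/save_name definitions at its top, and it would then replace
+ # the plain transformers loading below.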
+ #####################################################################
+ 
+ # Load the fine-tuned model and tokenizer directly from the Hugging Face Hub
+ # (AutoModelForCausalLM and AutoTokenizer are imported at the top of the file)
+ tokenizer = AutoTokenizer.from_pretrained("savage1221/lora-fine", trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained("savage1221/lora-fine", trust_remote_code=True)
+ 
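+ # Note: loaded this way, the model runs with plain transformers on the default device (CPU,
+ # unless it is moved explicitly); the commented-out OpenVINO path above is what targets the iGPU.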
+ 
+ # Copied and modified from https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/generation.py#L13
+ class SuffixCriteria(StoppingCriteria):
+     def __init__(self, start_length, eof_strings, tokenizer, check_fn=None):
+         self.start_length = start_length
+         self.eof_strings = eof_strings
+         self.tokenizer = tokenizer
+         if check_fn is None:
+             check_fn = lambda decoded_generation: any(
+                 [decoded_generation.endswith(stop_string) for stop_string in self.eof_strings]
+             )
+         self.check_fn = check_fn
+ 
+     def __call__(self, input_ids, scores, **kwargs):
+         """Returns True if all generated sequences end with one of the stop strings"""
+         decoded_generations = self.tokenizer.batch_decode(input_ids[:, self.start_length :])
+         return all([self.check_fn(decoded_generation) for decoded_generation in decoded_generations])
+ 
+ 
+ def is_partial_stop(output, stop_str):
+     """Check whether the output ends with a partial stop string."""
+     for i in range(0, min(len(output), len(stop_str))):
+         if stop_str.startswith(output[-i:]):
+             return True
+     return False
+ 
+ 
+ # Set the chat template on the tokenizer. The chat template implements a simple format:
+ #   User: content
+ #   Assistant: content
+ #   ...
+ # Read more about chat templates here: https://huggingface.co/docs/transformers/main/en/chat_templating
+ tokenizer.chat_template = "{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
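+ # For illustration (example values, not from the original file), a message list such as
+ # [{"role": "User", "content": "Hi"}, {"role": "Assistant", "content": "Hello!"},
+ #  {"role": "User", "content": "What is OpenVINO?"}] with add_generation_prompt=True renders as:
+ #   User: Hi
+ #   Assistant: Hello!
+ #   User: What is OpenVINO?
+ #   Assistant: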
+ 
+ 
+ def prepare_history_for_model(history):
+     """
+     Converts the history to a tokenized prompt in the format expected by the model.
+ 
+     Params:
+       history: dialogue history
+     Returns:
+       Tokenized prompt
+     """
+     messages = []
+     for idx, (user_msg, model_msg) in enumerate(history):
+         # Skip the last assistant message if it is empty; the chat template adds the generation prompt
+         if idx == len(history) - 1 and not model_msg:
+             messages.append({"role": "User", "content": user_msg})
+             break
+         if user_msg:
+             messages.append({"role": "User", "content": user_msg})
+         if model_msg:
+             messages.append({"role": "Assistant", "content": model_msg})
+     input_token = tokenizer.apply_chat_template(
+         messages,
+         add_generation_prompt=True,
+         tokenize=True,
+         return_tensors="pt",
+         return_dict=True,
+     )
+     return input_token
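+ # For example (illustrative values): prepare_history_for_model([["What is OpenVINO?", None]])
+ # returns a BatchEncoding with "input_ids" and "attention_mask" tensors for the prompt
+ # "User: What is OpenVINO?\nAssistant:".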
+ 
+ 
+ def generate(history, temperature, max_new_tokens, top_p, repetition_penalty, assisted):
+     """
+     Generates the assistant's response given the chatbot history and generation parameters.
+ 
+     Params:
+       history: conversation history formatted in pairs of user and assistant messages `[user_message, assistant_message]`
+       temperature: controls the level of creativity in AI-generated text.
+         By adjusting the `temperature`, you can influence the model's probability distribution, making the text more focused or diverse.
+       max_new_tokens: the maximum number of tokens the model is allowed to generate as a response.
+       top_p: controls the range of tokens considered by the model based on their cumulative probability.
+       repetition_penalty: penalizes tokens based on how frequently they occur in the text.
+       assisted: boolean flag to enable/disable assisted generation with speculative decoding.
+     Yields:
+       Updated history and generation status.
+     """
+     start = time.perf_counter()
+     # Build and tokenize the model prompt from the conversation history
+     inputs = prepare_history_for_model(history)
+     input_length = inputs['input_ids'].shape[1]
+     # Truncate the input in case it is too long by keeping only the latest exchange.
+     # TODO improve this
+     if input_length > 2000:
+         history = [history[-1]]
+         inputs = prepare_history_for_model(history)
+         input_length = inputs['input_ids'].shape[1]
+ 
+     prompt_char = "▌"
+     history[-1][1] = prompt_char
+     yield history, "Status: Generating...", *([gr.update(interactive=False)] * 4)
+ 
+     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+ 
+     # Create stopping criteria so the model does not keep going and play the user's role as well.
+     stop_str = ["\nUser:", "\nAssistant:", "\nRules:", "\nQuestion:"]
+     stopping_criteria = StoppingCriteriaList([SuffixCriteria(input_length, stop_str, tokenizer)])
+     # Prepare the inputs for generate
+     generation_config = GenerationConfig(
+         max_new_tokens=max_new_tokens,
+         do_sample=temperature > 0.0,
+         temperature=temperature if temperature > 0.0 else 1.0,
+         repetition_penalty=repetition_penalty,
+         top_p=top_p,
+         eos_token_id=[tokenizer.eos_token_id],
+         pad_token_id=tokenizer.eos_token_id,
+     )
+     generate_kwargs = dict(
+         streamer=streamer,
+         generation_config=generation_config,
+         stopping_criteria=stopping_criteria,
+     ) | inputs
+ 
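+     # Note: the assisted-generation branch below expects a `stateless_model` and a draft
+     # `asst_model` (used for speculative decoding) to be defined elsewhere; they are not created
+     # in this file, so enabling the "Assisted Generation" checkbox will raise a NameError as-is.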
+     if assisted:
+         target_generate = stateless_model.generate
+         generate_kwargs["assistant_model"] = asst_model
+     else:
+         target_generate = model.generate
+ 
+     # Run generation in a background thread and stream tokens back through the streamer
+     t1 = Thread(target=target_generate, kwargs=generate_kwargs)
+     t1.start()
+ 
+     # Accumulate the generated text and stop early if a stop string is produced
+     partial_text = ""
+     for new_text in streamer:
+         partial_text += new_text
+         history[-1][1] = partial_text + prompt_char
+         for s in stop_str:
+             if (pos := partial_text.rfind(s)) != -1:
+                 break
+         if pos != -1:
+             partial_text = partial_text[:pos]
+             break
+         elif any([is_partial_stop(partial_text, s) for s in stop_str]):
+             continue
+         yield history, "Status: Generating...", *([gr.update(interactive=False)] * 4)
+     history[-1][1] = partial_text
+     generation_time = time.perf_counter() - start
+     yield history, f'Generation time: {generation_time:.2f} sec', *([gr.update(interactive=True)] * 4)
+ 
+ 
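+ # The four gr.update(...) values yielded by generate() toggle the interactivity of the
+ # msg, submit, regenerate and clear components wired as outputs in the Gradio events below.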
+ #############################################################
+ 
+ 
+ # model.compile()
+ 
+ 
+ # Close a previously launched demo if this script is being re-run interactively
+ try:
+     demo.close()
+ except Exception:
+     pass
+ 
+ 
+ EXAMPLES = [
+     ["What is OpenVINO?"],
+     ["Can you explain to me briefly what is Python programming language?"],
+     ["Explain the plot of Cinderella in a sentence."],
+     ["Write a Python function to perform binary search over a sorted list. Use markdown to write code"],
+     ["Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?"],
+ ]
+ 
+ 
+ def add_user_text(message, history):
+     """
+     Add user's message to chatbot history
+ 
+     Params:
+       message: current user message
+       history: conversation history
+     Returns:
+       Updated history, clears user message and status
+     """
+     # Append current user message to history with a blank assistant message which will be generated by the model
+     history.append([message, None])
+     return ('', history)
+ 
+ 
+ def prepare_for_regenerate(history):
+     """
+     Delete last assistant message to prepare for regeneration
+ 
+     Params:
+       history: conversation history
+     Returns:
+       Updated history
+     """
+     history[-1][1] = None
+     return history
+ 
+ 
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown('<h1 style="text-align: center;">Chat with Phi-3 on Meteor Lake iGPU</h1>')
+     chatbot = gr.Chatbot()
+     with gr.Row():
+         assisted = gr.Checkbox(value=False, label="Assisted Generation", scale=10)
+         msg = gr.Textbox(placeholder="Enter message here...", show_label=False, autofocus=True, scale=75)
+         status = gr.Textbox("Status: Idle", show_label=False, max_lines=1, scale=15)
+     with gr.Row():
+         submit = gr.Button("Submit", variant='primary')
+         regenerate = gr.Button("Regenerate")
+         clear = gr.Button("Clear")
+     with gr.Accordion("Advanced Options:", open=False):
+         with gr.Row():
+             with gr.Column():
+                 temperature = gr.Slider(
+                     label="Temperature",
+                     value=0.0,
+                     minimum=0.0,
+                     maximum=1.0,
+                     step=0.05,
+                     interactive=True,
+                 )
+                 max_new_tokens = gr.Slider(
+                     label="Max new tokens",
+                     value=512,
+                     minimum=0,
+                     maximum=1024,
+                     step=32,
+                     interactive=True,
+                 )
+             with gr.Column():
+                 top_p = gr.Slider(
+                     label="Top-p (nucleus sampling)",
+                     value=1.0,
+                     minimum=0.0,
+                     maximum=1.0,
+                     step=0.05,
+                     interactive=True,
+                 )
+                 repetition_penalty = gr.Slider(
+                     label="Repetition penalty",
+                     value=1.0,
+                     minimum=1.0,
+                     maximum=2.0,
+                     step=0.1,
+                     interactive=True,
+                 )
+     gr.Examples(
+         EXAMPLES, inputs=msg, label="Click on any example and press the 'Submit' button"
+     )
+ 
+     # Set the generate function to be triggered when the user submits a new message
+     gr.on(
+         triggers=[submit.click, msg.submit],
+         fn=add_user_text,
+         inputs=[msg, chatbot],
+         outputs=[msg, chatbot],
+         queue=False,
+     ).then(
+         fn=generate,
+         inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],
+         outputs=[chatbot, status, msg, submit, regenerate, clear],
+         concurrency_limit=1,
+         queue=True,
+     )
+     regenerate.click(
+         fn=prepare_for_regenerate,
+         inputs=chatbot,
+         outputs=chatbot,
+         queue=True,
+         concurrency_limit=1,
+     ).then(
+         fn=generate,
+         inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],
+         outputs=[chatbot, status, msg, submit, regenerate, clear],
+         concurrency_limit=1,
+         queue=True,
+     )
+     clear.click(fn=lambda: (None, "Status: Idle"), inputs=None, outputs=[chatbot, status], queue=False)
+ 
+ 
+ demo.launch()