malvika2003 committed
Commit d9d1031 · verified · 1 Parent(s): 6be5174

Update app.py

Files changed (1):
  app.py +330 -20

app.py CHANGED
@@ -1,30 +1,340 @@
  import gradio as gr
  from transformers import AutoTokenizer, TextIteratorStreamer

- # Define the models and their configurations
- model_name = "phi-2"
- model_configuration = {
-     "tokenizer_kwargs": {'model_id': 'susnato/phi-2', 'prompt_template': 'Instruct:{instruction}\nOutput:'}
- }

- # Load the tokenizer
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  tokenizer_kwargs = model_configuration.get("tokenizer_kwargs", {})

- # Define the Gradio interface
  def main():
-     with gr.Row():
-         with gr.Column(scale=4):
-             user_text = gr.Textbox(
-                 placeholder="Write an email about an alpaca that likes flan",
-                 label="User instruction",
-             )
-             model_output = gr.Textbox(label="Model response", interactive=False)
-             performance = gr.Textbox(label="Performance", lines=1, interactive=False)
          with gr.Column(scale=1):
-             button_clear = gr.Button(value="Clear")
-             button_submit = gr.Button(value="Submit")

- # Run the Gradio interface
- iface = gr.Interface(fn=main, inputs=user_text, outputs=model_output, performance=performance, live=True)
- iface.launch()
+ import os
+ from optimum.intel.openvino import OVModelForCausalLM
+ from config import SUPPORTED_LLM_MODELS
  import gradio as gr
+ from threading import Thread
+ from time import perf_counter
+ from typing import List
  from transformers import AutoTokenizer, TextIteratorStreamer
+ import numpy as np
+ from flask import Flask, render_template, redirect, url_for, request, flash
+ from flask_sqlalchemy import SQLAlchemy
+ from flask_login import LoginManager, UserMixin, login_user, login_required, logout_user, current_user
+ from werkzeug.security import generate_password_hash, check_password_hash

+ app = Flask(__name__)
+ app.config['SECRET_KEY'] = 'your_secret_key'
+ app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
+ db = SQLAlchemy(app)
+ login_manager = LoginManager()
+ login_manager.init_app(app)
+ login_manager.login_view = 'login'
+
+ # UserMixin supplies the is_authenticated/get_id helpers that flask_login needs
+ class User(UserMixin, db.Model):
+     id = db.Column(db.Integer, primary_key=True)
+     username = db.Column(db.String(80), unique=True, nullable=False)
+     # email is optional: signup() below never collects it
+     email = db.Column(db.String(120), unique=True, nullable=True)
+     # password stores the werkzeug hash checked in login()
+     password = db.Column(db.String(255), nullable=False)
+
+     def __repr__(self):
+         return '<User %r>' % self.username
+
+ # Create the database tables
+ with app.app_context():
+     db.create_all()
+
+ @login_manager.user_loader
+ def load_user(user_id):
+     return User.query.get(int(user_id))
+
+ @app.route('/signup', methods=['GET', 'POST'])
+ def signup():
+     if request.method == 'POST':
+         username = request.form['username']
+         password = request.form['password']
+         hashed_password = generate_password_hash(password)
+
+         new_user = User(username=username, password=hashed_password)
+         db.session.add(new_user)
+         db.session.commit()
+         flash('Signup successful!', 'success')
+         return redirect(url_for('login'))
+
+     return render_template('signup.html')
+
+ @app.route('/login', methods=['GET', 'POST'])
+ def login():
+     if request.method == 'POST':
+         username = request.form['username']
+         password = request.form['password']
+         user = User.query.filter_by(username=username).first()
+         if user and check_password_hash(user.password, password):
+             login_user(user)
+             return redirect(url_for('dashboard'))
+         flash('Invalid username or password', 'danger')
+
+     return render_template('login.html')
+
+ @app.route('/dashboard')
+ @login_required
+ def dashboard():
+     return render_template('dashboard.html', name=current_user.username)
+
+ @app.route('/logout')
+ @login_required
+ def logout():
+     logout_user()
+     return redirect(url_for('login'))
+
+ # NOTE: app.run() blocks, so when this file is run as a script the OpenVINO /
+ # Gradio code below only executes after the Flask server is stopped.
+ if __name__ == '__main__':
+     app.run(debug=True)
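+
+ # A quick sketch of exercising the auth routes above, assuming the dev server
+ # runs on Flask's default port 5000 and signup.html/login.html exist:
+ #
+ #   curl -X POST -d "username=alice&password=secret" http://127.0.0.1:5000/signup
+ #   curl -X POST -d "username=alice&password=secret" http://127.0.0.1:5000/login
+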
+ model_dir = "C:/Users/KIIT/OneDrive/Desktop/INTEL/phi-2/INT8_compressed_weights"
+ print(f"Checking model directory: {model_dir}")
+ print(f"Contents: {os.listdir(model_dir)}")  # check the directory contents
+
+ print(f"Loading model from {model_dir}")
+
+ model_name = "susnato/phi-2"
+ model_configuration = SUPPORTED_LLM_MODELS["phi-2"]
+ ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}
+
+ ov_model = OVModelForCausalLM.from_pretrained(
+     model_dir,
+     device="CPU",
+     ov_config=ov_config,
+ )
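+
+ # The ov_config above tunes the OpenVINO runtime for single-stream latency. A
+ # hypothetical alternative for throughput-oriented serving (not used by this
+ # app) would be:
+ #
+ #   ov_config = {"PERFORMANCE_HINT": "THROUGHPUT"}
+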
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  tokenizer_kwargs = model_configuration.get("tokenizer_kwargs", {})
+ response_key = model_configuration.get("response_key")
+ tokenizer_response_key = None
+
+ def get_special_token_id(tokenizer: AutoTokenizer, key: str) -> int:
+     """
+     Gets the token ID for a given string that has been added to the tokenizer as a special token.
+
+     Args:
+         tokenizer (PreTrainedTokenizer): the tokenizer
+         key (str): the key to convert to a single token
+
+     Raises:
+         ValueError: if more than one ID was generated
+
+     Returns:
+         int: the token ID for the given key
+     """
+     token_ids = tokenizer.encode(key)
+     if len(token_ids) > 1:
+         raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
+     return token_ids[0]
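+
+ # A sketch of the intended use, assuming "### End" was registered as an
+ # additional special token so it encodes to exactly one id:
+ #
+ #   tokenizer.add_special_tokens({"additional_special_tokens": ["### End"]})
+ #   end_id = get_special_token_id(tokenizer, "### End")  # one int, else ValueError
+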
+ if response_key is not None:
+     tokenizer_response_key = next(
+         (token for token in tokenizer.additional_special_tokens if token.startswith(response_key)),
+         None,
+     )
+
+ end_key_token_id = None
+ if tokenizer_response_key:
+     try:
+         end_key = model_configuration.get("end_key")
+         if end_key:
+             # Ensure generation stops once the model emits "### End"
+             end_key_token_id = get_special_token_id(tokenizer, end_key)
+     except ValueError:
+         pass
+
+ prompt_template = model_configuration.get("prompt_template", "{instruction}")
+ end_key_token_id = end_key_token_id or tokenizer.eos_token_id
+ pad_token_id = end_key_token_id or tokenizer.pad_token_id
+
+ def estimate_latency(
+     current_time: float,
+     current_perf_text: str,
+     new_gen_text: str,
+     per_token_time: List[float],
+     num_tokens: int,
+ ):
+     """
+     Helper function for performance estimation
+
+     Parameters:
+         current_time (float): This step's time in seconds.
+         current_perf_text (str): Current content of the performance UI field.
+         new_gen_text (str): Newly generated text.
+         per_token_time (List[float]): History of performance from previous steps.
+         num_tokens (int): Total number of generated tokens.
+
+     Returns:
+         update for the performance text field
+         update for the total number of tokens
+     """
+     num_current_toks = len(tokenizer.encode(new_gen_text))
+     num_tokens += num_current_toks
+     per_token_time.append(num_current_toks / current_time)
+     if len(per_token_time) > 10 and len(per_token_time) % 4 == 0:
+         current_bucket = per_token_time[-10:]  # average over the 10 most recent steps
+         return (
+             f"Average generation speed: {np.mean(current_bucket):.2f} tokens/s. Total generated tokens: {num_tokens}",
+             num_tokens,
+         )
+     return current_perf_text, num_tokens
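+
+ # Worked example with hypothetical numbers: a step that yields 3 tokens in
+ # 0.12 s appends 3 / 0.12 = 25 tokens/s; once more than 10 steps have
+ # accumulated, every 4th call reports the mean speed over the last 10 steps.
+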
+ def run_generation(
+     user_text: str,
+     top_p: float,
+     temperature: float,
+     top_k: int,
+     max_new_tokens: int,
+     perf_text: str,
+ ):
+     """
+     Text generation function
+
+     Parameters:
+         user_text (str): User-provided instruction for generation.
+         top_p (float): Nucleus sampling. If set to < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+         temperature (float): The value used to modulate the logits distribution.
+         top_k (int): The number of highest-probability vocabulary tokens to keep for top-k filtering.
+         max_new_tokens (int): Maximum length of the generated sequence.
+         perf_text (str): Content of the text field for printing performance results.
+     Returns:
+         model_output (str) - model-generated text
+         perf_text (str) - updated performance text field content
+     """
+
+     # Prepare the input prompt according to the model's expected template
+     prompt_text = prompt_template.format(instruction=user_text)
+
+     # Tokenize the user text.
+     model_inputs = tokenizer(prompt_text, return_tensors="pt", **tokenizer_kwargs)
+
+     # Start generation on a separate thread, so that we don't block the UI. The
+     # text is pulled from the streamer in the main thread.
+     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+     generate_kwargs = dict(
+         model_inputs,
+         streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         do_sample=True,
+         top_p=top_p,
+         temperature=float(temperature),
+         top_k=top_k,
+         eos_token_id=end_key_token_id,
+         pad_token_id=pad_token_id,
+     )
+     t = Thread(target=ov_model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     # Pull the generated text from the streamer, and update the model output.
+     model_output = ""
+     per_token_time = []
+     num_tokens = 0
+     start = perf_counter()
+     for new_text in streamer:
+         current_time = perf_counter() - start
+         model_output += new_text
+         perf_text, num_tokens = estimate_latency(current_time, perf_text, new_text, per_token_time, num_tokens)
+         yield model_output, perf_text
+         start = perf_counter()
+     return model_output, perf_text
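+
+ # A sketch of calling run_generation directly (hypothetical values mirroring
+ # the UI defaults below); it is a generator, so partial results are consumed
+ # by iterating:
+ #
+ #   for partial_text, perf in run_generation(
+ #       "Name 3 advantages to being a cat", 0.92, 0.8, 0, 256, ""
+ #   ):
+ #       print(partial_text)
+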
+ def reset_textbox(instruction: str, response: str, perf: str):
+     """
+     Helper function for resetting the content of all text fields
+
+     Parameters:
+         instruction (str): Content of the user instruction field.
+         response (str): Content of the model response field.
+         perf (str): Content of the performance info field.
+
+     Returns:
+         empty string for each placeholder
+     """
+     return "", "", ""
+
+
+ examples = [
+     "Give me a recipe for pizza with pineapple",
+     "Write me a tweet about the new OpenVINO release",
+     "Explain the difference between CPU and GPU",
+     "Give five ideas for a great weekend with family",
+     "Do Androids dream of Electric sheep?",
+     "Who is Dolly?",
+     "Please give me advice on how to write a resume",
+     "Name 3 advantages to being a cat",
+     "Write instructions on how to become a good AI engineer",
+     "Write a love letter to my best friend",
+ ]

  def main():
+     with gr.Blocks() as demo:
+         gr.Markdown(
+             "# Question Answering with phi-2 and OpenVINO.\n"
+             "Provide an instruction that describes a task below, or pick one of the predefined examples, and the model will write a response that performs the requested task."
+         )
+
+         with gr.Row():
+             with gr.Column(scale=4):
+                 user_text = gr.Textbox(
+                     placeholder="Write an email about an alpaca that likes flan",
+                     label="User instruction",
+                 )
+                 model_output = gr.Textbox(label="Model response", interactive=False)
+                 performance = gr.Textbox(label="Performance", lines=1, interactive=False)
+             with gr.Column(scale=1):
+                 button_clear = gr.Button(value="Clear")
+                 button_submit = gr.Button(value="Submit")
+                 gr.Examples(examples, user_text)
              with gr.Column(scale=1):
+                 max_new_tokens = gr.Slider(
+                     minimum=1,
+                     maximum=1000,
+                     value=256,
+                     step=1,
+                     interactive=True,
+                     label="Max New Tokens",
+                 )
+                 top_p = gr.Slider(
+                     minimum=0.05,
+                     maximum=1.0,
+                     value=0.92,
+                     step=0.05,
+                     interactive=True,
+                     label="Top-p (nucleus sampling)",
+                 )
+                 top_k = gr.Slider(
+                     minimum=0,
+                     maximum=50,
+                     value=0,
+                     step=1,
+                     interactive=True,
+                     label="Top-k",
+                 )
+                 temperature = gr.Slider(
+                     minimum=0.1,
+                     maximum=5.0,
+                     value=0.8,
+                     step=0.1,
+                     interactive=True,
+                     label="Temperature",
+                 )
+
+         user_text.submit(
+             run_generation,
+             [user_text, top_p, temperature, top_k, max_new_tokens, performance],
+             [model_output, performance],
+         )
+         button_submit.click(
+             run_generation,
+             [user_text, top_p, temperature, top_k, max_new_tokens, performance],
+             [model_output, performance],
+         )
+         button_clear.click(
+             reset_textbox,
+             [user_text, model_output, performance],
+             [user_text, model_output, performance],
+         )
+
+     demo.queue()
+     try:
+         demo.launch(height=800)
+     except Exception:
+         demo.launch(share=True, height=800)
+
+
+ if __name__ == "__main__":
+     # Start the Gradio interface
+     main()