chansung committed
Commit
4e963f7
Parent: 5c9d90a

Update app.py

Files changed (1)
  1. app.py +30 -8
app.py CHANGED
@@ -123,7 +123,12 @@ def submit(
     compute_selector,
     min_node_selector,
     max_node_selector,
-    security_selector
+    security_selector,
+    custom_kernel,
+    max_input_length,
+    max_tokens,
+    max_batch_prefill_token,
+    max_batch_total_token
 ):
     compute_resources = compute_selector.split("·")
     accelerator = compute_resources[0][:3].strip()
@@ -148,7 +153,19 @@ def submit(
         "model": {
             "framework": framework_selector.lower(),
             "image": {
-                "huggingface": {}
+                "custom": {
+                    "health_route": "/health",
+                    "env": {
+                        "DISABLE_CUSTOM_KERNELS": "false" if custom_kernel == "Enabled" else "true",
+                        "MAX_BATCH_PREFILL_TOKENS": str(max_batch_prefill_token),
+                        "MAX_BATCH_TOTAL_TOKENS": str(max_batch_total_token),
+                        "MAX_INPUT_LENGTH": str(max_input_length),
+                        "MAX_TOTAL_TOKENS": str(max_tokens),
+                        "MODEL_ID": repository_selector.lower(),
+                        # "QUANTIZE": "bitsandbytes" or "gptq" (optional)
+                    },
+                    "url": "ghcr.io/huggingface/text-generation-inference:1.0.1",
+                }
             },
             "repository": repository_selector.lower(),
             "revision": "main",
@@ -322,7 +339,7 @@ Name for your new endpoint""")
             gr.Markdown("""### Custom Cuda Kernels
 
 TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
-            _ = gr.Dropdown(
+            custom_kernel = gr.Dropdown(
                 value="Enabled",
                 choices=["Enabled", "Disabled"],
                 interactive=True,
@@ -347,7 +364,7 @@ Name for your new endpoint""")
             gr.Markdown("""### Max Input Length (per Query)
 
 Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
-            _ = gr.Number(
+            max_input_length = gr.Number(
                 value=1024,
                 interactive=True,
                 show_label=False,
@@ -358,7 +375,7 @@ Name for your new endpoint""")
             gr.Markdown("""### Max Number of Tokens (per Query)
 
 The larger this value, the more memory each request will consume and the less effective batching can be.""")
-            _ = gr.Number(
+            max_tokens = gr.Number(
                 value=1512,
                 interactive=True,
                 show_label=False,
@@ -370,7 +387,7 @@ Name for your new endpoint""")
             gr.Markdown("""### Max Batch Prefill Tokens
 
 Number of prefill tokens used during continuous batching. It can be useful to adjust this number since the prefill operation is memory-intensive and compute-bound.""")
-            _ = gr.Number(
+            max_batch_prefill_token = gr.Number(
                 value=2048,
                 interactive=True,
                 show_label=False,
@@ -381,7 +398,7 @@ Name for your new endpoint""")
             gr.Markdown("""### Max Batch Total Tokens
 
 Number of tokens that can be passed before forcing waiting queries to be put on the batch. A value of 1000 can fit 10 queries of 100 tokens or a single query of 1000 tokens.""")
-            _ = gr.Number(
+            max_batch_total_token = gr.Number(
                 value=None,
                 interactive=True,
                 show_label=False,
@@ -416,7 +433,12 @@ Name for your new endpoint""")
                 compute_selector,
                 min_node_selector,
                 max_node_selector,
-                security_selector],
+                security_selector,
+                custom_kernel,
+                max_input_length,
+                max_tokens,
+                max_batch_prefill_token,
+                max_batch_total_token],
             outputs=status_txt)
 
         with gr.Tab("AWS", elem_classes=["no-border"]):
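For reference, here is a minimal sketch (not code from app.py) of the "model" block the updated submit() assembles, with the UI defaults above filled in. The framework value and repository id are placeholders standing in for framework_selector.lower() and repository_selector.lower():

```python
# Sketch of the "model" block built by submit() after this change.
# "pytorch" and "some-org/some-model" are placeholder values.
model_block = {
    "framework": "pytorch",
    "image": {
        "custom": {
            "health_route": "/health",
            "env": {
                "DISABLE_CUSTOM_KERNELS": "false",  # custom_kernel == "Enabled"
                "MAX_BATCH_PREFILL_TOKENS": "2048",
                "MAX_INPUT_LENGTH": "1024",
                "MAX_TOTAL_TOKENS": "1512",
                "MODEL_ID": "some-org/some-model",
            },
            "url": "ghcr.io/huggingface/text-generation-inference:1.0.1",
        }
    },
    "repository": "some-org/some-model",
    "revision": "main",
}
```

One caveat the diff leaves open: the Max Batch Total Tokens field defaults to None, and str(max_batch_total_token) then yields the literal string "None". That key is therefore omitted from the sketch above; submit() would likely need to include it only when the field is actually set.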
 
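The renames from `_ = gr.Number(...)` to named variables are what make the new values reachable from submit(): Gradio passes each component listed in inputs= as a positional argument to the click handler, and a component discarded into `_` cannot be referenced there. A runnable sketch of that wiring, independent of app.py (the component set and handler body are illustrative):

```python
import gradio as gr

def submit(max_input_length, max_tokens):
    # Values arrive in the order of the inputs= list below.
    return f"MAX_INPUT_LENGTH={int(max_input_length)}, MAX_TOTAL_TOKENS={int(max_tokens)}"

with gr.Blocks() as demo:
    max_input_length = gr.Number(value=1024, label="Max Input Length (per Query)")
    max_tokens = gr.Number(value=1512, label="Max Number of Tokens (per Query)")
    status_txt = gr.Textbox(label="Status")
    submit_button = gr.Button("Submit")
    submit_button.click(
        submit,
        inputs=[max_input_length, max_tokens],
        outputs=status_txt,
    )

demo.launch()
```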