chansung committed
Commit 16d0614 · 1 Parent(s): 976429f

Update app.py

Files changed (1)
  1. app.py +76 -75
app.py CHANGED
@@ -306,82 +306,83 @@ Name for your new endpoint""")
             )

         with gr.Column(elem_classes=["group-border"]):
-            with gr.Column():
-                gr.Markdown("""### Container Type
-
-                Text Generation Inference is an optimized container for text generation task""")
-                _ = gr.Textbox("Text Generation Inference", show_label=False, elem_classes=["no-label", "small-big"])
-
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown("""### Custom Cuda Kernels
-
-                    TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
-                    _ = gr.Dropdown(
-                        value="Enabled",
-                        choices=["Enabled", "Disabled"],
-                        interactive=True,
-                        show_label=False,
-                        elem_classes=["no-label", "small-big"]
-                    )
-
-                with gr.Column():
-                    gr.Markdown("""### Quantization
-
-                    Quantization can reduce the model size and improve latency, with little degradation in model accuracy.""")
-                    _ = gr.Dropdown(
-                        value="None",
-                        choices=["None", "Bitsandbytes", "GPTQ"],
-                        interactive=True,
-                        show_label=False,
-                        elem_classes=["no-label", "small-big"]
-                    )
-
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown("""### Max Input Length (per Query)
-
-                    Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
-                    _ = gr.Number(
-                        value=1024,
-                        interactive=True,
-                        show_label=False,
-                        elem_classes=["no-label", "small-big"]
-                    )
-
-                with gr.Column():
-                    gr.Markdown("""### Max Number of Tokens (per Query)
-
-                    The larger this value, the more memory each request will consume and the less effective batching can be.""")
-                    _ = gr.Number(
-                        value=1512,
-                        interactive=True,
-                        show_label=False,
-                        elem_classes=["no-label", "small-big"]
-                    )
-
-            with gr.Row():
+            with gr.Accordion("Serving Container", open=False):
                 with gr.Column():
-                    gr.Markdown("""### Max Batch Prefill Tokens
-
-                    Number of prefill tokens used during continuous batching. It can be useful to adjust this number since the prefill operation is memory-intensive and compute-bound.""")
-                    _ = gr.Number(
-                        value=2048,
-                        interactive=True,
-                        show_label=False,
-                        elem_classes=["no-label", "small-big"]
-                    )
-
-                with gr.Column():
-                    gr.Markdown("""### Max Batch Total Tokens
-
-                    Number of tokens that can be passed before forcing waiting queries to be put on the batch. A value of 1000 can fit 10 queries of 100 tokens or a single query of 1000 tokens.""")
-                    _ = gr.Number(
-                        value=None,
-                        interactive=True,
-                        show_label=False,
-                        elem_classes=["no-label", "small-big"]
-                    )
+                    gr.Markdown("""### Container Type
+
+                    Text Generation Inference is an optimized container for text generation task""")
+                    _ = gr.Textbox("Text Generation Inference", show_label=False, elem_classes=["no-label", "small-big"])
+
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("""### Custom Cuda Kernels
+
+                        TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
+                        _ = gr.Dropdown(
+                            value="Enabled",
+                            choices=["Enabled", "Disabled"],
+                            interactive=True,
+                            show_label=False,
+                            elem_classes=["no-label", "small-big"]
+                        )
+
+                    with gr.Column():
+                        gr.Markdown("""### Quantization
+
+                        Quantization can reduce the model size and improve latency, with little degradation in model accuracy.""")
+                        _ = gr.Dropdown(
+                            value="None",
+                            choices=["None", "Bitsandbytes", "GPTQ"],
+                            interactive=True,
+                            show_label=False,
+                            elem_classes=["no-label", "small-big"]
+                        )
+
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("""### Max Input Length (per Query)
+
+                        Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
+                        _ = gr.Number(
+                            value=1024,
+                            interactive=True,
+                            show_label=False,
+                            elem_classes=["no-label", "small-big"]
+                        )
+
+                    with gr.Column():
+                        gr.Markdown("""### Max Number of Tokens (per Query)
+
+                        The larger this value, the more memory each request will consume and the less effective batching can be.""")
+                        _ = gr.Number(
+                            value=1512,
+                            interactive=True,
+                            show_label=False,
+                            elem_classes=["no-label", "small-big"]
+                        )
+
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("""### Max Batch Prefill Tokens
+
+                        Number of prefill tokens used during continuous batching. It can be useful to adjust this number since the prefill operation is memory-intensive and compute-bound.""")
+                        _ = gr.Number(
+                            value=2048,
+                            interactive=True,
+                            show_label=False,
+                            elem_classes=["no-label", "small-big"]
+                        )
+
+                    with gr.Column():
+                        gr.Markdown("""### Max Batch Total Tokens
+
+                        Number of tokens that can be passed before forcing waiting queries to be put on the batch. A value of 1000 can fit 10 queries of 100 tokens or a single query of 1000 tokens.""")
+                        _ = gr.Number(
+                            value=None,
+                            interactive=True,
+                            show_label=False,
+                            elem_classes=["no-label", "small-big"]
+                        )

         submit_button = gr.Button(
             value="Submit",