Update app.py
app.py CHANGED
@@ -306,82 +306,83 @@ Name for your new endpoint""")
     )
 
 with gr.Column(elem_classes=["group-border"]):
-    with gr.
-    gr.Markdown("""### Container Type
-
-    Text Generation Inference is an optimized container for text generation task""")
-    _ = gr.Textbox("Text Generation Inference", show_label=False, elem_classes=["no-label", "small-big"])
-
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("""### Custom Cuda Kernels
-
-            TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
-            _ = gr.Dropdown(
-                value="Enabled",
-                choices=["Enabled", "Disabled"],
-                interactive=True,
-                show_label=False,
-                elem_classes=["no-label", "small-big"]
-            )
-
-        with gr.Column():
-            gr.Markdown("""### Quantization
-
-            Quantization can reduce the model size and improve latency, with little degradation in model accuracy.""")
-            _ = gr.Dropdown(
-                value="None",
-                choices=["None", "Bitsandbytes", "GPTQ"],
-                interactive=True,
-                show_label=False,
-                elem_classes=["no-label", "small-big"]
-            )
-
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("""### Max Input Length (per Query)
-
-            Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
-            _ = gr.Number(
-                value=1024,
-                interactive=True,
-                show_label=False,
-                elem_classes=["no-label", "small-big"]
-            )
-
-        with gr.Column():
-            gr.Markdown("""### Max Number of Tokens (per Query)
-
-            The larger this value, the more memory each request will consume and the less effective batching can be.""")
-            _ = gr.Number(
-                value=1512,
-                interactive=True,
-                show_label=False,
-                elem_classes=["no-label", "small-big"]
-            )
-
-    with gr.Row():
+    with gr.Accordion("Serving Container", open=False):
         with gr.Column():
-            gr.Markdown("""###
-
-
-            _ = gr.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            gr.Markdown("""### Container Type
+
+            Text Generation Inference is an optimized container for text generation task""")
+            _ = gr.Textbox("Text Generation Inference", show_label=False, elem_classes=["no-label", "small-big"])
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""### Custom Cuda Kernels
+
+                    TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
+                    _ = gr.Dropdown(
+                        value="Enabled",
+                        choices=["Enabled", "Disabled"],
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""### Quantization
+
+                    Quantization can reduce the model size and improve latency, with little degradation in model accuracy.""")
+                    _ = gr.Dropdown(
+                        value="None",
+                        choices=["None", "Bitsandbytes", "GPTQ"],
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""### Max Input Length (per Query)
+
+                    Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
+                    _ = gr.Number(
+                        value=1024,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""### Max Number of Tokens (per Query)
+
+                    The larger this value, the more memory each request will consume and the less effective batching can be.""")
+                    _ = gr.Number(
+                        value=1512,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""### Max Batch Prefill Tokens
+
+                    Number of prefill tokens used during continuous batching. It can be useful to adjust this number since the prefill operation is memory-intensive and compute-bound.""")
+                    _ = gr.Number(
+                        value=2048,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""### Max Batch Total Tokens
+
+                    Number of tokens that can be passed before forcing waiting queries to be put on the batch. A value of 1000 can fit 10 queries of 100 tokens or a single query of 1000 tokens.""")
+                    _ = gr.Number(
+                        value=None,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
 
 submit_button = gr.Button(
     value="Submit",
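For reference, here is a minimal, self-contained sketch of the pattern this commit introduces: the serving-container fields are nested inside a gr.Accordion created with open=False, so the advanced TGI settings stay collapsed until the user expands them. This is an illustration, not the Space's actual app.py; the simplified layout and component names (demo, prefill_tokens, total_tokens) are assumptions.

import gradio as gr

# Illustrative sketch of the commit's key change: advanced serving settings
# live inside a collapsed Accordion instead of being shown inline.
with gr.Blocks() as demo:
    with gr.Column():
        with gr.Accordion("Serving Container", open=False):
            with gr.Column():
                gr.Markdown("### Container Type\n\nText Generation Inference is an optimized container for the text generation task.")
                container_type = gr.Textbox("Text Generation Inference", show_label=False)
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Max Batch Prefill Tokens")
                        prefill_tokens = gr.Number(value=2048, interactive=True, show_label=False)
                    with gr.Column():
                        gr.Markdown("### Max Batch Total Tokens")
                        total_tokens = gr.Number(value=None, interactive=True, show_label=False)
        submit_button = gr.Button(value="Submit")

if __name__ == "__main__":
    demo.launch()

Keeping the accordion closed by default leaves only the basic endpoint fields and the Submit button visible on first load, while the TGI tuning parameters remain one click away.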