Update app.py
app.py
CHANGED
@@ -123,7 +123,12 @@ def submit(
     compute_selector,
     min_node_selector,
     max_node_selector,
-    security_selector
+    security_selector,
+    custom_kernel,
+    max_input_length,
+    max_tokens,
+    max_batch_prefill_token,
+    max_batch_total_token
 ):
     compute_resources = compute_selector.split("·")
     accelerator = compute_resources[0][:3].strip()
@@ -148,7 +153,19 @@ def submit(
         "model": {
             "framework": framework_selector.lower(),
             "image": {
-                "huggingface": {}
+                "custom": {
+                    "health_route": "/health",
+                    "env": {
+                        "DISABLE_CUSTOM_KERNELS": "false" if custom_kernel == "Enabled" else "true",
+                        "MAX_BATCH_PREFILL_TOKENS": str(max_batch_prefill_token),
+                        "MAX_BATCH_TOTAL_TOKENS": str(max_batch_total_token),
+                        "MAX_INPUT_LENGTH": str(max_input_length),
+                        "MAX_TOTAL_TOKENS": str(max_tokens),
+                        "MODEL_ID": repository_selector.lower(),
+                        # "QUANTIZE": "bitsandbytes" | "gptq"
+                    },
+                    "url": "ghcr.io/huggingface/text-generation-inference:1.0.1",
+                }
             },
             "repository": repository_selector.lower(),
             "revision": "main",
@@ -322,7 +339,7 @@ Name for your new endpoint""")
                     gr.Markdown("""### Custom Cuda Kernels

 TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
-                    gr.Dropdown(
+                    custom_kernel = gr.Dropdown(
                         value="Enabled",
                         choices=["Enabled", "Disabled"],
                         interactive=True,
@@ -347,7 +364,7 @@ Name for your new endpoint""")
                     gr.Markdown("""### Max Input Length (per Query)

 Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
-                    gr.Number(
+                    max_input_length = gr.Number(
                         value=1024,
                         interactive=True,
                         show_label=False,
@@ -358,7 +375,7 @@ Name for your new endpoint""")
                     gr.Markdown("""### Max Number of Tokens (per Query)

 The larger this value, the more memory each request will consume and the less effective batching can be.""")
-                    gr.Number(
+                    max_tokens = gr.Number(
                         value=1512,
                         interactive=True,
                         show_label=False,
@@ -370,7 +387,7 @@ Name for your new endpoint""")
                     gr.Markdown("""### Max Batch Prefill Tokens

 Number of prefill tokens used during continuous batching. It can be useful to adjust this number since the prefill operation is memory-intensive and compute-bound.""")
-                    gr.Number(
+                    max_batch_prefill_token = gr.Number(
                         value=2048,
                         interactive=True,
                         show_label=False,
@@ -381,7 +398,7 @@ Name for your new endpoint""")
                     gr.Markdown("""### Max Batch Total Tokens

 Number of tokens that can be passed before forcing waiting queries to be put on the batch. A value of 1000 can fit 10 queries of 100 tokens or a single query of 1000 tokens.""")
-                    gr.Number(
+                    max_batch_total_token = gr.Number(
                         value=None,
                         interactive=True,
                         show_label=False,
@@ -416,7 +433,12 @@ Name for your new endpoint""")
                     compute_selector,
                     min_node_selector,
                     max_node_selector,
-                    security_selector],
+                    security_selector,
+                    custom_kernel,
+                    max_input_length,
+                    max_tokens,
+                    max_batch_prefill_token,
+                    max_batch_total_token],
                 outputs=status_txt)

         with gr.Tab("AWS", elem_classes=["no-border"]):
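
For context, the second half of the change binds each of the new advanced-settings components to a name (custom_kernel, max_input_length, max_tokens, max_batch_prefill_token, max_batch_total_token) so it can appear in the inputs=[...] list and reach submit() as an argument. Below is a minimal, self-contained sketch of that wiring pattern; the submit_settings handler is illustrative and not the Space's real submit().

import gradio as gr

def submit_settings(custom_kernel, max_input_length, max_tokens):
    # Echo back the advanced settings that submit() would fold into the endpoint payload.
    return (
        f"kernels={custom_kernel}, "
        f"MAX_INPUT_LENGTH={int(max_input_length)}, "
        f"MAX_TOTAL_TOKENS={int(max_tokens)}"
    )

with gr.Blocks() as demo:
    gr.Markdown("### Custom Cuda Kernels")
    custom_kernel = gr.Dropdown(value="Enabled", choices=["Enabled", "Disabled"], interactive=True, show_label=False)
    gr.Markdown("### Max Input Length (per Query)")
    max_input_length = gr.Number(value=1024, interactive=True, show_label=False)
    gr.Markdown("### Max Number of Tokens (per Query)")
    max_tokens = gr.Number(value=1512, interactive=True, show_label=False)
    status_txt = gr.Textbox(label="Status")
    # Because the components are bound to names, they can be listed as inputs,
    # just like the inputs=[...] list extended at the end of the diff.
    gr.Button("Submit").click(fn=submit_settings, inputs=[custom_kernel, max_input_length, max_tokens], outputs=status_txt)

demo.launch()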
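
The core of the commit is swapping the default Hugging Face image for a custom text-generation-inference image whose behaviour is driven by environment variables, all of which must be passed as strings. The sketch below shows one way the image block could be assembled and posted to the Inference Endpoints API; the API base URL, the create_endpoint helper, and the guard for an unset Max Batch Total Tokens (with the UI default of None, str(max_batch_total_token) would literally send "None") are assumptions, not code taken from app.py.

import os
import requests

API_BASE = "https://api.endpoints.huggingface.cloud/v2/endpoint"  # assumed Inference Endpoints API base

def build_image_config(custom_kernel, max_input_length, max_tokens,
                       max_batch_prefill_token, max_batch_total_token, repository):
    """Mirror the image block added in this commit; TGI env values must be strings."""
    env = {
        "DISABLE_CUSTOM_KERNELS": "false" if custom_kernel == "Enabled" else "true",
        "MAX_BATCH_PREFILL_TOKENS": str(int(max_batch_prefill_token)),
        "MAX_INPUT_LENGTH": str(int(max_input_length)),
        "MAX_TOTAL_TOKENS": str(int(max_tokens)),
        "MODEL_ID": repository.lower(),
    }
    if max_batch_total_token:  # optional: TGI can infer it from available memory when unset
        env["MAX_BATCH_TOTAL_TOKENS"] = str(int(max_batch_total_token))
    return {
        "custom": {
            "health_route": "/health",
            "env": env,
            "url": "ghcr.io/huggingface/text-generation-inference:1.0.1",
        }
    }

def create_endpoint(namespace, payload, token):
    # Illustrative request only; app.py may build and send its payload differently.
    response = requests.post(
        f"{API_BASE}/{namespace}",
        headers={"Authorization": f"Bearer {token}"},
        json=payload,
    )
    response.raise_for_status()
    return response.json()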
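
The four token limits are not independent: a single query's input length has to be strictly smaller than its total token budget, and, as far as I understand the TGI launcher's checks, the prefill and batch budgets must each be able to hold at least one full query. Below is a small sanity check along those lines, using the defaults shown in the UI above; the exact rules TGI enforces may differ.

def check_tgi_limits(max_input_length, max_total_tokens,
                     max_batch_prefill_tokens, max_batch_total_tokens=None):
    """Best-effort sanity checks; the authoritative constraints live in the TGI launcher."""
    problems = []
    if max_input_length >= max_total_tokens:
        problems.append("MAX_INPUT_LENGTH must be strictly smaller than MAX_TOTAL_TOKENS")
    if max_batch_prefill_tokens < max_input_length:
        problems.append("MAX_BATCH_PREFILL_TOKENS should be at least MAX_INPUT_LENGTH")
    if max_batch_total_tokens is not None:
        if max_batch_total_tokens < max_batch_prefill_tokens:
            problems.append("MAX_BATCH_TOTAL_TOKENS should be at least MAX_BATCH_PREFILL_TOKENS")
        if max_batch_total_tokens < max_total_tokens:
            problems.append("MAX_BATCH_TOTAL_TOKENS should be at least MAX_TOTAL_TOKENS")
    return problems

# UI defaults above (1024 / 1512 / 2048, batch total left unset) pass cleanly.
print(check_tgi_limits(1024, 1512, 2048))  # -> []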