chansung committed
Commit 16d0614 · 1 Parent(s): 976429f

Update app.py

Files changed (1)
  1. app.py +76 -75
app.py CHANGED
@@ -306,82 +306,83 @@ Name for your new endpoint""")
             )

         with gr.Column(elem_classes=["group-border"]):
-            with gr.Column():
-                gr.Markdown("""### Container Type
-
-                Text Generation Inference is an optimized container for text generation task""")
-                _ = gr.Textbox("Text Generation Inference", show_label=False, elem_classes=["no-label", "small-big"])
-
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown("""### Custom Cuda Kernels
-
-                    TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
-                    _ = gr.Dropdown(
-                        value="Enabled",
-                        choices=["Enabled", "Disabled"],
-                        interactive=True,
-                        show_label=False,
-                        elem_classes=["no-label", "small-big"]
-                    )
-
-                with gr.Column():
-                    gr.Markdown("""### Quantization
-
-                    Quantization can reduce the model size and improve latency, with little degradation in model accuracy.""")
-                    _ = gr.Dropdown(
-                        value="None",
-                        choices=["None", "Bitsandbytes", "GPTQ"],
-                        interactive=True,
-                        show_label=False,
-                        elem_classes=["no-label", "small-big"]
-                    )
-
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown("""### Max Input Length (per Query)
-
-                    Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
-                    _ = gr.Number(
-                        value=1024,
-                        interactive=True,
-                        show_label=False,
-                        elem_classes=["no-label", "small-big"]
-                    )
-
-                with gr.Column():
-                    gr.Markdown("""### Max Number of Tokens (per Query)
-
-                    The larger this value, the more memory each request will consume and the less effective batching can be.""")
-                    _ = gr.Number(
-                        value=1512,
-                        interactive=True,
-                        show_label=False,
-                        elem_classes=["no-label", "small-big"]
-                    )
-
-            with gr.Row():
+            with gr.Accordion("Serving Container", open=False):
                 with gr.Column():
-                    gr.Markdown("""### Max Batch Prefill Tokens
-
-                    Number of prefill tokens used during continuous batching. It can be useful to adjust this number since the prefill operation is memory-intensive and compute-bound.""")
-                    _ = gr.Number(
-                        value=2048,
-                        interactive=True,
-                        show_label=False,
-                        elem_classes=["no-label", "small-big"]
-                    )
-
-                with gr.Column():
-                    gr.Markdown("""### Max Batch Total Tokens
-
-                    Number of tokens that can be passed before forcing waiting queries to be put on the batch. A value of 1000 can fit 10 queries of 100 tokens or a single query of 1000 tokens.""")
-                    _ = gr.Number(
-                        value=None,
-                        interactive=True,
-                        show_label=False,
-                        elem_classes=["no-label", "small-big"]
-                    )
+                    gr.Markdown("""### Container Type
+
+                    Text Generation Inference is an optimized container for text generation task""")
+                    _ = gr.Textbox("Text Generation Inference", show_label=False, elem_classes=["no-label", "small-big"])
+
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("""### Custom Cuda Kernels
+
+                        TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
+                        _ = gr.Dropdown(
+                            value="Enabled",
+                            choices=["Enabled", "Disabled"],
+                            interactive=True,
+                            show_label=False,
+                            elem_classes=["no-label", "small-big"]
+                        )
+
+                    with gr.Column():
+                        gr.Markdown("""### Quantization
+
+                        Quantization can reduce the model size and improve latency, with little degradation in model accuracy.""")
+                        _ = gr.Dropdown(
+                            value="None",
+                            choices=["None", "Bitsandbytes", "GPTQ"],
+                            interactive=True,
+                            show_label=False,
+                            elem_classes=["no-label", "small-big"]
+                        )
+
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("""### Max Input Length (per Query)
+
+                        Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
+                        _ = gr.Number(
+                            value=1024,
+                            interactive=True,
+                            show_label=False,
+                            elem_classes=["no-label", "small-big"]
+                        )
+
+                    with gr.Column():
+                        gr.Markdown("""### Max Number of Tokens (per Query)
+
+                        The larger this value, the more memory each request will consume and the less effective batching can be.""")
+                        _ = gr.Number(
+                            value=1512,
+                            interactive=True,
+                            show_label=False,
+                            elem_classes=["no-label", "small-big"]
+                        )
+
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("""### Max Batch Prefill Tokens
+
+                        Number of prefill tokens used during continuous batching. It can be useful to adjust this number since the prefill operation is memory-intensive and compute-bound.""")
+                        _ = gr.Number(
+                            value=2048,
+                            interactive=True,
+                            show_label=False,
+                            elem_classes=["no-label", "small-big"]
+                        )
+
+                    with gr.Column():
+                        gr.Markdown("""### Max Batch Total Tokens
+
+                        Number of tokens that can be passed before forcing waiting queries to be put on the batch. A value of 1000 can fit 10 queries of 100 tokens or a single query of 1000 tokens.""")
+                        _ = gr.Number(
+                            value=None,
+                            interactive=True,
+                            show_label=False,
+                            elem_classes=["no-label", "small-big"]
+                        )

         submit_button = gr.Button(
             value="Submit",