chansung's picture
Update app.py
c360670
raw
history blame
16.7 kB
import time
import json
import requests
import gradio as gr
# Custom CSS handed to the Gradio app through gr.Blocks(css=STYLE). Components
# opt into these rules by listing the class names in their `elem_classes`.
STYLE = """
.no-border {
border: none !important;
}
.group-border {
padding: 10px;
border-width: 1px;
border-radius: 10px;
border-color: gray;
border-style: solid;
box-shadow: 1px 1px 3px;
}
.control-label-font {
font-size: 13pt !important;
}
.control-button {
background: none !important;
border-color: #69ade2 !important;
border-width: 2px !important;
color: #69ade2 !important;
}
.center {
text-align: center;
}
.right {
text-align: right;
}
.no-label {
padding: 0px !important;
}
.no-label > label > span {
display: none;
}
.small-big {
font-size: 12pt !important;
}
"""
def avaliable_providers():
    """Fetch the cloud vendors, regions and compute options currently on offer.

    Queries the Hugging Face Inference Endpoints provider catalogue and keeps
    only the vendors and regions whose ``status`` is ``"available"``.

    Returns:
        dict: ``{vendor_name: {region_name: {"label": ..., "computes": ...}}}``.
        Vendors without a single available region are left out.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        "Content-Type": "application/json",
    }
    endpoint_url = "https://api.endpoints.huggingface.cloud/v2/provider"

    # This runs at import time; a timeout keeps a stalled connection from
    # hanging start-up, and raise_for_status() surfaces HTTP errors directly
    # instead of as a confusing KeyError on the response body.
    response = requests.get(endpoint_url, headers=headers, timeout=30)
    response.raise_for_status()

    providers = {}
    for provider in response.json()['vendors']:
        if provider['status'] != 'available':
            continue

        regions = {
            region['name']: {
                "label": region['label'],
                "computes": region['computes']
            }
            for region in provider['regions']
            if region["status"] == "available"
        }

        # Only list a vendor when it has somewhere to deploy to.
        if regions:
            providers[provider['name']] = regions

    return providers
# Provider catalogue, fetched once over the network when the module is loaded.
# update_regions() and update_compute_options() read this mapping.
providers = avaliable_providers()
def update_regions(provider):
    """Build the region dropdown update for the chosen cloud provider.

    Each choice is rendered as ``"<region name>[<region label>]"`` from the
    module-level ``providers`` mapping. The first choice is pre-selected, or
    ``None`` when the provider has no regions.
    """
    region_choices = [
        f"{name}[{info['label']}]"
        for name, info in providers[provider].items()
    ]
    preselected = region_choices[0] if region_choices else None
    return gr.Dropdown.update(choices=region_choices, value=preselected)
def update_compute_options(provider, region):
    """Build the compute-instance dropdown update for a provider/region pair.

    Args:
        provider: vendor name, a key of the module-level ``providers`` mapping.
        region: region dropdown value in the form ``"<name>[<label>]"``.

    Returns:
        A ``gr.Dropdown.update`` whose choices describe every compute option
        with status ``"available"``. Each choice has the form
        ``"<ACCEL> [<size>] 路 <description> 路 <instance type> 路 $<price>/hour"``.
        The first choice is pre-selected; when nothing is available (including
        an empty region or unknown provider) the choices are empty and the
        value is ``None``.
    """
    # Only the part before "[" is the region key; the rest is the display label.
    region_name = region.split("[")[0].strip() if region else ""
    computes = providers.get(provider, {}).get(region_name, {}).get("computes", [])

    compute_options = []
    for compute in computes:
        if compute['status'] != 'available':
            continue

        accelerator = compute['accelerator']
        numAccelerators = compute['numAccelerators']
        memoryGb = compute['memoryGb']
        architecture = compute['architecture']
        instanceType = compute['instanceType']
        pricePerHour = compute['pricePerHour']

        description = (
            f"{numAccelerators}vCPU {memoryGb}{architecture}"
            if accelerator == "cpu"
            else f"{numAccelerators}x {architecture}"
        )

        # The instance type must be its own "路"-separated field: submit() sends
        # the second-to-last field verbatim as the API's instanceType.
        compute_options.append(
            f"{accelerator.upper()} [{compute['instanceSize']}] 路 {description} 路 {instanceType} 路 ${pricePerHour}/hour"
        )

    return gr.Dropdown.update(
        choices=compute_options,
        value=compute_options[0] if compute_options else None
    )
def submit(
    hf_account_input,
    hf_token_input,
    endpoint_name_input,
    provider_selector,
    region_selector,
    repository_selector,
    task_selector,
    framework_selector,
    compute_selector,
    min_node_selector,
    max_node_selector,
    security_selector,
    custom_kernel,
    max_input_length,
    max_tokens,
    max_batch_prefill_token,
    max_batch_total_token
):
    """Create an Inference Endpoint from the values collected in the form.

    Builds the JSON payload for the Hugging Face Inference Endpoints API and
    POSTs it to the account's endpoint collection.

    Args:
        hf_account_input: account name; used in the URL and as ``accountId``.
        hf_token_input: access token, sent as a Bearer token.
        endpoint_name_input: name of the endpoint to create.
        provider_selector: cloud vendor name (lower-cased before sending).
        region_selector: region dropdown value, ``"<name>[<label>]"``.
        repository_selector: model repository id (lower-cased before sending).
        task_selector: task name (lower-cased before sending).
        framework_selector: framework name (lower-cased before sending).
        compute_selector: compute dropdown value; fields are separated by "路",
            the first is ``"<ACCEL> [<size>]"`` and the second-to-last is sent
            as the instance type.
        min_node_selector: minimum replica count.
        max_node_selector: maximum replica count.
        security_selector: endpoint type (lower-cased before sending).
        custom_kernel: ``"Enabled"`` or ``"Disabled"``.
        max_input_length, max_tokens, max_batch_prefill_token,
        max_batch_total_token: numeric container limits; a ``None`` value is
            left out of the container environment.

    Returns:
        str: a human-readable status message (success, validation problem,
        or a description of the API/network error).
    """
    # Validate the selections up front so an empty dropdown produces a message
    # in the status box instead of an exception.
    if not provider_selector or not region_selector:
        return "Please select a cloud provider and region before submitting."

    compute_resources = compute_selector.split("路") if compute_selector else []
    if (
        len(compute_resources) < 2
        or "[" not in compute_resources[0]
        or "]" not in compute_resources[0]
    ):
        return "Please select a compute instance type before submitting."

    head = compute_resources[0]
    accelerator = head[:3].strip()
    size = head[head.index("[") + 1 : head.index("]")].strip()
    instance_type = compute_resources[-2].strip()

    # "Enabled" means the custom kernels stay on, i.e. they are NOT disabled.
    env = {
        "DISABLE_CUSTOM_KERNELS": "false" if custom_kernel == "Enabled" else "true",
    }
    token_limits = {
        "MAX_BATCH_PREFILL_TOKENS": max_batch_prefill_token,
        "MAX_BATCH_TOTAL_TOKENS": max_batch_total_token,
        "MAX_INPUT_LENGTH": max_input_length,
        "MAX_TOTAL_TOKENS": max_tokens,
    }
    for limit_name, limit_value in token_limits.items():
        # An empty number field arrives as None; omit it rather than sending
        # the literal string "None". Whole numbers are sent without a ".0".
        if limit_value is not None:
            env[limit_name] = str(int(limit_value))
    env["MODEL_ID"] = repository_selector.lower()
    # QUANTIZE: 'bitsandbytes' | 'gptq';

    # The dropdown value is "<name>[<label>]"; only the name goes to the API.
    region = region_selector.split("[")[0].split("/")[0].strip().lower()

    payload = {
        "accountId": hf_account_input.strip(),
        "compute": {
            "accelerator": accelerator.lower(),
            "instanceSize": size,
            "instanceType": instance_type,
            "scaling": {
                "maxReplica": int(max_node_selector),
                "minReplica": int(min_node_selector)
            }
        },
        "model": {
            "framework": framework_selector.lower(),
            "image": {
                "custom": {
                    "health_route": "/health",
                    "env": env,
                    "url": "ghcr.io/huggingface/text-generation-inference:1.0.1",
                }
            },
            "repository": repository_selector.lower(),
            "revision": "main",
            "task": task_selector.lower()
        },
        "name": endpoint_name_input.strip(),
        "provider": {
            "region": region,
            "vendor": provider_selector.lower()
        },
        "type": security_selector.lower()
    }
    print(payload)

    payload = json.dumps(payload)
    print(payload)

    headers = {
        "Authorization": f"Bearer {hf_token_input.strip()}",
        "Content-Type": "application/json",
    }
    endpoint_url = f"https://api.endpoints.huggingface.cloud/v2/endpoint/{hf_account_input.strip()}"
    print(endpoint_url)

    try:
        response = requests.post(endpoint_url, headers=headers, data=payload, timeout=30)
    except requests.RequestException as err:
        # Report connection problems and timeouts in the status box.
        return f"something went wrong while contacting {endpoint_url}: {err}"

    if response.status_code == 400:
        return f"{response.text}. Malformed data in {payload}"
    elif response.status_code == 401:
        return "Invalid token"
    elif response.status_code == 409:
        return f"Endpoint {endpoint_name_input} already exists"
    elif response.status_code == 202:
        return f"Endpoint {endpoint_name_input} created successfully on {provider_selector.lower()} using {repository_selector.lower()}@main.\nPlease check out the progress at https://ui.endpoints.huggingface.co/endpoints."
    else:
        return f"something went wrong {response.status_code} = {response.text}"
# ---------------------------------------------------------------------------
# UI definition: a tabbed Gradio Blocks app. Only the "Hugging Face" tab holds
# controls; the AWS / GCP / Azure / Lambdalabs tabs contain just a heading.
# NOTE(review): this copy of the file has lost its indentation, so the nesting
# of the `with` layout containers below cannot be read off the text — restore
# it from the original before running.
# ---------------------------------------------------------------------------
with gr.Blocks(css=STYLE) as hf_endpoint:
with gr.Tab("Hugging Face", elem_classes=["no-border"]):
gr.Markdown("# Deploy LLM on 馃 Hugging Face Inference Endpoint", elem_classes=["center"])
# --- Credentials and target model ---
with gr.Column(elem_classes=["group-border"]):
with gr.Row():
with gr.Column():
gr.Markdown("""### Hugging Face account ID (name)""")
hf_account_input = gr.Textbox(show_label=False, elem_classes=["no-label", "small-big"])
with gr.Column():
gr.Markdown("### Hugging Face access token")
# type="password" masks the token in the browser.
hf_token_input = gr.Textbox(show_label=False, type="password", elem_classes=["no-label", "small-big"])
with gr.Row():
with gr.Column():
gr.Markdown("""### Target model
Model from the Hugging Face hub""")
# Read-only: the model repository is fixed to this value.
repository_selector = gr.Textbox(
value="NousResearch/Nous-Hermes-Llama2-13b",
interactive=False,
show_label=False,
elem_classes=["no-label", "small-big"]
)
with gr.Column():
gr.Markdown("""### Target model version(branch)
Branch name of the Model""")
# Read-only display. NOTE(review): this component is not among the inputs
# passed to submit(), which sends the revision "main" on its own.
revision_selector = gr.Textbox(
value=f"main",
interactive=False,
show_label=False,
elem_classes=["no-label", "small-big"]
)
# --- Endpoint name, placement, compute and scaling ---
with gr.Column(elem_classes=["group-border"]):
with gr.Column():
gr.Markdown("""### Endpoint name
Name for your new endpoint""")
endpoint_name_input = gr.Textbox(show_label=False, elem_classes=["no-label", "small-big"])
with gr.Row():
with gr.Column():
gr.Markdown("""### Cloud Provider""")
# Choices are the vendor names of the catalogue fetched at import time.
provider_selector = gr.Dropdown(
choices=providers.keys(),
interactive=True,
show_label=False,
elem_classes=["no-label", "small-big"]
)
with gr.Column():
gr.Markdown("""### Cloud Region""")
# Starts empty; filled by update_regions when the provider changes.
region_selector = gr.Dropdown(
[],
value="",
interactive=True,
show_label=False,
elem_classes=["no-label", "small-big"]
)
# Hidden row (visible=False): task and framework are fixed values that are
# still passed to submit().
with gr.Row(visible=False):
with gr.Column():
gr.Markdown("### Task")
task_selector = gr.Textbox(
value="text-generation",
interactive=False,
show_label=False,
elem_classes=["no-label", "small-big"]
)
with gr.Column():
gr.Markdown("### Framework")
framework_selector = gr.Textbox(
value="PyTorch",
interactive=False,
show_label=False,
elem_classes=["no-label", "small-big"]
)
with gr.Column():
gr.Markdown("""### Compute Instance Type""")
# Starts empty; filled by update_compute_options when the region changes.
compute_selector = gr.Dropdown(
[],
value="",
interactive=True,
show_label=False,
elem_classes=["no-label", "small-big"]
)
with gr.Row():
with gr.Column():
gr.Markdown("""### Min Number of Nodes""")
min_node_selector = gr.Number(
value=1,
interactive=True,
show_label=False,
elem_classes=["no-label", "small-big"]
)
with gr.Column():
gr.Markdown("""### Max Number of Nodes""")
max_node_selector = gr.Number(
value=1,
interactive=True,
show_label=False,
elem_classes=["no-label", "small-big"]
)
with gr.Column():
gr.Markdown("""### Security Level""")
# submit() lower-cases the selected value and sends it as the endpoint "type".
security_selector = gr.Radio(
choices=["Protected", "Public", "Private"],
value="Public",
interactive=True,
show_label=False,
elem_classes=["no-label", "small-big"]
)
# --- Serving container settings (collapsed accordion) ---
with gr.Column(elem_classes=["group-border"]):
with gr.Accordion("Serving Container", open=False, elem_classes=["no-border"]):
with gr.Column():
gr.Markdown("""### Container Type
Text Generation Inference is an optimized container for text generation task""")
# Bound to `_`: shown for information, not passed to submit().
_ = gr.Textbox("Text Generation Inference", show_label=False, elem_classes=["no-label", "small-big"])
with gr.Row():
with gr.Column():
gr.Markdown("""### Custom Cuda Kernels
TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
custom_kernel = gr.Dropdown(
value="Enabled",
choices=["Enabled", "Disabled"],
interactive=True,
show_label=False,
elem_classes=["no-label", "small-big"]
)
with gr.Column():
gr.Markdown("""### Quantization
Quantization can reduce the model size and improve latency, with little degradation in model accuracy.""")
# Bound to `_`: the quantization choice is not passed to submit().
_ = gr.Dropdown(
value="None",
choices=["None", "Bitsandbytes", "GPTQ"],
interactive=True,
show_label=False,
elem_classes=["no-label", "small-big"]
)
with gr.Row():
with gr.Column():
gr.Markdown("""### Max Input Length (per Query)
Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
max_input_length = gr.Number(
value=1024,
interactive=True,
show_label=False,
elem_classes=["no-label", "small-big"]
)
with gr.Column():
gr.Markdown("""### Max Number of Tokens (per Query)
The larger this value, the more memory each request will consume and the less effective batching can be.""")
max_tokens = gr.Number(
value=1512,
interactive=True,
show_label=False,
elem_classes=["no-label", "small-big"]
)
with gr.Row():
with gr.Column():
gr.Markdown("""### Max Batch Prefill Tokens
Number of prefill tokens used during continuous batching. It can be useful to adjust this number since the prefill operation is memory-intensive and compute-bound.""")
max_batch_prefill_token = gr.Number(
value=2048,
interactive=True,
show_label=False,
elem_classes=["no-label", "small-big"]
)
with gr.Column():
gr.Markdown("""### Max Batch Total Tokens
Number of tokens that can be passed before forcing waiting queries to be put on the batch. A value of 1000 can fit 10 queries of 100 tokens or a single query of 1000 tokens.""")
# No default value: the field starts out empty (value=None).
max_batch_total_token = gr.Number(
value=None,
interactive=True,
show_label=False,
elem_classes=["no-label", "small-big"]
)
# --- Submit button and status output ---
submit_button = gr.Button(
value="Submit",
elem_classes=["control-label-font", "control-button"]
)
# Read-only textbox that receives the string returned by submit().
status_txt = gr.Textbox(
value="any status update will be displayed here",
interactive=False,
elem_classes=["no-label"]
)
# --- Event wiring ---
# Provider change -> refresh regions; region change -> refresh compute options.
provider_selector.change(update_regions, inputs=provider_selector, outputs=region_selector)
region_selector.change(update_compute_options, inputs=[provider_selector, region_selector], outputs=compute_selector)
# The inputs list must stay in the same order as submit()'s parameters.
submit_button.click(
submit,
inputs=[
hf_account_input,
hf_token_input,
endpoint_name_input,
provider_selector,
region_selector,
repository_selector,
task_selector,
framework_selector,
compute_selector,
min_node_selector,
max_node_selector,
security_selector,
custom_kernel,
max_input_length,
max_tokens,
max_batch_prefill_token,
max_batch_total_token],
outputs=status_txt)
# --- Placeholder tabs: heading only, no controls ---
with gr.Tab("AWS", elem_classes=["no-border"]):
gr.Markdown("# Deploy LLM on 馃 Hugging Face Inference Endpoint", elem_classes=["center"])
with gr.Tab("GCP", elem_classes=["no-border"]):
gr.Markdown("# Deploy LLM on 馃 Hugging Face Inference Endpoint", elem_classes=["center"])
with gr.Tab("Azure", elem_classes=["no-border"]):
gr.Markdown("# Deploy LLM on 馃 Hugging Face Inference Endpoint", elem_classes=["center"])
with gr.Tab("Lambdalabs", elem_classes=["no-border"]):
gr.Markdown("# Deploy LLM on 馃 Hugging Face Inference Endpoint", elem_classes=["center"])
# Start the web server.
# NOTE(review): whether launch() accepts `enable_queue` (and whether
# gr.Dropdown.update exists) depends on the installed Gradio version — confirm.
hf_endpoint.launch(enable_queue=True, debug=True)