Spaces:
Running
Running
baptistecolle
HF staff
Add torchao int4 weight only quantization as an option (#34)
8766911
verified
from typing import List | |
import gradio as gr | |
from src.leaderboard import get_leaderboard_df | |
from src.llm_perf import get_llm_perf_df | |
# from attention_implementations import get_attn_decode_fig, get_attn_prefill_fig | |
# from custom_kernels import get_kernel_decode_fig, get_kernel_prefill_fig | |
from src.map import get_lat_score_mem_fig | |
def create_control_panel(
    machine: str,
    subsets: List[str],
    backends: List[str],
    hardware_provider: str,
    hardware_type: str,
):
    """Build the collapsible control panel holding every filter widget.

    The four fixed settings are wrapped in ``gr.State`` holders. The choices
    offered by the backend, attention, quantization and kernel checkbox groups
    are picked from ``hardware_provider`` alone.

    Args:
        machine: Stored unchanged in a ``gr.State``.
        subsets: Stored unchanged in a ``gr.State``.
        backends: Stored unchanged in a ``gr.State``. It does not drive the
            backend checkbox choices; those come from ``hardware_provider``.
        hardware_provider: Either ``"nvidia"`` or ``"intel"``.
        hardware_type: Stored unchanged in a ``gr.State``.

    Returns:
        A 12-tuple, in this order: filter button, machine state, backends
        state, hardware-type state, subsets state, score slider, memory
        slider, then the backend, datatype, optimization, quantization and
        kernel checkbox groups.

    Raises:
        ValueError: If ``hardware_provider`` is not one of the two known values.

    NOTE(review): the emoji in the label/info strings look mis-encoded in this
    copy of the file; they are reproduced verbatim here.
    """

    def panel_column(scale):
        # Every widget sits in a "panel"-styled column; only the scale varies.
        return gr.Column(scale=scale, variant="panel")

    # Fixed values, carried through the UI as state.
    machine_value = gr.State(value=machine)
    subsets_value = gr.State(value=subsets)
    backends_value = gr.State(value=backends)
    hardware_type_value = gr.State(value=hardware_type)

    # Provider-specific options offered by the checkbox groups.
    if hardware_provider == "nvidia":
        backend_choices = ["pytorch"]
        attention_choices = ["Eager", "SDPA", "FAv2"]
        quantization_choices = [
            "Unquantized",
            "BnB.4bit",
            "BnB.8bit",
            "AWQ.4bit",
            "GPTQ.4bit",
            "torchao.4bit",
        ]
        kernel_choices = [
            "No Kernel",
            "GPTQ.ExllamaV1",
            "GPTQ.ExllamaV2",
            "AWQ.GEMM",
            "AWQ.GEMV",
        ]
    elif hardware_provider == "intel":
        backend_choices = ["pytorch", "onnxruntime", "openvino"]
        attention_choices = ["Eager"]
        quantization_choices = ["Unquantized"]
        kernel_choices = ["No Kernel"]
    else:
        raise ValueError(f"Unknown hardware provider: {hardware_provider}")

    precision_choices = ["float32", "float16", "bfloat16"]
    memory_ceiling = 80 * 1024  # upper bound and initial value of the memory slider

    with gr.Accordion("Control Panel ποΈ", open=False, elem_id="control-panel"):
        # Row 1: score threshold, memory ceiling, backends.
        with gr.Row():
            with panel_column(2):
                score_slider = gr.Slider(
                    label="Open LLM Score (%) π",
                    info="ποΈ Slide to minimum Open LLM score",
                    value=0,
                    elem_id="threshold-slider",
                )
            with panel_column(2):
                memory_slider = gr.Slider(
                    label="Peak Memory (MB) π",
                    info="ποΈ Slide to maximum Peak Memory",
                    minimum=0,
                    maximum=memory_ceiling,
                    value=memory_ceiling,
                    elem_id="memory-slider",
                )
            with panel_column(1):
                backend_checkboxes = gr.CheckboxGroup(
                    label="Backends π",
                    choices=backend_choices,
                    value=backend_choices,
                    info="βοΈ Select the backends",
                    elem_id="backend-checkboxes",
                )

        # Row 2: precision and attention implementation.
        with gr.Row():
            with panel_column(1):
                datatype_checkboxes = gr.CheckboxGroup(
                    label="Precision π₯",
                    choices=precision_choices,
                    value=list(precision_choices),
                    info="βοΈ Select the load data types",
                    elem_id="dtype-checkboxes",
                )
            with panel_column(1):
                optimization_checkboxes = gr.CheckboxGroup(
                    label="Attentions ποΈ",
                    choices=attention_choices,
                    value=attention_choices,
                    info="βοΈ Select the optimization",
                    elem_id="optimization-checkboxes",
                )

        # Row 3: quantization scheme and custom kernel.
        with gr.Row():
            with panel_column(1):
                quantization_checkboxes = gr.CheckboxGroup(
                    label="Quantizations ποΈ",
                    choices=quantization_choices,
                    value=quantization_choices,
                    info="βοΈ Select the quantization schemes",
                    elem_id="quantization-checkboxes",
                    elem_classes="boxed-option",
                )
            with panel_column(1):
                kernels_checkboxes = gr.CheckboxGroup(
                    label="Kernels βοΈ",
                    choices=kernel_choices,
                    value=kernel_choices,
                    info="βοΈ Select the custom kernels",
                    elem_id="kernel-checkboxes",
                    elem_classes="boxed-option",
                )

        # Row 4: the button that applies the filters.
        with gr.Row():
            filter_button = gr.Button(
                value="Filter π",
                elem_id="filter-button",
                elem_classes="boxed-option",
            )

    return (
        filter_button,
        machine_value,
        backends_value,
        hardware_type_value,
        subsets_value,
        score_slider,
        memory_slider,
        backend_checkboxes,
        datatype_checkboxes,
        optimization_checkboxes,
        quantization_checkboxes,
        kernels_checkboxes,
    )
def filter_rows_fn(
    machine,
    subsets,
    backends,
    hardware_type,
    # inputs
    score,
    memory,
    backend_checkboxes,
    precisions,
    attentions,
    quantizations,
    kernels,
    # interactive
    columns,
    search,
):
    """Filter the LLM-perf rows by every control-panel setting.

    Loads the data with ``get_llm_perf_df`` and keeps the rows whose model name
    matches ``search`` (case-insensitive), whose backend / precision /
    attention / quantization / kernel values are among the selected ones,
    whose Open LLM score is at least ``score`` and whose memory is at most
    ``memory``.

    Args:
        machine, subsets, backends, hardware_type: Forwarded to
            ``get_llm_perf_df`` (and on to ``select_columns_fn``).
        score: Minimum accepted "Open LLM Score (%)".
        memory: Maximum accepted "Memory (MB)".
        backend_checkboxes, precisions, attentions, quantizations, kernels:
            Accepted values for the corresponding columns.
        columns: Columns to keep in the returned table.
        search: Text from the search bar. It is used as a regular expression
            when it is a valid one and as a literal substring otherwise;
            ``None`` is treated as the empty string.

    Returns:
        A two-element list: the column-selected leaderboard table and the
        figure built by ``get_lat_score_mem_fig`` from the filtered rows.

    NOTE(review): the emoji in the column keys look mis-encoded in this copy of
    the file; they are kept verbatim and must match the columns produced by
    ``get_llm_perf_df`` -- confirm against that module.
    """
    import re  # function-scope import: only needed to validate the search text

    llm_perf_df = get_llm_perf_df(
        machine=machine, subsets=subsets, backends=backends, hardware_type=hardware_type
    )

    # The search box is free text. ``str.contains`` interprets it as a regex,
    # so text such as "(" would raise; fall back to a literal match instead.
    pattern = "" if search is None else search
    try:
        re.compile(pattern)
    except re.error:
        pattern = re.escape(pattern)

    # ``na=False`` keeps the mask strictly boolean if a model name is missing.
    name_matches = llm_perf_df["Model π€"].str.contains(
        pattern, case=False, na=False
    )
    option_matches = (
        llm_perf_df["Backend π"].isin(backend_checkboxes)
        & llm_perf_df["Precision π₯"].isin(precisions)
        & llm_perf_df["Attention ποΈ"].isin(attentions)
        & llm_perf_df["Quantization ποΈ"].isin(quantizations)
        & llm_perf_df["Kernel βοΈ"].isin(kernels)
    )
    within_thresholds = (llm_perf_df["Open LLM Score (%)"] >= score) & (
        llm_perf_df["Memory (MB)"] <= memory
    )
    filtered_llm_perf_df = llm_perf_df[
        name_matches & option_matches & within_thresholds
    ]

    # Pass the sanitised pattern on so the second name filter applied by
    # select_columns_fn cannot fail on the same text.
    selected_filtered_llm_perf_df = select_columns_fn(
        machine, subsets, backends, hardware_type, columns, pattern, filtered_llm_perf_df
    )
    selected_filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_llm_perf_df)

    # The bt / fa2 / quant prefill and decode figures are currently disabled;
    # only the table and the latency/score/memory figure are returned.
    return [
        selected_filtered_llm_perf_df,
        selected_filtered_lat_score_mem_fig,
    ]
def create_control_callback(
    # button
    filter_button,
    # fixed
    machine_value,
    subsets_value,
    backends_value,
    hardware_type_value,
    # inputs
    score_slider,
    memory_slider,
    backend_checkboxes,
    datatype_checkboxes,
    optimization_checkboxes,
    quantization_checkboxes,
    kernels_checkboxes,
    # interactive
    columns_checkboxes,
    search_bar,
    # outputs
    leaderboard_table,
    lat_score_mem_plot,
):
    """Attach ``filter_rows_fn`` to the filter button's click event.

    On click, the fixed states, the filter widgets, the column selection and
    the search bar are passed to ``filter_rows_fn`` in its positional
    parameter order. Its two results are written to ``leaderboard_table`` and
    ``lat_score_mem_plot``.

    The attn / fa2 / quant prefill and decode plots are currently not wired.
    """
    fixed_components = [
        machine_value,
        subsets_value,
        backends_value,
        hardware_type_value,
    ]
    filter_components = [
        score_slider,
        memory_slider,
        backend_checkboxes,
        datatype_checkboxes,
        optimization_checkboxes,
        quantization_checkboxes,
        kernels_checkboxes,
    ]
    interactive_components = [columns_checkboxes, search_bar]

    callback_inputs = fixed_components + filter_components + interactive_components
    callback_outputs = [leaderboard_table, lat_score_mem_plot]

    filter_button.click(
        fn=filter_rows_fn,
        inputs=callback_inputs,
        outputs=callback_outputs,
    )
def select_columns_fn(
    machine, subsets, backends, hardware_type, columns, search, llm_perf_df=None
):
    """Return the leaderboard table restricted to matching models and columns.

    Args:
        machine, subsets, backends, hardware_type: Forwarded to
            ``get_llm_perf_df`` when ``llm_perf_df`` is not supplied.
        columns: Columns to keep in the returned table.
        search: Text from the search bar, matched case-insensitively against
            the model column. It is used as a regular expression when it is a
            valid one and as a literal substring otherwise; ``None`` is
            treated as the empty string.
        llm_perf_df: Optional, already loaded (and possibly filtered) data.
            When ``None`` the data is loaded with ``get_llm_perf_df``.

    Returns:
        The result of ``get_leaderboard_df`` reduced to the rows whose model
        name matches ``search`` and to the requested ``columns``.

    NOTE(review): the emoji in the "Model" column key looks mis-encoded in this
    copy of the file; it is kept verbatim and must match the column produced
    by ``get_leaderboard_df`` -- confirm against that module.
    """
    import re  # function-scope import: only needed to validate the search text

    if llm_perf_df is None:
        llm_perf_df = get_llm_perf_df(
            machine=machine,
            subsets=subsets,
            backends=backends,
            hardware_type=hardware_type,
        )

    # The search box is free text. ``str.contains`` interprets it as a regex,
    # so text such as "(" would raise; fall back to a literal match instead.
    pattern = "" if search is None else search
    try:
        re.compile(pattern)
    except re.error:
        pattern = re.escape(pattern)

    selected_leaderboard_df = get_leaderboard_df(llm_perf_df)
    # ``na=False`` keeps the mask strictly boolean if a model name is missing.
    name_matches = selected_leaderboard_df["Model π€"].str.contains(
        pattern, case=False, na=False
    )
    selected_leaderboard_df = selected_leaderboard_df[name_matches]
    return selected_leaderboard_df[columns]
def create_select_callback(
    # fixed
    machine_value,
    subsets_value,
    backends_value,
    hardware_type_value,
    # interactive
    columns_checkboxes,
    search_bar,
    # outputs
    leaderboard_table,
):
    """Refresh the leaderboard table when the columns or the search text change.

    The ``change`` event of both ``columns_checkboxes`` and ``search_bar`` is
    bound to ``select_columns_fn`` with the same inputs, and both write the
    result to ``leaderboard_table``.
    """
    shared_inputs = [
        machine_value,
        subsets_value,
        backends_value,
        hardware_type_value,
        columns_checkboxes,
        search_bar,
    ]

    # Registration order is kept: column selection first, then the search bar.
    for trigger in (columns_checkboxes, search_bar):
        trigger.change(
            fn=select_columns_fn,
            inputs=list(shared_inputs),  # fresh list per registration
            outputs=[leaderboard_table],
        )