|
import os
import re
import webbrowser

import pandas as pd
import gradio as gr
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
from accelerate.commands.estimate import create_empty_model, check_has_model
from accelerate.utils import convert_bytes, calculate_maximum_sizes

HAS_DISCUSSION = True
MODEL_NAME = None
LIBRARY = None
USER_TOKEN = None
TOKEN = os.environ.get("HUGGINGFACE_API_LOGIN", None)


def check_for_discussion(model_name: str):
    "Checks if an automated discussion has been opened on the model by `model-sizer-bot`"
    global TOKEN
    api = HfApi(token=TOKEN)
    discussions = list(api.get_repo_discussions(model_name))
    return any(
        discussion.title == "[AUTOMATED] Model Memory Requirements" and discussion.author == "model-sizer-bot"
        for discussion in discussions
    )
|
|
|
def report_results():
    "Reports the results of a memory calculation to the model's discussion page, and opens a new tab to it afterwards"
    global MODEL_NAME, LIBRARY, TOKEN, USER_TOKEN
    api = HfApi(token=TOKEN)
    # calculate_memory also requires an optimizer; use the UI default. Result rows come back as
    # dicts keyed by the table columns, with the fp32 row first.
    results, data = calculate_memory(
        MODEL_NAME,
        LIBRARY,
        ["fp32", "fp16", "int8", "int4"],
        optimizer="adamw_hf",
        access_token=USER_TOKEN,
        raw=True,
    )
    minimum = data[0]

    USER_TOKEN = None
    post = f"""# Model Memory Requirements\n
You will need about {minimum['inference_total']} VRAM to load this model for inference, and {minimum['training_total']} VRAM to train it using Adam.

These calculations were measured from the [Model Memory Utility Space](https://hf.co/spaces/hf-accelerate/model-memory-utility) on the Hub.

The minimum recommended VRAM needed for this model assumes using [Accelerate or `device_map="auto"`](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) and is denoted by the size of the "largest layer".
When performing inference, expect to add up to an additional 20% to this, as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/). More tests will be performed in the future to get a more accurate benchmark for each model.

When training with `Adam`, you can expect roughly 4x the reported results to be used (1x for the model, 1x for the gradients, and 2x for the optimizer).

## Results:

{results}
"""
    discussion = api.create_discussion(MODEL_NAME, "[AUTOMATED] Model Memory Requirements", description=post)
    webbrowser.open_new_tab(discussion.url)
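
# Quick sanity check of the "roughly 4x" rule quoted in the post above (a sketch, assuming a
# hypothetical 7B-parameter model trained in fp32): weights 7e9 * 4 bytes = 28 GB, gradients
# another 28 GB, and Adam's two fp32 states 56 GB, i.e. about 112 GB (~4x the weights) before
# activations and buffers are counted.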
|
|
|
def convert_url_to_name(url: str):
    "Converts a model URL to its name on the Hub"
    # The pattern captures everything between "huggingface.co/" and the first "#", so URLs
    # without a "#" fragment raise the ValueError below.
    results = re.findall(r"huggingface.co\/(.*?)#", url)
    if len(results) < 1:
        raise ValueError(f"URL {url} is not a valid model URL to the Hugging Face Hub")
    return results[0]


def calc_vram_f32(model, optimizer, sequence_len, micro_batch_size, device_count, gradient_checkpointing):
    "Estimates fp32 VRAM usage (in bytes) for the model weights, buffers, optimizer state and activations"
    # Model weights: every parameter stored in fp32 takes 4 bytes.
    param_element_size = 4
    vram_model = sum([param.nelement() * param_element_size for param in model.parameters()])

    # Persistent buffers (anything registered via register_buffer, e.g. norm statistics), in their native dtype.
    vram_buffer = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Optimizer state: the estimate uses 4 bytes/param for SGD (one fp32 slot), 2 bytes/param for
    # 8-bit optimizers (two quantized slots), and 8 bytes/param for full-precision Adam/Lion-style
    # optimizers (two fp32 slots).
    optimizer_state_size_per_param = 4 if 'sgd' in optimizer else (2 if '8bit' in optimizer else 8)
    vram_optimizer = sum([param.nelement() * optimizer_state_size_per_param for param in model.parameters()])

    # Gradients in fp32 (4 bytes per parameter); computed for reference but not included in the totals below.
    gradient_element_size = 4
    vram_gradient = sum([param.nelement() * gradient_element_size for param in model.parameters()])

    # Activation estimate in terms of sequence length (s), micro batch size (b), hidden size (h),
    # layer count (L), tensor-parallel degree (t) and attention head count (a).
    s = sequence_len
    b = micro_batch_size
    h = model.config.hidden_size
    L = model.config.num_hidden_layers
    t = device_count
    a = model.config.num_attention_heads
    print(f's={s} b={b} h={h} L={L} t={t} a={a}')

    sbHL = s * b * h * L
    print(f'sbHL = {sbHL / 1e9} GB')

    print(f'10 + {24 / t} + {5 * a * s / (h * t)}')
    # With gradient checkpointing the attention-score term (5 * a * s / (h * t)) is dropped.
    vram_activation = sbHL * (10 + 24 / t) if gradient_checkpointing else sbHL * (10 + 24 / t + 5 * a * s / (h * t))

    return {
        'param_element_size': param_element_size,
        'total': vram_model + vram_buffer + vram_optimizer + vram_activation,
        'model': vram_model,
        'buffer': vram_buffer,
        'optimizer': vram_optimizer,
        'activation': vram_activation,
    }
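
# Rough sanity check of the activation estimate above (a sketch, assuming a LLaMA-7B-style
# config: h=4096, L=32, a=32, with the defaults s=2048, b=1, t=1 that calculate_memory passes in):
#   sbHL = 2048 * 1 * 4096 * 32 = 268,435,456
#   with gradient checkpointing:    sbHL * (10 + 24/1)                        ~  9.1 GB
#   without gradient checkpointing: sbHL * (10 + 24/1 + 5 * 32 * 2048 / 4096) ~ 30.6 GB
# The factors mirror the per-layer transformer activation estimate sbh * (10 + 24/t + 5*a*s/(h*t))
# popularized by the Megatron-LM activation-recomputation analysis.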
|
def bytes_by_dtype(num_bytes, dtype):
    "Scales an fp32 byte count down to the equivalent size for the requested precision"
    if dtype in ("fp16", "bf16", "float16/bfloat16"):
        return num_bytes / 2
    elif dtype == "int8":
        return num_bytes / 4
    elif dtype == "int4":
        return num_bytes / 8
    else:
        return num_bytes
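
# Example: starting from fp32 byte counts, bytes_by_dtype(4, "float16/bfloat16") -> 2.0,
# bytes_by_dtype(4, "int8") -> 1.0, bytes_by_dtype(4, "int4") -> 0.5, and any other dtype
# (e.g. "float32") is returned unchanged.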
|
|
|
def calculate_memory(model_name: str, library: str, dtypes: list, optimizer: str, access_token: str, raw=False):
    "Calculates the memory usage for a model"
    if library == "auto":
        library = None

    if "http" in model_name and "//" in model_name:
        try:
            model_name = convert_url_to_name(model_name)
        except ValueError:
            raise gr.Error(f"URL `{model_name}` is not a valid model URL to the Hugging Face Hub")

    try:
        model = create_empty_model(model_name, library_name=library, trust_remote_code=True, access_token=access_token)
    except GatedRepoError:
        raise gr.Error(
            f"Model `{model_name}` is a gated model, please ensure to pass in your access token and try again if you have access. You can find your access token here: https://huggingface.co/settings/tokens."
        )
    except RepositoryNotFoundError:
        raise gr.Error(f"Model `{model_name}` was not found on the Hub, please try another model name.")
    except ValueError:
        raise gr.Error(f"Model `{model_name}` does not have any library metadata on the Hub, please manually select a library_name to use (such as `transformers`)")
    except (RuntimeError, OSError) as e:
        library = check_has_model(e)
        if library != "unknown":
            raise gr.Error(f"Tried to load `{model_name}` with `{library}` but a possible model to load was not found inside the repo.")
        # Without this fallback, `model` would be undefined below.
        raise gr.Error(f"Model `{model_name}` could not be loaded: {e}")

    # Kept for reference; the dtype table below is built from calc_vram_f32 instead.
    total_size, largest_layer = calculate_maximum_sizes(model)

    data = []
    title = f"Memory Usage for '{model_name}'"

    # Estimate everything once in fp32, then scale the dtype-dependent parts per precision.
    vram_f32 = calc_vram_f32(model, optimizer=optimizer, sequence_len=2048, micro_batch_size=1, device_count=1, gradient_checkpointing=True)

    for dtype in dtypes:
        param_element_size = bytes_by_dtype(vram_f32['param_element_size'], dtype)
        vram_model = bytes_by_dtype(vram_f32['model'], dtype)
        # Buffers, optimizer state and activations are kept at their fp32 estimates.
        vram_buffer = vram_f32['buffer']
        vram_optimizer = vram_f32['optimizer']
        vram_activation = vram_f32['activation']
        row = {
            "dtype": dtype,
            "inference_total": convert_bytes(vram_model),
            "training_total": convert_bytes(vram_model + vram_buffer + vram_optimizer + vram_activation),
            "model": convert_bytes(vram_model),
            "buffer": convert_bytes(vram_buffer),
            "optimizer": convert_bytes(vram_optimizer),
            "activation": convert_bytes(vram_activation),
        }
        data.append(row)

    global HAS_DISCUSSION, MODEL_NAME, LIBRARY
    HAS_DISCUSSION = check_for_discussion(model_name)
    MODEL_NAME = model_name
    LIBRARY = library

    if raw:
        return pd.DataFrame(data).to_markdown(index=False), data

    results = [
        f'## {title}',
        gr.update(visible=True, value=pd.DataFrame(data)),
    ]
    return results
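
# Example usage outside the Gradio UI (a sketch; assumes the model's metadata is reachable on
# the Hub and uses `raw=True` to get the markdown table plus the raw rows back):
#   table_md, rows = calculate_memory(
#       "bert-base-cased", "transformers", ["float32", "float16/bfloat16"],
#       optimizer="adamw_hf", access_token=None, raw=True,
#   )
#   rows[0]["training_total"]  # fp32 training estimate as a human-readable string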
|
|
|
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(
            """<img src="https://huggingface.co/spaces/hf-accelerate/model-memory-usage/resolve/main/measure_model_size.png" style="float: left;" width="250" height="250"><h1>🤗 Model Memory Calculator</h1>

This tool is modified from https://huggingface.co/spaces/hf-accelerate/model-memory-usage with the following changes:

- Focuses on transformers and gives a more detailed estimate based on more configuration options
- Will later auto-calculate the proper batch size for a given VRAM constraint
- LoRA/QLoRA etc. will be supported later

Note:

- inference_total = model
- training_total = model + buffer + optimizer + activation
"""
        )
        out_text = gr.Markdown()
        out = gr.DataFrame(
            headers=[
                "dtype",
                "inference_total",
                "training_total",
                "model",
                "buffer",
                "optimizer",
                "activation",
            ],
            interactive=False,
            visible=False,
        )
        with gr.Row():
            inp = gr.Textbox(label="Model Name or URL", value="bert-base-cased")
        with gr.Row():
            library = gr.Radio(["transformers"], label="Library", value="transformers")
            dtypes = gr.CheckboxGroup(
                ["float32", "float16/bfloat16", "int8", "int4"],
                value=["float32", "float16/bfloat16", "int8", "int4"],
                label="Model Precision",
            )
            optimizer = gr.Dropdown(
                choices=["adamw_hf", "adamw_torch", "sgd", "lion_32bit", "adamw_8bit", "lion_8bit", "paged_adamw_8bit", "paged_lion_8bit"],
                value="adamw_hf",
                label="Optimizer",
                allow_custom_value=True,
            )
            access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
        with gr.Row():
            btn = gr.Button("Calculate Memory Usage")

    # Keep a handle to the token textbox for report_results (this stores the component, not its value).
    USER_TOKEN = access_token

    btn.click(
        calculate_memory,
        inputs=[inp, library, dtypes, optimizer, access_token],
        outputs=[out_text, out],
    )

demo.launch()