import os
import re
import webbrowser
import pandas as pd
import gradio as gr
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
from accelerate.commands.estimate import create_empty_model, check_has_model
from accelerate.utils import convert_bytes, calculate_maximum_sizes

# We need to store these as globals because gradio doesn't give us a way to pass them into the button callback
HAS_DISCUSSION = True
MODEL_NAME = None
LIBRARY = None
USER_TOKEN = None
TOKEN = os.environ.get("HUGGINGFACE_API_LOGIN", None)

def check_for_discussion(model_name:str):
    "Checks if an automated discussion has been opened on the model by `model-sizer-bot`"
    global TOKEN
    api = HfApi(token=TOKEN)
    discussions = list(api.get_repo_discussions(model_name))
    return any(discussion.title == "[AUTOMATED] Model Memory Requirements" and discussion.author == "model-sizer-bot" for discussion in discussions)

def report_results():
    "Reports the results of a memory calculation to the model's discussion page, and opens a new tab to it afterwards"
    global MODEL_NAME, LIBRARY, TOKEN, USER_TOKEN
    api = HfApi(token=TOKEN)
    results, data = calculate_memory(MODEL_NAME, LIBRARY, ["fp32", "fp16", "int8", "int4"], optimizer="adamw_hf", access_token=USER_TOKEN, raw=True)
    minimum = data[0]

    USER_TOKEN = None
    post = f"""# Model Memory Requirements\n
You will need about {minimum['inference_total']} VRAM to load this model for inference, and {minimum['training_total']} VRAM to train it using Adam.
    
These calculations were measured from the [Model Memory Utility Space](https://hf.co/spaces/hf-accelerate/model-memory-utility) on the Hub.
    
The minimum recommended vRAM needed for this model assumes using [Accelerate or `device_map="auto"`](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) and is denoted by the size of the "largest layer". 
When performing inference, expect to add up to an additional 20% to this, as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/). More tests will be performed in the future to get a more accurate benchmark for each model.
When training with `Adam`, you can expect roughly 4x the reported results to be used. (1x for the model, 1x for the gradients, and 2x for the optimizer).
## Results:
{results}
"""
    discussion = api.create_discussion(MODEL_NAME, "[AUTOMATED] Model Memory Requirements", description=post)
    webbrowser.open_new_tab(discussion.url)

def convert_url_to_name(url:str):
    "Converts a model URL to its name on the Hub"
    # Match the repo id after the domain, stopping at a fragment, query string, or whitespace
    results = re.findall(r"huggingface\.co/([^\s#?]+)", url)
    if len(results) < 1:
        raise ValueError(f"URL {url} is not a valid model URL to the Hugging Face Hub")
    return results[0].strip("/")
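
# Illustrative only: with the pattern above, both of these resolve to "bert-base-cased"
#   convert_url_to_name("https://huggingface.co/bert-base-cased")
#   convert_url_to_name("https://huggingface.co/bert-base-cased#model-card")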

# Based on the following docs:
#
# - https://huggingface.co/docs/transformers/v4.31.0/perf_train_gpu_one#anatomy-of-models-memory
# - https://blog.eleuther.ai/transformer-math/
# - https://kipp.ly/transformer-inference-arithmetic/
# - https://github.com/ray-project/llm-numbers
#
def calc_vram_f32(model, optimizer, sequence_len, micro_batch_size, device_count, gradient_checkpointing):
    # is_16bit = cfg.bf16 or cfg.bfloat16 or cfg.load_in_8bit or cfg.fp16 or cfg.float16

    # if torch.cuda.device_count() > 1 or cfg.fsdp or os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true" or cfg.adapter:
    #   return { 'supported': False }

    # Model Weights
    #
    # Hf doc counts:
    #
    # - 4 bytes * number of parameters for fp32 training
    # - 6 bytes * number of parameters for mixed precision training (maintains a model in fp32 and one in fp16 in memory)
    #
    # But we follow https://blog.eleuther.ai/transformer-math/#model-weights and count 2 bytes per parameter here for
    # mixed precision training, leaving the rest to the optimizer state.
    #
    # Here we calculate only for fp32, will adjust for each dtype outside.
    #
    # for param in model.parameters():
    #   print(f'{type(param)} {param.shape} {param.element_size()}')
    #
    # print(f'total parameters = {sum([param.nelement() for param in model.parameters()])}')

    param_element_size = 4
    vram_model = sum([param.nelement() * param_element_size for param in model.parameters()])
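    # Illustrative only (hypothetical 7B-parameter model): 7e9 params * 4 bytes ≈ 28 GB in fp32;
    # the per-dtype adjustment in calculate_memory (via bytes_by_dtype) halves this for fp16/bf16, and so on.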

    # Buffers
    #
    # Buffers are tensors that do not require gradients and not registered as parameters.
    # e.g. mean and std in batch norm layers.
    # - https://github.com/huggingface/transformers/blob/d4bd33cc9f11ca48635e54983d75249c78d72e2a/src/transformers/modeling_utils.py#L1897
    # - https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2
    #
    # for buf in model.buffers():
    #   print(f'buf.element_size() = {buf.element_size()}')
    vram_buffer = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Optimizer States:
    # - 8 bytes * number of parameters for normal AdamW (maintains 2 states)
    # - 2 bytes * number of parameters for 8-bit AdamW optimizers like bitsandbytes
    # - 4 bytes * number of parameters for optimizers like SGD with momentum (maintains only 1 state)
    #
    # For now we use AdamW/SGD as the baseline for the estimation, even for other more memory-efficient optimizers
    # ADAMW_HF = "adamw_hf"
    # ADAMW_TORCH = "adamw_torch"
    # ADAMW_TORCH_FUSED = "adamw_torch_fused"
    # ADAMW_TORCH_XLA = "adamw_torch_xla"
    # ADAMW_APEX_FUSED = "adamw_apex_fused"
    # ADAFACTOR = "adafactor"
    # ADAMW_ANYPRECISION = "adamw_anyprecision"
    # SGD = "sgd"
    # ADAGRAD = "adagrad"
    # ADAMW_BNB = "adamw_bnb_8bit"
    # ADAMW_8BIT = "adamw_8bit"  # just an alias for adamw_bnb_8bit
    # LION_8BIT = "lion_8bit"
    # LION = "lion_32bit"
    # PAGED_ADAMW = "paged_adamw_32bit"
    # PAGED_ADAMW_8BIT = "paged_adamw_8bit"
    # PAGED_LION = "paged_lion_32bit"
    # PAGED_LION_8BIT = "paged_lion_8bit"
    # optimizer = cfg.optimizer
    optimizer_state_size_per_param = 4 if 'sgd' in optimizer else (2 if '8bit' in optimizer else 8)
    vram_optimizer = sum([param.nelement() * optimizer_state_size_per_param for param in model.parameters()])
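    # Illustrative only (hypothetical 7B-parameter model): plain AdamW keeps two fp32 states,
    # i.e. 8 bytes * 7e9 params ≈ 56 GB, while an 8-bit optimizer would need only ≈ 14 GB.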

    # Gradients
    #
    # 4 bytes * number of parameters for either fp32 or mixed precision training (gradients are always kept in fp32),
    # but we follow transformer-math and treat it conditionally outside this function;
    # for now we ignore whether this is mixed precision training.
    #
    gradient_element_size = 4 # 2 if is_16bit else 4
    vram_gradient = sum([param.nelement() * gradient_element_size for param in model.parameters()])

    # Forward Activations
    # size depends on many factors, the key ones being sequence length, hidden size and batch size.
    s = sequence_len # cfg.sequence_len
    b = micro_batch_size # cfg.micro_batch_size
    h = model.config.hidden_size
    L = model.config.num_hidden_layers
    t = device_count # max(1, torch.cuda.device_count()) # len(DataParallel(model).device_ids)  #torch.cuda.device_count()
    a = model.config.num_attention_heads
    print(f's={s} b={b} h={h} L={L} t={t} a={a}')

    sbHL = s * b * h * L
    print(f'sbHL = {sbHL / 1e9} GB')

    print(f'10 + {24 / t} + {5 * a * s / (h * t)}')

    vram_activation = sbHL * (10 + 24 / t) if gradient_checkpointing else sbHL * (10 + 24 / t + 5 * a * s / (h * t))
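
    # Rough illustration only (hypothetical 7B-class config: s=2048, b=1, h=4096, L=32, a=32, t=1):
    # sbHL ≈ 0.27e9, so activations ≈ 0.27e9 * 34 ≈ 9 GB with gradient checkpointing,
    # or ≈ 0.27e9 * 114 ≈ 31 GB without it.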

    return {
        # 'supported': True,
        'param_element_size': param_element_size,
        'total': vram_model + vram_buffer + vram_optimizer + vram_activation,
        'model': vram_model,
        'buffer': vram_buffer,
        'optimizer': vram_optimizer,
        'activation': vram_activation,
    }

def bytes_by_dtype(bytes, dtype):
    "Scales a byte count computed at fp32 down to the given dtype"
    if dtype in ("fp16",  "bf16", "float16/bfloat16"):
        return bytes / 2
    elif dtype == "int8":
        return bytes / 4
    elif dtype == "int4":
        return bytes / 8
    else:
        return bytes
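
# Example usage (values follow directly from the divisors above):
#   bytes_by_dtype(4, "float16/bfloat16") -> 2.0
#   bytes_by_dtype(4, "int8")             -> 1.0
#   bytes_by_dtype(4, "int4")             -> 0.5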

def calculate_memory(model_name:str, library:str, dtypes:list, optimizer:str, access_token:str, raw=False):
    "Calculates the memory usage for a model"
    if library == "auto":
        library = None
    if "http" in model_name and "//" in model_name:
        try:
            model_name = convert_url_to_name(model_name)
        except ValueError:
            raise gr.Error(f"URL `{model_name}` is not a valid model URL to the Hugging Face Hub")
    try:
        model = create_empty_model(model_name, library_name=library, trust_remote_code=True, access_token=access_token)
    except GatedRepoError:
        raise gr.Error(f"Model `{model_name}` is a gated model, please ensure to pass in your access token and try again if you have access. You can find your access token here : https://huggingface.co/settings/tokens. ")
    except RepositoryNotFoundError:
        raise gr.Error(f"Model `{model_name}` was not found on the Hub, please try another model name.")
    except ValueError as e:
        raise gr.Error(f"Model `{model_name}` does not have any library metadata on the Hub, please manually select a library_name to use (such as `transformers`)")
    except (RuntimeError, OSError) as e:
        library = check_has_model(e)
        if library != "unknown":
            raise gr.Error(f"Tried to load `{model_name}` with `{library}` but a possible model to load was not found inside the repo.")
        raise gr.Error(f"Model `{model_name}` could not be loaded: {e}")
    
    # total_size / largest_layer are currently only used by the commented-out per-dtype table below
    total_size, largest_layer = calculate_maximum_sizes(model)

    data = []

    title = f"Memory Usage for '{model_name}'"

    # sequence_len, micro_batch_size, and device_count are fixed defaults for now (not yet exposed in the UI)
    vram_f32 = calc_vram_f32(model, optimizer=optimizer, sequence_len=2048, micro_batch_size=1, device_count=1, gradient_checkpointing=True)

    for dtype in dtypes:
        param_element_size = bytes_by_dtype(vram_f32['param_element_size'], dtype)
        vram_model = bytes_by_dtype(vram_f32['model'], dtype)
        vram_buffer = vram_f32['buffer']
        vram_optimizer = vram_f32['optimizer']
        vram_activation = vram_f32['activation']
        row = {
            "dtype": dtype,
            'inference_total': convert_bytes(vram_model),
            'training_total': convert_bytes(vram_model + vram_buffer + vram_optimizer + vram_activation),
            'model': convert_bytes(vram_model),
            'buffer': convert_bytes(vram_buffer),
            'optimizer': convert_bytes(vram_optimizer),
            'activation': convert_bytes(vram_activation),
        }

        data.append(row)
        # dtype_total_size = total_size
        # dtype_largest_layer = largest_layer[0]
        # if dtype in ("fp16",  "bf16", "float16/bfloat16"):
        #     dtype_total_size /= 2
        #     dtype_largest_layer /= 2
        # elif dtype == "int8":
        #     dtype_total_size /= 4
        #     dtype_largest_layer /= 4
        # elif dtype == "int4":
        #     dtype_total_size /= 8
        #     dtype_largest_layer /= 8
        # dtype_training_size = convert_bytes(dtype_total_size * 4)
        # dtype_total_size = convert_bytes(dtype_total_size)
        # dtype_largest_layer = convert_bytes(dtype_largest_layer)
        # data.append({
        #     "dtype": dtype,
        #     "Largest Layer or Residual Group": dtype_largest_layer,
        #     "Total Size": dtype_total_size,
        #     "Training using Adam": dtype_training_size,
        #     "Test": 12345
        # })
    global HAS_DISCUSSION, MODEL_NAME, LIBRARY
    HAS_DISCUSSION = check_for_discussion(model_name)
    MODEL_NAME = model_name
    LIBRARY = library

    if raw:
        return pd.DataFrame(data).to_markdown(index=False), data
    
    results = [
        f'## {title}', 
        gr.update(visible=True, value=pd.DataFrame(data)), 
        # gr.update(visible=not HAS_DISCUSSION)
    ]
    return results
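
# Illustrative only: calling calculate_memory directly (outside the Gradio UI) with raw=True
# returns a markdown table plus the raw rows; this requires network access to the Hub.
#   md_table, rows = calculate_memory("bert-base-cased", "transformers",
#                                     ["float32", "float16/bfloat16"], optimizer="adamw_hf",
#                                     access_token=None, raw=True)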

with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(
            """<img src="https://huggingface.co/spaces/hf-accelerate/model-memory-usage/resolve/main/measure_model_size.png" style="float: left;" width="250" height="250"><h1>🤗 Model Memory Calculator</h1>
    This tool is modified from https://huggingface.co/spaces/hf-accelerate/model-memory-usage with the following changes:
    
    - Focuses on `transformers` models and gives a more detailed estimation based on more configs
    - Auto-calculating a proper batch size for a given VRAM budget will be added later
    - LoRA/QLoRA etc. will be supported later

    Note:

    - inference_total = model
    - training_total = model + buffer + optimizer + activation

    """
        )
        out_text = gr.Markdown()
        out = gr.DataFrame(headers=[
              "dtype",
              'inference_total',
              'training_total',
              'model',
              'buffer',
              'optimizer',
              'activation',
            ],
            interactive=False,
            visible=False,
        )
        with gr.Row():
            inp = gr.Textbox(label="Model Name or URL", value="bert-base-cased")
        with gr.Row():
            library = gr.Radio(["transformers"], label="Library", value="transformers")
            dtypes = gr.CheckboxGroup(
                ["float32", "float16/bfloat16", "int8", "int4"],
                value=["float32", "float16/bfloat16", "int8", "int4"],
                label="Model Precision",
            )
              # ADAMW_HF = "adamw_hf"
              # ADAMW_TORCH = "adamw_torch"
              # ADAMW_TORCH_FUSED = "adamw_torch_fused"
              # ADAMW_TORCH_XLA = "adamw_torch_xla"
              # ADAMW_APEX_FUSED = "adamw_apex_fused"
              # ADAFACTOR = "adafactor"
              # ADAMW_ANYPRECISION = "adamw_anyprecision"
              # SGD = "sgd"
              # ADAGRAD = "adagrad"
              # ADAMW_BNB = "adamw_bnb_8bit"
              # ADAMW_8BIT = "adamw_8bit"  # just an alias for adamw_bnb_8bit
              # LION_8BIT = "lion_8bit"
              # LION = "lion_32bit"
              # PAGED_ADAMW = "paged_adamw_32bit"
              # PAGED_ADAMW_8BIT = "paged_adamw_8bit"
              # PAGED_LION = "paged_lion_32bit"
              # PAGED_LION_8BIT = "paged_lion_8bit"
            optimizer = gr.Dropdown(choices=["adamw_hf", "adamw_torch", "sgd", "lion_32bit", "adamw_8bit", "lion_8bit", "paged_adamw_8bit", "paged_lion_8bit"],
              value="adamw_hf", label="Optimizer", allow_custom_value=True)
            access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
        with gr.Row():
            btn = gr.Button("Calculate Memory Usage")
            # post_to_hub = gr.Button(value = "Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False)
    # NOTE: this stores the gr.Textbox component itself (not its value); it is only read by the
    # currently-disabled report_results flow above.
    USER_TOKEN = access_token

    btn.click(
        calculate_memory, inputs=[inp, library, dtypes, optimizer, access_token], outputs=[out_text, out],
    )
    
    # post_to_hub.click(report_results).then(lambda: gr.Button.update(visible=False), outputs=post_to_hub)


demo.launch() # (share=True, inline=False, debug=True)