utensil committed on
Commit 9fc12c1
Parent: a168076

Adapt my vram calc code to this UI and create a prototype

Files changed (1):
  app.py (+209, -47)
app.py CHANGED
@@ -31,18 +31,14 @@ def report_results():
 
     USER_TOKEN = None
     post = f"""# Model Memory Requirements\n
-
 You will need about {minimum[1]} VRAM to load this model for inference, and {minimum[3]} VRAM to train it using Adam.
 
 These calculations were measured from the [Model Memory Utility Space](https://hf.co/spaces/hf-accelerate/model-memory-utility) on the Hub.
 
 The minimum recommended vRAM needed for this model assumes using [Accelerate or `device_map="auto"`](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) and is denoted by the size of the "largest layer".
 When performing inference, expect to add up to an additional 20% to this, as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/). More tests will be performed in the future to get a more accurate benchmark for each model.
-
 When training with `Adam`, you can expect roughly 4x the reported results to be used. (1x for the model, 1x for the gradients, and 2x for the optimizer).
-
 ## Results:
-
 {results}
 """
     discussion = api.create_discussion(MODEL_NAME, "[AUTOMATED] Model Memory Requirements", description=post)
@@ -55,7 +51,123 @@ def convert_url_to_name(url:str):
         raise ValueError(f"URL {url} is not a valid model URL to the Hugging Face Hub")
     return results[0]
 
-def calculate_memory(model_name:str, library:str, options:list, access_token:str, raw=False):
+# Based on the following docs:
+#
+# - https://huggingface.co/docs/transformers/v4.31.0/perf_train_gpu_one#anatomy-of-models-memory
+# - https://blog.eleuther.ai/transformer-math/
+# - https://kipp.ly/transformer-inference-arithmetic/
+# - https://github.com/ray-project/llm-numbers
+#
+def calc_vram_f32(model, optimizer, sequence_len, micro_batch_size, device_count, gradient_checkpointing):
+    # is_16bit = cfg.bf16 or cfg.bfloat16 or cfg.load_in_8bit or cfg.fp16 or cfg.float16
+
+    # if torch.cuda.device_count() > 1 or cfg.fsdp or os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true" or cfg.adapter:
+    #     return { 'supported': False }
+
+    # Model Weights
+    #
+    # The HF doc counts:
+    #
+    # - 4 bytes * number of parameters for fp32 training
+    # - 6 bytes * number of parameters for mixed precision training (maintains a model in fp32 and one in fp16 in memory)
+    #
+    # But we follow https://blog.eleuther.ai/transformer-math/#model-weights and count 2 bytes here for mixed precision training,
+    # leaving the rest to the optimizer state.
+    #
+    # Here we calculate only for fp32 and adjust for each dtype outside.
+    #
+    # for param in model.parameters():
+    #     print(f'{type(param)} {param.shape} {param.element_size()}')
+    #
+    # print(f'total parameters = {sum([param.nelement() for param in model.parameters()])}')
+
+    param_element_size = 4
+    vram_model = sum([param.nelement() * param_element_size for param in model.parameters()])
+
+    # Buffers
+    #
+    # Buffers are tensors that do not require gradients and are not registered as parameters,
+    # e.g. the mean and std in batch norm layers.
+    # - https://github.com/huggingface/transformers/blob/d4bd33cc9f11ca48635e54983d75249c78d72e2a/src/transformers/modeling_utils.py#L1897
+    # - https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2
+    #
+    # for buf in model.buffers():
+    #     print(f'buf.element_size() = {buf.element_size()}')
+    vram_buffer = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])
+
+    # Optimizer States:
+    # - 8 bytes * number of parameters for normal AdamW (maintains 2 states)
+    # - 2 bytes * number of parameters for 8-bit AdamW optimizers like bitsandbytes
+    # - 4 bytes * number of parameters for optimizers like SGD with momentum (maintains only 1 state)
+    #
+    # For now we use AdamW/SGD as the baseline for the estimation, even for other more memory-efficient optimizers.
+    # ADAMW_HF = "adamw_hf"
+    # ADAMW_TORCH = "adamw_torch"
+    # ADAMW_TORCH_FUSED = "adamw_torch_fused"
+    # ADAMW_TORCH_XLA = "adamw_torch_xla"
+    # ADAMW_APEX_FUSED = "adamw_apex_fused"
+    # ADAFACTOR = "adafactor"
+    # ADAMW_ANYPRECISION = "adamw_anyprecision"
+    # SGD = "sgd"
+    # ADAGRAD = "adagrad"
+    # ADAMW_BNB = "adamw_bnb_8bit"
+    # ADAMW_8BIT = "adamw_8bit"  # just an alias for adamw_bnb_8bit
+    # LION_8BIT = "lion_8bit"
+    # LION = "lion_32bit"
+    # PAGED_ADAMW = "paged_adamw_32bit"
+    # PAGED_ADAMW_8BIT = "paged_adamw_8bit"
+    # PAGED_LION = "paged_lion_32bit"
+    # PAGED_LION_8BIT = "paged_lion_8bit"
+    # optimizer = cfg.optimizer
+    optimizer_state_size_per_param = 4 if 'sgd' in optimizer else (2 if '8bit' in optimizer else 8)
+    vram_optimizer = sum([param.nelement() * optimizer_state_size_per_param for param in model.parameters()])
+
+    # Gradients
+    #
+    # 4 bytes * number of parameters for either fp32 or mixed precision training (gradients are always kept in fp32),
+    # but we follow transformer-math and treat this conditionally outside;
+    # for now we ignore whether this is mixed precision training.
+    #
+    gradient_element_size = 4  # 2 if is_16bit else 4
+    vram_gradient = sum([param.nelement() * gradient_element_size for param in model.parameters()])
+
+    # Forward Activations
+    # Size depends on many factors, the key ones being sequence length, hidden size and batch size.
+    s = sequence_len  # cfg.sequence_len
+    b = micro_batch_size  # cfg.micro_batch_size
+    h = model.config.hidden_size
+    L = model.config.num_hidden_layers
+    t = device_count  # max(1, torch.cuda.device_count()) # len(DataParallel(model).device_ids) # torch.cuda.device_count()
+    a = model.config.num_attention_heads
+    print(f's={s} b={b} h={h} L={L} t={t} a={a}')
+
+    sbHL = s * b * h * L
+    print(f'sbHL = {sbHL / 1e9} GB')
+
+    print(f'10 + {24 / t} + {5 * a * s / (h * t)}')
+
+    vram_activation = sbHL * (10 + 24 / t) if gradient_checkpointing else sbHL * (10 + 24 / t + 5 * a * s / (h * t))
+
+    return {
+        # 'supported': True,
+        'param_element_size': param_element_size,
+        'total': vram_model + vram_buffer + vram_optimizer + vram_activation,
+        'model': vram_model,
+        'buffer': vram_buffer,
+        'optimizer': vram_optimizer,
+        'activation': vram_activation,
+    }
+def bytes_by_dtype(bytes, dtype):
+    if dtype in ("fp16", "bf16", "float16/bfloat16"):
+        return bytes / 2
+    elif dtype == "int8":
+        return bytes / 4
+    elif dtype == "int4":
+        return bytes / 8
+    else:
+        return bytes
+
+def calculate_memory(model_name:str, library:str, dtypes:list, optimizer:str, access_token:str, raw=False):
     "Calculates the memory usage for a model"
     if library == "auto":
         library = None
@@ -82,27 +194,54 @@ def calculate_memory(model_name:str, library:str, options:list, access_token:str
     data = []
 
     title = f"Memory Usage for '{model_name}'"
-    for dtype in options:
-        dtype_total_size = total_size
-        dtype_largest_layer = largest_layer[0]
-        if dtype in ("fp16", "bf16", "float16/bfloat16"):
-            dtype_total_size /= 2
-            dtype_largest_layer /= 2
-        elif dtype == "int8":
-            dtype_total_size /= 4
-            dtype_largest_layer /= 4
-        elif dtype == "int4":
-            dtype_total_size /= 8
-            dtype_largest_layer /= 8
-        dtype_training_size = convert_bytes(dtype_total_size * 4)
-        dtype_total_size = convert_bytes(dtype_total_size)
-        dtype_largest_layer = convert_bytes(dtype_largest_layer)
-        data.append({
+
+    vram_f32 = calc_vram_f32(model, optimizer=optimizer, sequence_len=2048, micro_batch_size=1, device_count=1, gradient_checkpointing=True)
+
+    for dtype in dtypes:
+        param_element_size = bytes_by_dtype(vram_f32['param_element_size'], dtype)
+        vram_model = bytes_by_dtype(vram_f32['model'], dtype)
+        vram_buffer = vram_f32['buffer']
+        vram_optimizer = vram_f32['optimizer']
+        vram_activation = vram_f32['activation']
+        row = {
             "dtype": dtype,
-            "Largest Layer or Residual Group": dtype_largest_layer,
-            "Total Size": dtype_total_size,
-            "Training using Adam": dtype_training_size
-        })
+            'inference_total': convert_bytes(vram_model + vram_activation),
+            'training_total': convert_bytes(vram_model + vram_buffer + vram_optimizer + vram_activation),
+            'model': convert_bytes(vram_model),
+            'buffer': convert_bytes(vram_buffer),
+            'optimizer': convert_bytes(vram_optimizer),
+            'activation': convert_bytes(vram_activation),
+        }
+
+        data.append(row)
+        # dtype_total_size = total_size
+        # dtype_largest_layer = largest_layer[0]
+        # if dtype in ("fp16", "bf16", "float16/bfloat16"):
+        #     dtype_total_size /= 2
+        #     dtype_largest_layer /= 2
+        # elif dtype == "int8":
+        #     dtype_total_size /= 4
+        #     dtype_largest_layer /= 4
+        # elif dtype == "int4":
+        #     dtype_total_size /= 8
+        #     dtype_largest_layer /= 8
+        # dtype_training_size = convert_bytes(dtype_total_size * 4)
+        # dtype_total_size = convert_bytes(dtype_total_size)
+        # dtype_largest_layer = convert_bytes(dtype_largest_layer)
+        # data.append({
+        #     "dtype": dtype,
+        #     "Largest Layer or Residual Group": dtype_largest_layer,
+        #     "Total Size": dtype_total_size,
+        #     "Training using Adam": dtype_training_size,
+        #     "Test": 12345
+        # })
+        # data.append({
+        #     "dtype": dtype,
+        #     "Largest Layer or Residual Group": dtype_largest_layer,
+        #     "Total Size": dtype_total_size,
+        #     "Training using Adam": dtype_training_size,
+        #     "Test": 12345
+        # })
     global HAS_DISCUSSION, MODEL_NAME, LIBRARY
     HAS_DISCUSSION = check_for_discussion(model_name)
     MODEL_NAME = model_name
@@ -114,7 +253,7 @@ def calculate_memory(model_name:str, library:str, options:list, access_token:str
     results = [
         f'## {title}',
         gr.update(visible=True, value=pd.DataFrame(data)),
-        gr.update(visible=not HAS_DISCUSSION)
+        # gr.update(visible=not HAS_DISCUSSION)
     ]
     return results
 
@@ -122,48 +261,71 @@ with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown(
             """<img src="https://huggingface.co/spaces/hf-accelerate/model-memory-usage/resolve/main/measure_model_size.png" style="float: left;" width="250" height="250"><h1>🤗 Model Memory Calculator</h1>
+    This tool is modified from https://huggingface.co/spaces/hf-accelerate/model-memory-usage with the following changes:
+
+    - Focuses on transformers and gives a more detailed estimation based on more configs
+    - Will auto-calculate the proper batch size given a VRAM constraint later
+    - LoRA/QLoRA etc. will be supported later
 
-    This tool will help you calculate how much vRAM is needed to train and perform big model inference
-    on a model hosted on the 🤗 Hugging Face Hub. The minimum recommended vRAM needed for a model
-    is denoted as the size of the "largest layer", and training of a model is roughly 4x its size (for Adam).
-
-    These calculations are accurate within a few percent at most, such as `bert-base-cased` being 413.68 MB and the calculator estimating 413.18 MB.
-
-    When performing inference, expect to add up to an additional 20% to this as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/).
-    More tests will be performed in the future to get a more accurate benchmark for each model.
+    Note:
 
-    Currently this tool supports all models hosted that use `transformers` and `timm`.
+    - inference_total = model + activation
+    - training_total = model + buffer + optimizer + activation
 
-    To use this tool pass in the URL or model name of the model you want to calculate the memory usage for,
-    select which framework it originates from ("auto" will try and detect it from the model metadata), and
-    what precisions you want to use."""
+    """
         )
     out_text = gr.Markdown()
-    out = gr.DataFrame(
-        headers=["dtype", "Largest Layer", "Total Size", "Training using Adam"],
+    out = gr.DataFrame(headers=[
+            "dtype",
+            'inference_total',
+            'training_total',
+            'model',
+            'buffer',
+            'optimizer',
+            'activation',
+        ],
         interactive=False,
         visible=False,
     )
     with gr.Row():
         inp = gr.Textbox(label="Model Name or URL", value="bert-base-cased")
     with gr.Row():
-        library = gr.Radio(["auto", "transformers", "timm"], label="Library", value="auto")
-        options = gr.CheckboxGroup(
+        library = gr.Radio(["transformers"], label="Library", value="transformers")
+        dtypes = gr.CheckboxGroup(
             ["float32", "float16/bfloat16", "int8", "int4"],
-            value="float32",
+            value=["float32", "float16/bfloat16", "int8", "int4"],
            label="Model Precision",
        )
+        # ADAMW_HF = "adamw_hf"
+        # ADAMW_TORCH = "adamw_torch"
+        # ADAMW_TORCH_FUSED = "adamw_torch_fused"
+        # ADAMW_TORCH_XLA = "adamw_torch_xla"
+        # ADAMW_APEX_FUSED = "adamw_apex_fused"
+        # ADAFACTOR = "adafactor"
+        # ADAMW_ANYPRECISION = "adamw_anyprecision"
+        # SGD = "sgd"
+        # ADAGRAD = "adagrad"
+        # ADAMW_BNB = "adamw_bnb_8bit"
+        # ADAMW_8BIT = "adamw_8bit"  # just an alias for adamw_bnb_8bit
+        # LION_8BIT = "lion_8bit"
+        # LION = "lion_32bit"
+        # PAGED_ADAMW = "paged_adamw_32bit"
+        # PAGED_ADAMW_8BIT = "paged_adamw_8bit"
+        # PAGED_LION = "paged_lion_32bit"
+        # PAGED_LION_8BIT = "paged_lion_8bit"
+        optimizer = gr.Dropdown(choices=["adamw_hf", "adamw_torch", "sgd", "lion_32bit", "adamw_8bit", "lion_8bit", "paged_adamw_8bit", "paged_lion_8bit"],
+                                value="adamw_hf", label="Optimizer", allow_custom_value=True)
         access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
     with gr.Row():
         btn = gr.Button("Calculate Memory Usage")
-        post_to_hub = gr.Button(value = "Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False)
+        # post_to_hub = gr.Button(value = "Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False)
     USER_TOKEN = access_token
 
     btn.click(
-        calculate_memory, inputs=[inp, library, options, access_token], outputs=[out_text, out, post_to_hub],
+        calculate_memory, inputs=[inp, library, dtypes, optimizer, access_token], outputs=[out_text, out],
    )
 
-    post_to_hub.click(report_results).then(lambda: gr.Button.update(visible=False), outputs=post_to_hub)
+    # post_to_hub.click(report_results).then(lambda: gr.Button.update(visible=False), outputs=post_to_hub)
 
 
-demo.launch()
+demo.launch(share=True, inline=False, debug=True)
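For reference, the "roughly 4x" figure mentioned in the posted report text comes straight from fp32 byte counts per parameter. A minimal sketch of that arithmetic, assuming a hypothetical 7B-parameter model (the parameter count is only an illustration):

# Rough fp32 byte counts behind the "roughly 4x for Adam" rule of thumb.
# Illustrative only; assumes a hypothetical 7e9-parameter model.
n_params = 7e9

bytes_model = 4 * n_params   # fp32 weights: 4 bytes per parameter
bytes_grads = 4 * n_params   # gradients kept in fp32: 4 bytes per parameter
bytes_adam = 8 * n_params    # Adam: two fp32 states (momentum + variance): 8 bytes per parameter

total = bytes_model + bytes_grads + bytes_adam
print(f"model: {bytes_model / 1e9:.0f} GB, training total: {total / 1e9:.0f} GB, ratio: {total / bytes_model:.1f}x")
# ratio is 4.0x, matching 1x model + 1x gradients + 2x optimizer states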
 
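A rough usage sketch of the new calc_vram_f32 helper, assuming the function as defined in the diff above is available in the current session and that transformers and torch are installed; bert-base-cased and the sequence/batch settings are example values only:

# Usage sketch for calc_vram_f32 (definition taken from the diff above).
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-cased")
est = calc_vram_f32(
    model,
    optimizer="adamw_hf",
    sequence_len=512,
    micro_batch_size=8,
    device_count=1,
    gradient_checkpointing=False,
)
for key in ("model", "buffer", "optimizer", "activation", "total"):
    print(f"{key}: {est[key] / 2**30:.2f} GiB")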
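In the per-dtype loop, only the weights are rescaled by bytes_by_dtype; buffer, optimizer and activation keep their fp32-based estimates. A minimal sketch of the scaling factors, assuming a hypothetical 110M-parameter model:

# Per-dtype scaling mirrored from bytes_by_dtype: fp32 is the baseline,
# fp16/bf16 halve it, int8 is a quarter, int4 an eighth.
fp32_weights = 4 * 110e6  # hypothetical 110M parameters at 4 bytes each

for dtype, divisor in [("float32", 1), ("float16/bfloat16", 2), ("int8", 4), ("int4", 8)]:
    print(f"{dtype}: {fp32_weights / divisor / 2**20:.0f} MiB")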
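The optimizer_state_size_per_param expression in calc_vram_f32 reduces to a small lookup keyed on the optimizer name; the sketch below spells it out for a few of the dropdown choices. Byte counts follow the comments in the diff, which treat anything that is neither SGD-like nor 8-bit as AdamW-like:

# Bytes of optimizer state per parameter, as assumed by calc_vram_f32:
# SGD-style keeps one fp32 state, 8-bit optimizers keep two 1-byte states,
# everything else is treated like AdamW with two fp32 states.
def optimizer_state_bytes(optimizer: str) -> int:
    if "sgd" in optimizer:
        return 4
    if "8bit" in optimizer:
        return 2
    return 8

for name in ["adamw_hf", "adamw_torch", "sgd", "lion_32bit", "adamw_8bit", "lion_8bit"]:
    print(name, optimizer_state_bytes(name))
# adamw_hf 8, adamw_torch 8, sgd 4, lion_32bit 8, adamw_8bit 2, lion_8bit 2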
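The activation estimate follows the transformer-math-style formula s*b*h*L*(10 + 24/t) when gradient checkpointing is on, with an extra 5*a*s/(h*t) attention term when it is off. A worked sketch using the defaults hard-coded in calculate_memory (s=2048, b=1, t=1) and bert-base-like config values assumed purely for illustration:

# Activation-memory estimate mirrored from calc_vram_f32; h/L/a are illustrative
# (bert-base-like), and s/b/t match the defaults passed by calculate_memory.
s, b, t = 2048, 1, 1   # sequence length, micro batch size, device count
h, L, a = 768, 12, 12  # hidden size, hidden layers, attention heads

sbhL = s * b * h * L
with_ckpt = sbhL * (10 + 24 / t)
without_ckpt = sbhL * (10 + 24 / t + 5 * a * s / (h * t))
print(f"with gradient checkpointing:    {with_ckpt / 2**30:.2f} GiB")
print(f"without gradient checkpointing: {without_ckpt / 2**30:.2f} GiB")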
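Finally, a minimal sketch of the Gradio wiring pattern used in app.py: a button click feeds the textbox/radio/checkbox/dropdown values into the calculation function and renders a DataFrame. The stub below stands in for the real calculate_memory and is not part of the commit:

# Minimal sketch of the UI wiring; calculate_memory_stub is a placeholder.
import gradio as gr
import pandas as pd

def calculate_memory_stub(model_name, library, dtypes, optimizer, access_token):
    rows = [{"dtype": d, "inference_total": "n/a", "training_total": "n/a"} for d in dtypes]
    return f"## Memory Usage for '{model_name}'", gr.update(visible=True, value=pd.DataFrame(rows))

with gr.Blocks() as sketch:
    inp = gr.Textbox(label="Model Name or URL", value="bert-base-cased")
    library = gr.Radio(["transformers"], label="Library", value="transformers")
    dtypes = gr.CheckboxGroup(["float32", "float16/bfloat16"], value=["float32"], label="Model Precision")
    optimizer = gr.Dropdown(choices=["adamw_hf", "sgd"], value="adamw_hf", label="Optimizer")
    access_token = gr.Textbox(label="API Token")
    out_text = gr.Markdown()
    out = gr.DataFrame(interactive=False, visible=False)
    btn = gr.Button("Calculate Memory Usage")
    btn.click(calculate_memory_stub, inputs=[inp, library, dtypes, optimizer, access_token], outputs=[out_text, out])

sketch.launch()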