""" |
|
Prints out the ratio of activation memory for the MLP layer when using ReLU vs GELU. |
|
""" |
|
|
|
import torch |
|
import torch.nn as nn |
|
|
|
import act_mem |
|
import layers |
|
|
|
if __name__ == "__main__": |
|
    batch_size, seq_len, d_model, dropout_prob = 1, 128, 1024, 0.1
    print(
        f"Batch size: {batch_size}, sequence length: {seq_len}, d_model: {d_model}, "
        f"dropout_prob: {dropout_prob}"
    )
    dtype = torch.bfloat16
    inputs = torch.randn(
        batch_size,
        seq_len,
        d_model,
        device="cuda",
        requires_grad=True,
        dtype=dtype,
    )

    act_fn_dict = {"ReLU": nn.ReLU(), "GELU": nn.GELU(), "SiLU": nn.SiLU()}

    # Keep the outputs alive so their saved activations are not freed between iterations.
    outputs = []
    mem_bytes = []
|
|
|
    for name, act_fn in act_fn_dict.items():
        if name == "SiLU":
            # SiLU serves as the gate activation in the SwiGLU MLP variant.
            mlp = layers.SwiGLUMLP(
                d_model=d_model,
                intermediate_size=4 * d_model,
                act_fn=act_fn,
                dropout_prob=dropout_prob,
                device="cuda",
                dtype=dtype,
            )
        else:
            mlp = layers.MLP(
                d_model=d_model,
                act_fn=act_fn,
                dropout_prob=dropout_prob,
                device="cuda",
                dtype=dtype,
            )
        # Measure CUDA allocations and the tensors saved for backward, excluding parameters.
        with act_mem.AllocatedMemContext() as mem, act_mem.SavedTensorContext(
            ignored_tensors=mlp.parameters()
        ) as saved:
            out = mlp(inputs)
            outputs.append(out)
        stm = saved.saved_tensor_mem
        # The newly allocated memory should exactly match the saved activation memory.
        assert mem.delta["current"] == stm
        print(f"{name} activation memory (GiB): {act_mem.B_to_GiB(stm)}")
        mem_bytes.append(stm)

    print(f"ReLU/GELU activation memory ratio: {mem_bytes[0] / mem_bytes[1]}")
|
|