Hennara committed
Commit 6bf4672
1 Parent(s): 3301526

add utils file

Files changed (2)
  1. app.py +5 -0
  2. utils.py +138 -0
app.py CHANGED
@@ -1,4 +1,9 @@
 import streamlit as st
+from utils import memory_moe_mlp, memory_mlp_layer, memory_for_attention_layer
+
+st.title("Model Memory Usage Calculator")
 
 x = st.slider('Select a value')
+hidden_size = st.slider("The Hidden size (d_model | d)", min_value=128, step=128)
+
 st.write(x, 'squared is', x * x)
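
The commit wires up only the hidden_size slider so far; the imported helpers are not called yet. A minimal sketch of how app.py could grow to use them (the extra widgets, labels, ranges, and defaults below are illustrative assumptions, not part of this commit):

    import streamlit as st
    from utils import memory_for_attention_layer

    st.title("Model Memory Usage Calculator")

    # Hypothetical inputs; all ranges and defaults are placeholders.
    hidden_size = st.slider("The Hidden size (d_model | d)", min_value=128, max_value=16384, value=4096, step=128)
    num_heads = st.slider("Number of attention heads", min_value=1, max_value=128, value=32)
    seq_len = st.slider("Sequence length", min_value=128, max_value=8192, value=2048, step=128)
    batch_size = st.slider("Batch size", min_value=1, max_value=256, value=8)
    precision = st.selectbox("Bytes per parameter (2 = fp16, 4 = fp32)", options=[2, 4])

    attn_bytes = memory_for_attention_layer(precision, seq_len, batch_size, hidden_size, num_heads)
    st.write(f"Attention layer: {attn_bytes / 2**30:.2f} GiB")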
utils.py ADDED
@@ -0,0 +1,138 @@
+
+def memory_for_attention_layer(precision: int,
+                               seq_len: int,
+                               batch_size: int,
+                               hidden_size: int,
+                               num_heads: int):
+    """
+    head_dim = hidden_size // num_heads
+
+    Model Parameters:
+        q_proj: (hidden_size, num_heads * head_dim)
+        k_proj: (hidden_size, num_key_value_heads * head_dim)
+        v_proj: (hidden_size, num_key_value_heads * head_dim)
+        o_proj: (hidden_size, hidden_size)
+
+    Assuming num_key_value_heads == num_heads (plain multi-head attention):
+    Total parameters = 3 * hidden_size * num_heads * head_dim + hidden_size^2
+
+    Gradients:
+        Gradients have the same size as the model parameters.
+        Memory required for gradients = 3 * hidden_size * num_heads * head_dim + hidden_size^2
+
+    Optimizer States:
+        Assuming the Adam optimizer, with two states per parameter (momentum and variance).
+        Memory required for optimizer states = 2 * (3 * hidden_size * num_heads * head_dim + hidden_size^2)
+
+    Activations:
+        query_states: (batch_size, num_heads, q_len, head_dim)
+        key_states: (batch_size, num_key_value_heads, q_len, head_dim)
+        value_states: (batch_size, num_key_value_heads, q_len, head_dim)
+        attn_weights: (batch_size, num_heads, q_len, q_len)
+        attn_output: (batch_size, q_len, hidden_size)
+        Total activations = batch_size * (num_heads * q_len * head_dim
+                                          + 2 * num_key_value_heads * q_len * head_dim
+                                          + num_heads * q_len^2 + q_len * hidden_size)
+
+    Temporary Memory:
+        Additional temporary memory for intermediate computations and buffers,
+        assumed to be 20% of the total.
+
+    total_memory = (model_parameters + gradients + optimizer_states + activations)
+                   * (1 + temporary_memory_factor)
+                 = (4 * (3 * hidden_size * num_heads * head_dim + hidden_size^2)
+                    + batch_size * (3 * num_heads * q_len * head_dim
+                                    + num_heads * q_len^2 + q_len * hidden_size)) * (1 + 0.2)
+
+    All of the above are element counts; multiplying by `precision` (bytes per
+    element, e.g. 2 for fp16) converts the result to bytes.
+    """
+    head_dim = hidden_size // num_heads
+    # Model parameters: q_proj, k_proj, v_proj and o_proj
+    # (3 * hidden_size * num_heads * head_dim + hidden_size^2 elements).
+    model_memory = 3 * hidden_size * num_heads * head_dim + hidden_size ** 2
+
+    # Gradients have the same size as the parameters.
+    gradients = model_memory
+
+    # Adam keeps two states (momentum and variance) per parameter.
+    optimizer = 2 * model_memory
+
+    # Activations: q/k/v states, attention weights, and attention output.
+    activation = batch_size * (3 * num_heads * seq_len * head_dim +
+                               num_heads * seq_len ** 2 +
+                               seq_len * hidden_size)
+
+    # Convert element counts to bytes and add the 20% temporary-memory
+    # overhead described in the docstring.
+    total_memory = (model_memory + gradients + optimizer + activation) * precision * 1.2
+
+    return total_memory
+
+
+def memory_mlp_layer(precision: int,
+                     seq_len: int,
+                     batch_size: int,
+                     hidden_size: int,
+                     intermediate_size: int):
+    """
+    MLP layer:
+        gate_proj: (hidden_size, intermediate_size)
+        up_proj:   (hidden_size, intermediate_size)
+        down_proj: (intermediate_size, hidden_size)
+
+    Memory required for each projection's weights = hidden_size * intermediate_size
+
+    model_memory = 3 * (hidden_size * intermediate_size)
+    gradient = model_memory
+    optimizer = 2 * model_memory
+    activations = batch_size * seq_len * hidden_size + 2 * batch_size * seq_len * intermediate_size
+
+    total_memory = 3 * (hidden_size * intermediate_size) + 3 * (hidden_size * intermediate_size)
+                   + 6 * (hidden_size * intermediate_size)
+                   + batch_size * seq_len * (2 * intermediate_size + hidden_size)
+                 = 12 * (hidden_size * intermediate_size)
+                   + batch_size * seq_len * (2 * intermediate_size + hidden_size)
+
+    Args:
+        precision: bytes per element (e.g. 2 for fp16, 4 for fp32).
+        seq_len: sequence length.
+        batch_size: batch size.
+        hidden_size: model hidden dimension (d_model).
+        intermediate_size: MLP intermediate dimension.
+
+    Returns:
+        Total memory in bytes.
+    """
+    # gate_proj, up_proj and down_proj weights.
+    model_memory = 3 * (hidden_size * intermediate_size)
+    gradient = model_memory
+    optimizer = 2 * model_memory
+    # Activations: input hidden states plus the gate_proj and up_proj outputs.
+    activation = batch_size * seq_len * (2 * intermediate_size + hidden_size)
+    # Convert element counts to bytes.
+    total_memory = (model_memory + gradient + optimizer + activation) * precision
+    return total_memory
+
+
+def memory_moe_mlp(precision: int,
+                   seq_len: int,
+                   batch_size: int,
+                   hidden_size: int,
+                   intermediate_size: int,
+                   num_expert: int,
+                   top_k: int):
+    """
+    Mixture-of-experts MLP: a router (gate) of shape (hidden_size, num_expert)
+    plus num_expert standard MLP experts, of which top_k are active per token.
+    Returns the total memory in bytes.
+    """
+    # Router (gate) parameters: one weight per (hidden unit, expert).
+    gate_memory = hidden_size * num_expert
+    # Each expert is a full MLP: gate_proj, up_proj and down_proj weights.
+    expert_memory = 3 * hidden_size * intermediate_size * num_expert
+    # Total model memory, in bytes.
+    model_memory = (gate_memory + expert_memory) * precision
+
+    # Gradients and optimizer states scale with the parameters, as before.
+    gradient = model_memory
+    optimizer = 2 * model_memory
+
+    # Activations (worst case):
+    max_memory_activation = (
+        (batch_size * seq_len * num_expert * precision) +   # Router logits
+        (batch_size * seq_len * top_k * precision) +        # Routing weights
+        (batch_size * seq_len * top_k * precision) +        # Selected experts
+        (batch_size * seq_len * hidden_size * precision) +  # Final hidden states
+        (batch_size * seq_len * hidden_size * precision) +  # Current state (worst-case)
+        (batch_size * seq_len * hidden_size * precision)    # Current hidden states (worst-case)
+    )
+    total_memory = model_memory + gradient + optimizer + max_memory_activation
+
+    return total_memory
+
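
To sanity-check the three helpers end to end, here is a small usage sketch; the configuration loosely resembles a Mixtral-style layer, but every value is an illustrative assumption:

    from utils import memory_for_attention_layer, memory_mlp_layer, memory_moe_mlp

    GIB = 2 ** 30
    precision = 2                  # fp16: 2 bytes per element
    batch_size, seq_len = 8, 2048
    hidden_size, num_heads = 4096, 32
    intermediate_size = 14336
    num_expert, top_k = 8, 2

    attn = memory_for_attention_layer(precision, seq_len, batch_size, hidden_size, num_heads)
    mlp = memory_mlp_layer(precision, seq_len, batch_size, hidden_size, intermediate_size)
    moe = memory_moe_mlp(precision, seq_len, batch_size, hidden_size,
                         intermediate_size, num_expert, top_k)

    print(f"attention layer: {attn / GIB:.2f} GiB")
    print(f"dense MLP layer: {mlp / GIB:.2f} GiB")
    print(f"MoE MLP layer:   {moe / GIB:.2f} GiB")

All three totals estimate training-time memory (parameters, gradients, Adam states, and activations) rather than inference memory.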