updateModel (#2)
- update app.py with new model (150e6c8ddabb5058f4c8e6f9bed70521f097c148)
Co-authored-by: Jihyun Ryu <[email protected]>
- .gitignore +2 -0
- app.py +11 -1
- model.py +27 -15
.gitignore
ADDED
@@ -0,0 +1,2 @@
+__pycache__
+./out
app.py
CHANGED
@@ -34,11 +34,21 @@ model_info = {
         'description': "Trained on Facebook Emotion Dialogues dataset, excluding emotion annotations for simpler conversations, using a default batch size of 64.",
         'logo': '🍷'
     },
+    "single_conversation_rope": {
+        'url': 'https://huggingface.co/HannahLin271/NanoGPT/resolve/main/singleConversation_RoPE.pt',
+        'description': "Trained on Facebook Emotion Dialogues dataset, excluding emotion annotations for simpler conversations, using a default batch size of 64, using RoPE positional embedding.",
+        'logo': '🥤'
+    },
+    "single_conversation_relative": {
+        'url': 'https://huggingface.co/HannahLin271/NanoGPT/resolve/main/singleConversation_Relative.pt',
+        'description': "Trained on entire conversations from the Facebook Emotion Dialogues dataset, excluding tags other than <bot> and <human>, using a default batch size of 64, using relative positional embedding.",
+        'logo': '🧋'
+    },
     "whole_conversation": {
         'url': 'https://huggingface.co/HannahLin271/NanoGPT/resolve/main/wholeConversation.pt',
         'description': "Trained on entire conversations from the Facebook Emotion Dialogues dataset, excluding tags other than <bot> and <human>, using a default batch size of 64",
         'logo': '🍵'
-    }
+    },
 }
 model_list = { }
 model_choices = list(model_info.keys())
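Note: the two new entries use the same schema as the existing ones ('url', 'description', 'logo'), so code that iterates over model_info picks them up unchanged. A rough sketch of fetching one of the new checkpoints outside the app (illustrative only, not app.py's actual loading path; it assumes the huggingface_hub package is installed):

    import torch
    from huggingface_hub import hf_hub_download

    # Illustrative sketch: download a newly listed checkpoint and deserialize it on CPU.
    # Repo and filename are taken from the URL added above.
    ckpt_path = hf_hub_download(repo_id="HannahLin271/NanoGPT",
                                filename="singleConversation_RoPE.pt")
    # On torch >= 2.6 you may need weights_only=False if the checkpoint stores
    # non-tensor objects alongside the state dict.
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    print(type(checkpoint))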
model.py
CHANGED
@@ -51,13 +51,11 @@ class CausalSelfAttention(nn.Module):
 
     def forward(self, x):
         B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
-
         # calculate query, key, values for all heads in batch and move head forward to be the batch dim
         q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-
         # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
         if self.flash:
             # efficient attention using Flash Attention CUDA kernels
@@ -119,8 +117,8 @@ class GPTConfig:
 Improving positional Embeddings
 1. RoPE
 2. Relative
-3. Dynamic
 """
+
 ################################### 1. RoPE ###################################
 class RotaryPositionalEmbedding(nn.Module):
     def __init__(self, config):
@@ -147,6 +145,22 @@ class RotaryPositionalEmbedding(nn.Module):
         rope1, rope2 = rope[..., :dim // 2], rope[..., dim // 2:]
         return torch.cat([x1 * rope1 - x2 * rope2, x1 * rope2 + x2 * rope1], dim=-1)
 
+################################### 2. Relative ###################################
+
+class RelativePositionalEmbedding(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.n_embd = config.n_embd
+        self.max_relative_positions = config.block_size
+        self.relative_embeddings = nn.Embedding(2 * self.max_relative_positions - 1, config.n_embd)
+
+    def forward(self, seq_len, device):
+        range_vec = torch.arange(seq_len, device=device)
+        relative_positions = range_vec.unsqueeze(0) - range_vec.unsqueeze(1)
+        relative_positions += self.max_relative_positions - 1
+        relative_embeddings = self.relative_embeddings(relative_positions.to(torch.long))
+        return relative_embeddings # (seq_len, seq_len, n_embd)
+
 
 # MARK: - GPT Model
 class GPT(nn.Module):
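Note: the new RelativePositionalEmbedding builds a (seq_len, seq_len) matrix of pairwise offsets j - i, shifts it by block_size - 1 so every index is non-negative, and looks each offset up in a table of 2 * block_size - 1 learned vectors. A minimal sketch for exercising the class in isolation (it assumes model.py is importable and GPTConfig keeps nanoGPT-style dataclass fields block_size and n_embd):

    import torch
    from model import GPTConfig, RelativePositionalEmbedding

    cfg = GPTConfig(block_size=8, n_embd=4)   # tiny values, just for a shape check
    rel = RelativePositionalEmbedding(cfg)
    table = rel(seq_len=4, device="cpu")      # offsets j - i mapped into [0, 2*block_size - 2]
    print(table.shape)                        # torch.Size([4, 4, 4]), i.e. (seq_len, seq_len, n_embd)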
@@ -161,6 +175,7 @@ class GPT(nn.Module):
             wte = nn.Embedding(config.vocab_size, config.n_embd), # token embedding
             wpe = nn.Embedding(config.block_size, config.n_embd), # positional embedding
             rope = RotaryPositionalEmbedding(config), # improving PE: 1. RoPE
+            relative = RelativePositionalEmbedding(config), # improving PE: 2. Relative
             drop = nn.Dropout(config.dropout), # dropout layer
             h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), # the transformer
             ln_f = LayerNorm(config.n_embd, bias=config.bias), # layer norm at the output of the model
@@ -216,26 +231,23 @@ class GPT(nn.Module):
             # 0. Default NanoGPT
             pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
             x = self.transformer.drop(tok_emb + pos_emb)
+
         elif self.config.pos_embd == 'rope':
             # 1. RoPE
             rope = self.transformer.rope.forward(t, device=device) # (t, n_embd)
             pos_emb = rope.unsqueeze(0).expand(b, -1, -1) # (b, t, n_embd)
             x = RotaryPositionalEmbedding.apply_rotary_embedding(tok_emb, pos_emb)
+
+        elif self.config.pos_embd == 'relative':
+            # 2. Relative
+            relative = self.transformer.relative(t, device=device)
+            tok_emb_expanded = tok_emb.unsqueeze(2)
+            relative = relative.unsqueeze(0)
+            relative_emb = relative + tok_emb_expanded # (batch_size, seq_len, seq_len, n_embd)
+            x = self.transformer.drop(relative_emb.sum(dim=2))
         else:
             raise ValueError(f"Unknown positional embedding type: {self.config.pos_embedding_type}")
 
-        ################################### 0. Default of Nano GPT ###################################
-        # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
-        ##TODO: the embedding here is simple, just sum them up, could improve with more sophisticated tokenization
-        # x = self.transformer.drop(tok_emb + pos_emb)
-
-        # ################################### 1. RoPE ###################################
-        # rope = self.transformer.rope.forward(t, device=device) # (t, n_embd)
-        # pos_emb = rope.unsqueeze(0).expand(b, -1, -1) # (b, t, n_embd)
-        # x = RotaryPositionalEmbedding.apply_rotary_embedding(tok_emb, pos_emb)
-
-        #########################################################################################################
-
         for block in self.transformer.h:
             x = block(x)
         x = self.transformer.ln_f(x)
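Note: the forward pass now dispatches on config.pos_embd ('rope', 'relative', or the default learned wpe path), so the two checkpoints added in app.py are expected to set that flag accordingly. A hedged smoke test for the new branch (a sketch only; it assumes the fork keeps nanoGPT's GPT(config) constructor and forward(idx, targets=None) signature, and that GPTConfig accepts a pos_embd field):

    import torch
    from model import GPT, GPTConfig

    # Tiny, untrained model just to confirm the 'relative' branch runs end to end.
    cfg = GPTConfig(block_size=32, vocab_size=128, n_layer=2, n_head=2,
                    n_embd=64, dropout=0.0, bias=False, pos_embd='relative')
    model = GPT(cfg)
    idx = torch.randint(0, cfg.vocab_size, (1, 16))   # (batch, seq_len)
    logits, loss = model(idx)                         # loss is None when no targets are passed
    print(logits.shape)                               # (1, 1, vocab_size) in stock nanoGPT (last position only)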