HannahLin271 and ryuruz committed
Commit 8add2ec · verified · Parent: 5c6ee75

updateModel (#2)


- update app.py with new model (150e6c8ddabb5058f4c8e6f9bed70521f097c148)


Co-authored-by: Jihyun Ryu <[email protected]>

Files changed (3)
  1. .gitignore +2 -0
  2. app.py +11 -1
  3. model.py +27 -15
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__
+ ./out
app.py CHANGED
@@ -34,11 +34,21 @@ model_info = {
  'description': "Trained on Facebook Emotion Dialogues dataset, excluding emotion annotations for simpler conversations, using a default batch size of 64.",
  'logo': '🍷'
  },
+ "single_conversation_rope": {
+ 'url': 'https://huggingface.co/HannahLin271/NanoGPT/resolve/main/singleConversation_RoPE.pt',
+ 'description': "Trained on Facebook Emotion Dialogues dataset, excluding emotion annotations for simpler conversations, using a default batch size of 64, using RoPE positional embedding.",
+ 'logo': '🥤'
+ },
+ "single_conversation_relative": {
+ 'url': 'https://huggingface.co/HannahLin271/NanoGPT/resolve/main/singleConversation_Relative.pt',
+ 'description': "Trained on entire conversations from the Facebook Emotion Dialogues dataset, excluding tags other than <bot> and <human>, using a default batch size of 64, using relative positional embedding.",
+ 'logo': '🧋'
+ },
  "whole_conversation": {
  'url': 'https://huggingface.co/HannahLin271/NanoGPT/resolve/main/wholeConversation.pt',
  'description': "Trained on entire conversations from the Facebook Emotion Dialogues dataset, excluding tags other than <bot> and <human>, using a default batch size of 64",
  'logo': '🍵'
- }
+ },
  }
  model_list = { }
  model_choices = list(model_info.keys())
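These entries are metadata only; app.py still has to fetch and load whichever checkpoint the user picks. A minimal sketch of that step, assuming a nanoGPT-style .pt layout with the weights under a 'model' key (the helper name and that key are assumptions, not part of this diff):

import torch

def load_checkpoint(name, model_info):
    # Hypothetical helper: download the .pt referenced by model_info[name]
    # and return its weights. torch.hub caches the file, then torch.load()s it.
    ckpt = torch.hub.load_state_dict_from_url(model_info[name]['url'], map_location='cpu')
    # nanoGPT checkpoints typically nest the weights under 'model'; fall back
    # to the raw object if this one is already a plain state dict.
    return ckpt['model'] if isinstance(ckpt, dict) and 'model' in ckpt else ckpt

# state_dict = load_checkpoint('single_conversation_rope', model_info)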
model.py CHANGED
@@ -51,13 +51,11 @@ class CausalSelfAttention(nn.Module):
 
  def forward(self, x):
  B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
-
  # calculate query, key, values for all heads in batch and move head forward to be the batch dim
  q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
  k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
  q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
  v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-
  # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
  if self.flash:
  # efficient attention using Flash Attention CUDA kernels
@@ -119,8 +117,8 @@ class GPTConfig:
  Improving positional Embeddings
  1. RoPE
  2. Relative
- 3. Dynamic
  """
+
  ################################### 1. RoPE ###################################
  class RotaryPositionalEmbedding(nn.Module):
  def __init__(self, config):
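For orientation, the rotation this class applies (its key lines appear as context at the top of the next hunk) splits the last dimension in half and rotates each (first-half, second-half) pair by a position-dependent angle, with rope1 and rope2 playing the role of cos and sin. A minimal, self-contained sketch of that operation with toy shapes (the class's actual angle schedule is not shown in this diff):

import torch

def rotate_half_split(x, cos, sin):
    # x: (..., d); cos and sin broadcastable to (..., d // 2)
    d = x.size(-1)
    x1, x2 = x[..., :d // 2], x[..., d // 2:]
    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)

x = torch.randn(2, 4, 8)        # (batch, seq, n_embd), toy sizes
theta = torch.rand(4, 4)        # one angle per (position, channel pair)
out = rotate_half_split(x, torch.cos(theta), torch.sin(theta))

# Equivalent view: each pair (x1, x2) is a complex number multiplied by e^{i*theta}.
z = torch.complex(x[..., :4], x[..., 4:]) * torch.polar(torch.ones_like(theta), theta)
assert torch.allclose(out, torch.cat([z.real, z.imag], dim=-1), atol=1e-5)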
@@ -147,6 +145,22 @@ class RotaryPositionalEmbedding(nn.Module):
  rope1, rope2 = rope[..., :dim // 2], rope[..., dim // 2:]
  return torch.cat([x1 * rope1 - x2 * rope2, x1 * rope2 + x2 * rope1], dim=-1)
 
+ ################################### 2. Relative ###################################
+
+ class RelativePositionalEmbedding(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.n_embd = config.n_embd
+ self.max_relative_positions = config.block_size
+ self.relative_embeddings = nn.Embedding(2 * self.max_relative_positions - 1, config.n_embd)
+
+ def forward(self, seq_len, device):
+ range_vec = torch.arange(seq_len, device=device)
+ relative_positions = range_vec.unsqueeze(0) - range_vec.unsqueeze(1)
+ relative_positions += self.max_relative_positions - 1
+ relative_embeddings = self.relative_embeddings(relative_positions.to(torch.long))
+ return relative_embeddings # (seq_len, seq_len, n_embd)
+
 
  # MARK: - GPT Model
  class GPT(nn.Module):
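The class added above reduces each (query position i, key position j) pair to the offset j - i, shifts it to be non-negative, and looks it up in one shared table, so every pair at the same distance reuses the same learned vector. A small illustration of the index matrix it builds, with a toy block_size of 8 and sequence length 4 (sizes chosen only for this example):

import torch

block_size, seq_len = 8, 4                     # toy sizes for illustration
range_vec = torch.arange(seq_len)
relative_positions = range_vec.unsqueeze(0) - range_vec.unsqueeze(1) + (block_size - 1)
print(relative_positions)
# tensor([[ 7,  8,  9, 10],
#         [ 6,  7,  8,  9],
#         [ 5,  6,  7,  8],
#         [ 4,  5,  6,  7]])
# Indexing nn.Embedding(2 * block_size - 1, n_embd) with this matrix gives the
# (seq_len, seq_len, n_embd) tensor that forward() returns.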
@@ -161,6 +175,7 @@ class GPT(nn.Module):
  wte = nn.Embedding(config.vocab_size, config.n_embd), # token embedding
  wpe = nn.Embedding(config.block_size, config.n_embd), # positional embedding
  rope = RotaryPositionalEmbedding(config), # improving PE: 1. RoPE
+ relative = RelativePositionalEmbedding(config), # improving PE: 2. Relative
  drop = nn.Dropout(config.dropout), # dropout layer
  h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), # the transformer
  ln_f = LayerNorm(config.n_embd, bias=config.bias), # layer norm at the output of the model
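For a rough sense of what the newly registered sub-module costs, its only parameters are the rows of the embedding table created in its __init__, i.e. (2 * block_size - 1) * n_embd weights. A back-of-the-envelope check, with GPT-2-small-like sizes assumed purely for illustration (the actual config is not shown in this diff):

block_size, n_embd = 1024, 768      # assumed sizes, not from the diff
extra_params = (2 * block_size - 1) * n_embd
print(f"{extra_params:,}")          # 1,572,096 -> about 1.6M extra weights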
@@ -216,26 +231,23 @@ class GPT(nn.Module):
  # 0. Default NanoGPT
  pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
  x = self.transformer.drop(tok_emb + pos_emb)
+
  elif self.config.pos_embd == 'rope':
  # 1. RoPE
  rope = self.transformer.rope.forward(t, device=device) # (t, n_embd)
  pos_emb = rope.unsqueeze(0).expand(b, -1, -1) # (b, t, n_embd)
  x = RotaryPositionalEmbedding.apply_rotary_embedding(tok_emb, pos_emb)
+
+ elif self.config.pos_embd == 'relative':
+ # 2. Relative
+ relative = self.transformer.relative(t, device=device)
+ tok_emb_expanded = tok_emb.unsqueeze(2)
+ relative = relative.unsqueeze(0)
+ relative_emb = relative + tok_emb_expanded # (batch_size, seq_len, seq_len, n_embd)
+ x = self.transformer.drop(relative_emb.sum(dim=2))
  else:
  raise ValueError(f"Unknown positional embedding type: {self.config.pos_embedding_type}")
 
- ################################### 0. Default of Nano GPT ###################################
- # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
- ##TODO: the embedding here is simple, just sum them up, could improve with more sophisticated tokenization
- # x = self.transformer.drop(tok_emb + pos_emb)
-
- # ################################### 1. RoPE ###################################
- # rope = self.transformer.rope.forward(t, device=device) # (t, n_embd)
- # pos_emb = rope.unsqueeze(0).expand(b, -1, -1) # (b, t, n_embd)
- # x = RotaryPositionalEmbedding.apply_rotary_embedding(tok_emb, pos_emb)
-
- #########################################################################################################
-
  for block in self.transformer.h:
  x = block(x)
  x = self.transformer.ln_f(x)
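To make the new 'relative' branch concrete, here is a shape walkthrough with toy sizes (the dimensions are assumptions for illustration; only the broadcasting pattern is taken from the diff):

import torch

b, t, n_embd = 2, 5, 16
tok_emb = torch.randn(b, t, n_embd)          # token embeddings
relative = torch.randn(t, t, n_embd)         # stand-in for RelativePositionalEmbedding.forward output

relative_emb = relative.unsqueeze(0) + tok_emb.unsqueeze(2)   # (b, t, t, n_embd)
x = relative_emb.sum(dim=2)                                   # (b, t, n_embd)
assert x.shape == (b, t, n_embd)

# Summing over dim=2 collapses the per-pair tensor back to one vector per token:
# each token gets t copies of its own embedding plus the sum of its row of
# relative embeddings, so the intermediate tensor costs O(b * t^2 * n_embd) memory.
assert torch.allclose(x, t * tok_emb + relative.sum(dim=1), atol=1e-5)

A side note for reviewers: the branch conditions read self.config.pos_embd, while the error message in the else branch still interpolates self.config.pos_embedding_type, so an unknown value would likely surface as an AttributeError rather than the intended ValueError.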
 