Commit bd1c718
1 parent: 34d341b

small fixes and tokenizer config

Files changed:
- configuration_yalm.py  +1 -1
- modeling_yalm.py → modelling_yalm.py  +7 -7
- tokenizer_config.json  +9 -0
configuration_yalm.py  CHANGED
@@ -106,7 +106,7 @@ class YalmConfig(PretrainedConfig):
         self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
         self.activation_type = activation_type
         self.max_position_embeddings = max_position_embeddings
-        self.apply_residual_connection_post_layernorm =
+        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
         self.initializer_range = initializer_range
         self.layernorm_epsilon = layernorm_epsilon
         self.attention_dropout = attention_dropout
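
For context, the corrected assignment only works if apply_residual_connection_post_layernorm is also accepted by YalmConfig.__init__. Below is a minimal sketch of what that constructor presumably looks like; the default values and the model_type string are assumptions, and only the attribute names visible in this diff come from the commit:

# Hypothetical sketch of YalmConfig.__init__ (configuration_yalm.py).
# Defaults and model_type are assumptions; attribute names match the diff above.
from transformers import PretrainedConfig

class YalmConfig(PretrainedConfig):
    model_type = "yalm"  # assumed

    def __init__(
        self,
        scale_attn_by_inverse_layer_idx=True,
        activation_type="geglu",
        max_position_embeddings=1024,
        apply_residual_connection_post_layernorm=False,
        initializer_range=0.02,
        layernorm_epsilon=1e-5,
        attention_dropout=0.1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.activation_type = activation_type
        self.max_position_embeddings = max_position_embeddings
        # The fix: store the constructor argument (the previous revision left the
        # right-hand side of this assignment empty).
        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
        self.initializer_range = initializer_range
        self.layernorm_epsilon = layernorm_epsilon
        self.attention_dropout = attention_dropout
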
modeling_yalm.py → modelling_yalm.py  RENAMED
@@ -327,7 +327,7 @@ class YalmSelfAttention(nn.Module):
         attention_scores += attention_mask
         attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)

-
+        attention_probs = self.attention_dropout(attention_probs)  # TODO: why the fuck no scale???

         # =========================
         # Context layer. [sq, b, hp]
@@ -498,9 +498,9 @@ class YalmTransformerLayer(nn.Module):
         else:
             residual = hidden_states

-
-
-
+        attention_output = torch.nn.functional.dropout(
+            attention_output, p=self.hidden_dropout, training=self.training  # TODO: why the fuck no scale???
+        )
         layernorm_input = attention_output + residual

         # Layer norm post the self attention.
@@ -510,9 +510,9 @@
         mlp_output = self.mlp(layernorm_output)
         residual = layernorm_input

-
-
-
+        mlp_output = torch.nn.functional.dropout(
+            mlp_output, p=self.hidden_dropout, training=self.training  # TODO: why the fuck no scale???
+        )
         output = mlp_output + residual

         if use_cache:
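
On the "why no scale" TODOs: torch.nn.functional.dropout (and torch.nn.Dropout) implement inverted dropout, so the surviving activations are already rescaled by 1 / (1 - p) during training and the call is an identity when training is False; no additional scaling is needed at inference. A quick self-contained check (the tensor shape and p=0.1 are illustrative, not taken from the model config):

# Shows that PyTorch dropout rescales kept values by 1 / (1 - p) itself and is
# a no-op in eval mode. Shape and p are illustrative only.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.ones(4, 8)
p = 0.1

train_out = F.dropout(x, p=p, training=True)
eval_out = F.dropout(x, p=p, training=False)

kept = train_out[train_out != 0]
print(torch.allclose(kept, torch.full_like(kept, 1.0 / (1.0 - p))))  # True: kept entries are scaled
print(torch.equal(eval_out, x))                                      # True: identity when not training

If the TODO instead refers to something like Megatron-style fused bias-dropout-add, that would be a separate change; as written, the added calls follow standard PyTorch dropout semantics.
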
tokenizer_config.json  ADDED
@@ -0,0 +1,9 @@
+{
+  "auto_map": {
+    "AutoTokenizer": ["tokenization_yalm.YalmTokenizer", null]
+  },
+  "tokenizer_class": "YalmTokenizer",
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "unk_token": "<unk>"
+}
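
With the auto_map entry in place (the two-element list is [slow tokenizer class, fast tokenizer class], and null indicates no fast implementation is provided), AutoTokenizer can resolve the custom class from tokenization_yalm.py when remote code is trusted. A usage sketch follows; the repository id is a placeholder, not the actual Hub id:

# Usage sketch for the tokenizer_config.json added above. The repo id is a
# placeholder (assumption); trust_remote_code=True is required to load the
# custom tokenization_yalm.YalmTokenizer class shipped with the repository.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "your-org/yalm-100b",  # placeholder repo id
    trust_remote_code=True,
)

# Special tokens declared in tokenizer_config.json are picked up directly.
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token)  # <s> </s> <unk>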