yagizdevre committed on
Commit
43bd63d
·
1 Parent(s): c062128

added new model

Browse files
Files changed (3) hide show
  1. config.json +11 -8
  2. configuration_ministu.py +1 -1
  3. model.safetensors +2 -2
config.json CHANGED
@@ -2,10 +2,11 @@
2
  "model_type": "ministu",
3
  "_name_or_path": "STU_500M",
4
  "architectures": ["MiniSTU"],
5
- "n_embd": 768,
6
  "n_heads": 8,
7
  "n_layers": 12,
8
  "seq_len": 8192,
 
9
  "window_size": 1024,
10
  "vocab_size": 200064,
11
  "mlp_scale": 12,
@@ -15,17 +16,18 @@
15
  "use_hankel_L": false,
16
  "num_epochs": 1,
17
  "global_bsz": 524288,
18
- "bsz": 1,
19
  "warmup_steps": 1907,
20
- "eval_period": 25,
21
- "save_period": 4500,
22
  "max_lr": 3.0e-3,
23
  "min_lr": 3.0e-5,
24
  "max_norm": 1.0,
25
- "dilation": 1,
26
  "fsdp": true,
27
  "ddp": false,
28
  "mixed_precision": true,
 
29
  "use_cpu_offload": false,
30
  "sharding_strategy": "full_shard",
31
  "state_dict_type": "full",
@@ -46,10 +48,11 @@
46
  "MLP"
47
  ],
48
  "use_activation_checkpointing": true,
49
- "use_flash_fft": false,
50
  "use_approx": true,
51
  "use_attn": true,
52
  "softcap": 50.0,
 
 
53
  "torch_compile": false
54
- }
55
-
 
2
  "model_type": "ministu",
3
  "_name_or_path": "STU_500M",
4
  "architectures": ["MiniSTU"],
5
+ "n_embd": 896,
6
  "n_heads": 8,
7
  "n_layers": 12,
8
  "seq_len": 8192,
9
+ "weight_tying": true,
10
  "window_size": 1024,
11
  "vocab_size": 200064,
12
  "mlp_scale": 12,
 
16
  "use_hankel_L": false,
17
  "num_epochs": 1,
18
  "global_bsz": 524288,
19
+ "bsz": 2,
20
  "warmup_steps": 1907,
21
+ "eval_period": 50,
22
+ "save_period": 500,
23
  "max_lr": 3.0e-3,
24
  "min_lr": 3.0e-5,
25
  "max_norm": 1.0,
26
+ "dilation": 2,
27
  "fsdp": true,
28
  "ddp": false,
29
  "mixed_precision": true,
30
+ "torch_dtype": "bfloat16",
31
  "use_cpu_offload": false,
32
  "sharding_strategy": "full_shard",
33
  "state_dict_type": "full",
 
48
  "MLP"
49
  ],
50
  "use_activation_checkpointing": true,
51
+ "use_flash_fft": true,
52
  "use_approx": true,
53
  "use_attn": true,
54
  "softcap": 50.0,
55
+ "theta": 10000.0,
56
+ "use_alibi": false,
57
  "torch_compile": false
58
+ }
 
configuration_ministu.py CHANGED
@@ -7,7 +7,7 @@ class MiniSTUConfig(PretrainedConfig):
7
  def __init__(
8
  self,
9
  bsz: int = 1,
10
- n_embd: int = 768,
11
  n_heads: int = 8,
12
  n_layers: int = 12,
13
  seq_len: int = 8192,
 
7
  def __init__(
8
  self,
9
  bsz: int = 1,
10
+ n_embd: int = 896,
11
  n_heads: int = 8,
12
  n_layers: int = 12,
13
  seq_len: int = 8192,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:101763e8119c492ada9816aa38006cbf6ba8bbc0530224510d62b2c7e20a8bfd
3
- size 1140654808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b89bf828452423367e484d5922f23e381fe3cbcd1e9751036ed4c23b9f2af19
3
+ size 1460045528