liuxz0801 committed on
Commit
7aed995
·
1 Parent(s): 3c9da2a

7B更新config

Browse files
Files changed (2) hide show
  1. config.json +5 -9
  2. modeling_telechat.py +1 -2
config.json CHANGED
@@ -7,12 +7,12 @@
7
  "AutoConfig": "configuration_telechat.TelechatConfig",
8
  "AutoModelForCausalLM": "modeling_telechat.TelechatForCausalLM"
9
  },
10
- "attention_dropout": 0.0,
11
  "attention_softmax_in_fp32": true,
12
  "bias_dropout_fusion": true,
13
  "bos_token_id": 1,
14
  "eos_token_id": 2,
15
- "hidden_dropout": 0.0,
16
  "hidden_size": 4096,
17
  "initializer_range": 0.02,
18
  "layer_norm_epsilon": 1e-05,
@@ -21,21 +21,17 @@
21
  "n_head": 32,
22
  "n_inner": null,
23
  "n_layer": 30,
24
- "offset_alibi": 100,
25
  "pad_token_id": 3,
26
- "pretraining_tp": 2,
27
  "seq_length": 8192,
28
  "skip_bias_add": true,
29
  "skip_bias_add_qkv": false,
30
  "slow_but_exact": false,
31
- "transformers_version": "4.24.0",
32
  "unk_token_id": 0,
33
  "use_cache": true,
34
  "vocab_size": 160256,
35
  "ffn_hidden_size": 12288,
36
  "flash_attn":true,
37
- "training_seqlen":4096,
38
- "logn":false,
39
  "embed_layernorm":false
40
- }
41
-
 
7
  "AutoConfig": "configuration_telechat.TelechatConfig",
8
  "AutoModelForCausalLM": "modeling_telechat.TelechatForCausalLM"
9
  },
10
+ "attention_dropout": 0.1,
11
  "attention_softmax_in_fp32": true,
12
  "bias_dropout_fusion": true,
13
  "bos_token_id": 1,
14
  "eos_token_id": 2,
15
+ "hidden_dropout": 0.1,
16
  "hidden_size": 4096,
17
  "initializer_range": 0.02,
18
  "layer_norm_epsilon": 1e-05,
 
21
  "n_head": 32,
22
  "n_inner": null,
23
  "n_layer": 30,
 
24
  "pad_token_id": 3,
 
25
  "seq_length": 8192,
26
  "skip_bias_add": true,
27
  "skip_bias_add_qkv": false,
28
  "slow_but_exact": false,
29
+ "transformers_version": "4.30.0",
30
  "unk_token_id": 0,
31
  "use_cache": true,
32
  "vocab_size": 160256,
33
  "ffn_hidden_size": 12288,
34
  "flash_attn":true,
35
+ "training_seqlen":8192,
 
36
  "embed_layernorm":false
37
+ }
 
modeling_telechat.py CHANGED
@@ -105,8 +105,7 @@ class RotaryEmbedding(torch.nn.Module):
105
  return ntk_alpha
106
 
107
  def forward(self, x, seq_dim=0, seq_len=None):
108
- if seq_len is None:
109
- seq_len = x.shape[seq_dim]
110
  seq_len = max(seq_len, self.config.training_seqlen)
111
  ntk_alpha = self.get_ntk_alpha(seq_len)
112
  self.mscale = float(self.get_mscale(seq_len / self.config.training_seqlen))
 
105
  return ntk_alpha
106
 
107
  def forward(self, x, seq_dim=0, seq_len=None):
108
+ seq_len = x.shape[seq_dim]
 
109
  seq_len = max(seq_len, self.config.training_seqlen)
110
  ntk_alpha = self.get_ntk_alpha(seq_len)
111
  self.mscale = float(self.get_mscale(seq_len / self.config.training_seqlen))