lijie.wang committed
Commit 25ba07f · 1 Parent(s): e141dd7

del origin 3T

This view is limited to 50 files because it contains too many changes; see the raw diff for the complete change set.
Files changed (50)
  1. model_hubs/Skywork-13B-Base-3T/config.json +0 -27
  2. model_hubs/Skywork-13B-Base-3T/configuration_skywork.py +0 -89
  3. model_hubs/Skywork-13B-Base-3T/generation_config.json +0 -10
  4. model_hubs/Skywork-13B-Base-3T/modeling_skywork.py +0 -911
  5. model_hubs/Skywork-13B-Base-3T/pytorch_model-00001-of-00053.bin +0 -3
  6. model_hubs/Skywork-13B-Base-3T/pytorch_model-00002-of-00053.bin +0 -3
  7. model_hubs/Skywork-13B-Base-3T/pytorch_model-00003-of-00053.bin +0 -3
  8. model_hubs/Skywork-13B-Base-3T/pytorch_model-00004-of-00053.bin +0 -3
  9. model_hubs/Skywork-13B-Base-3T/pytorch_model-00005-of-00053.bin +0 -3
  10. model_hubs/Skywork-13B-Base-3T/pytorch_model-00006-of-00053.bin +0 -3
  11. model_hubs/Skywork-13B-Base-3T/pytorch_model-00007-of-00053.bin +0 -3
  12. model_hubs/Skywork-13B-Base-3T/pytorch_model-00008-of-00053.bin +0 -3
  13. model_hubs/Skywork-13B-Base-3T/pytorch_model-00009-of-00053.bin +0 -3
  14. model_hubs/Skywork-13B-Base-3T/pytorch_model-00010-of-00053.bin +0 -3
  15. model_hubs/Skywork-13B-Base-3T/pytorch_model-00011-of-00053.bin +0 -3
  16. model_hubs/Skywork-13B-Base-3T/pytorch_model-00012-of-00053.bin +0 -3
  17. model_hubs/Skywork-13B-Base-3T/pytorch_model-00013-of-00053.bin +0 -3
  18. model_hubs/Skywork-13B-Base-3T/pytorch_model-00014-of-00053.bin +0 -3
  19. model_hubs/Skywork-13B-Base-3T/pytorch_model-00015-of-00053.bin +0 -3
  20. model_hubs/Skywork-13B-Base-3T/pytorch_model-00016-of-00053.bin +0 -3
  21. model_hubs/Skywork-13B-Base-3T/pytorch_model-00017-of-00053.bin +0 -3
  22. model_hubs/Skywork-13B-Base-3T/pytorch_model-00018-of-00053.bin +0 -3
  23. model_hubs/Skywork-13B-Base-3T/pytorch_model-00019-of-00053.bin +0 -3
  24. model_hubs/Skywork-13B-Base-3T/pytorch_model-00020-of-00053.bin +0 -3
  25. model_hubs/Skywork-13B-Base-3T/pytorch_model-00021-of-00053.bin +0 -3
  26. model_hubs/Skywork-13B-Base-3T/pytorch_model-00022-of-00053.bin +0 -3
  27. model_hubs/Skywork-13B-Base-3T/pytorch_model-00023-of-00053.bin +0 -3
  28. model_hubs/Skywork-13B-Base-3T/pytorch_model-00024-of-00053.bin +0 -3
  29. model_hubs/Skywork-13B-Base-3T/pytorch_model-00025-of-00053.bin +0 -3
  30. model_hubs/Skywork-13B-Base-3T/pytorch_model-00026-of-00053.bin +0 -3
  31. model_hubs/Skywork-13B-Base-3T/pytorch_model-00027-of-00053.bin +0 -3
  32. model_hubs/Skywork-13B-Base-3T/pytorch_model-00028-of-00053.bin +0 -3
  33. model_hubs/Skywork-13B-Base-3T/pytorch_model-00029-of-00053.bin +0 -3
  34. model_hubs/Skywork-13B-Base-3T/pytorch_model-00030-of-00053.bin +0 -3
  35. model_hubs/Skywork-13B-Base-3T/pytorch_model-00031-of-00053.bin +0 -3
  36. model_hubs/Skywork-13B-Base-3T/pytorch_model-00032-of-00053.bin +0 -3
  37. model_hubs/Skywork-13B-Base-3T/pytorch_model-00033-of-00053.bin +0 -3
  38. model_hubs/Skywork-13B-Base-3T/pytorch_model-00034-of-00053.bin +0 -3
  39. model_hubs/Skywork-13B-Base-3T/pytorch_model-00035-of-00053.bin +0 -3
  40. model_hubs/Skywork-13B-Base-3T/pytorch_model-00036-of-00053.bin +0 -3
  41. model_hubs/Skywork-13B-Base-3T/pytorch_model-00037-of-00053.bin +0 -3
  42. model_hubs/Skywork-13B-Base-3T/pytorch_model-00038-of-00053.bin +0 -3
  43. model_hubs/Skywork-13B-Base-3T/pytorch_model-00039-of-00053.bin +0 -3
  44. model_hubs/Skywork-13B-Base-3T/pytorch_model-00040-of-00053.bin +0 -3
  45. model_hubs/Skywork-13B-Base-3T/pytorch_model-00041-of-00053.bin +0 -3
  46. model_hubs/Skywork-13B-Base-3T/pytorch_model-00042-of-00053.bin +0 -3
  47. model_hubs/Skywork-13B-Base-3T/pytorch_model-00043-of-00053.bin +0 -3
  48. model_hubs/Skywork-13B-Base-3T/pytorch_model-00044-of-00053.bin +0 -3
  49. model_hubs/Skywork-13B-Base-3T/pytorch_model-00045-of-00053.bin +0 -3
  50. model_hubs/Skywork-13B-Base-3T/pytorch_model-00046-of-00053.bin +0 -3
model_hubs/Skywork-13B-Base-3T/config.json DELETED
@@ -1,27 +0,0 @@
- {
-   "architectures": [
-     "SkyworkForCausalLM"
-   ],
-   "auto_map": {
-     "AutoConfig": "configuration_skywork.SkyworkConfig",
-     "AutoModelForCausalLM": "modeling_skywork.SkyworkForCausalLM"
-   },
-   "bos_token_id": 1,
-   "eos_token_id": 2,
-   "pad_token_id": 0,
-   "hidden_act": "silu",
-   "hidden_size": 4608,
-   "initializer_range": 0.01,
-   "intermediate_size": 12288,
-   "max_position_embeddings": 131072,
-   "model_type": "skywork",
-   "num_attention_heads": 36,
-   "num_hidden_layers": 52,
-   "num_key_value_heads": 36,
-   "rms_norm_eps": 1e-06,
-   "tie_word_embeddings": false,
-   "torch_dtype": "bfloat16",
-   "transformers_version": "4.33.1",
-   "use_cache": true,
-   "vocab_size": 65519
- }
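The deleted config above wires the checkpoint to custom code through its auto_map entries. As a rough illustration only (not part of this commit), a config laid out like this would normally be loaded with trust_remote_code enabled; the local path below is hypothetical, since the directory is removed here.

from transformers import AutoConfig

# Hypothetical local path; this directory is deleted by the commit above.
config = AutoConfig.from_pretrained(
    "model_hubs/Skywork-13B-Base-3T",
    trust_remote_code=True,  # needed because auto_map points at configuration_skywork.SkyworkConfig
)
print(config.model_type, config.hidden_size, config.num_hidden_layers)  # skywork 4608 52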
model_hubs/Skywork-13B-Base-3T/configuration_skywork.py DELETED
@@ -1,89 +0,0 @@
- # Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
- # This code is built upon Huggingface's transformers repository.
-
-
- from transformers.configuration_utils import PretrainedConfig
- from transformers.utils import logging
-
-
- logger = logging.get_logger(__name__)
-
- LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
-
-
- class SkyworkConfig(PretrainedConfig):
-
-     model_type = "skywork"
-     keys_to_ignore_at_inference = ["past_key_values"]
-
-     def __init__(
-         self,
-         vocab_size=32000,
-         hidden_size=4096,
-         intermediate_size=11008,
-         num_hidden_layers=32,
-         num_attention_heads=32,
-         num_key_value_heads=None,
-         hidden_act="silu",
-         max_position_embeddings=2048,
-         initializer_range=0.02,
-         rms_norm_eps=1e-6,
-         use_cache=True,
-         pad_token_id=None,
-         bos_token_id=1,
-         eos_token_id=2,
-         pretraining_tp=1,
-         tie_word_embeddings=False,
-         rope_theta=10000.0,
-         rope_scaling=None,
-         **kwargs,
-     ):
-         self.vocab_size = vocab_size
-         self.max_position_embeddings = max_position_embeddings
-         self.hidden_size = hidden_size
-         self.intermediate_size = intermediate_size
-         self.num_hidden_layers = num_hidden_layers
-         self.num_attention_heads = num_attention_heads
-
-         # for backward compatibility
-         if num_key_value_heads is None:
-             num_key_value_heads = num_attention_heads
-
-         self.num_key_value_heads = num_key_value_heads
-         self.hidden_act = hidden_act
-         self.initializer_range = initializer_range
-         self.rms_norm_eps = rms_norm_eps
-         self.pretraining_tp = pretraining_tp
-         self.use_cache = use_cache
-         self.rope_theta = rope_theta
-         self.rope_scaling = rope_scaling
-         self._rope_scaling_validation()
-
-         super().__init__(
-             pad_token_id=pad_token_id,
-             bos_token_id=bos_token_id,
-             eos_token_id=eos_token_id,
-             tie_word_embeddings=tie_word_embeddings,
-             **kwargs,
-         )
-
-     def _rope_scaling_validation(self):
-         """
-         Validate the `rope_scaling` configuration.
-         """
-         if self.rope_scaling is None:
-             return
-
-         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
-             raise ValueError(
-                 "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
-                 f"got {self.rope_scaling}"
-             )
-         rope_scaling_type = self.rope_scaling.get("type", None)
-         rope_scaling_factor = self.rope_scaling.get("factor", None)
-         if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "ntk"]:
-             raise ValueError(
-                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
-             )
-         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
-             raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
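A small usage sketch (not from the repository): how the _rope_scaling_validation hook above behaves when SkyworkConfig is constructed directly, assuming the deleted module were still importable locally.

# Assumes configuration_skywork.py is on the import path (it is removed by this commit).
from configuration_skywork import SkyworkConfig

# Accepted: exactly two fields, "type" in {"linear", "dynamic", "ntk"} and "factor" a float > 1.0
cfg = SkyworkConfig(rope_scaling={"type": "ntk", "factor": 100.0})

# Rejected: factor <= 1.0 trips _rope_scaling_validation() and raises ValueError
try:
    SkyworkConfig(rope_scaling={"type": "linear", "factor": 1.0})
except ValueError as err:
    print(err)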
model_hubs/Skywork-13B-Base-3T/generation_config.json DELETED
@@ -1,10 +0,0 @@
- {
-   "bos_token_id": 1,
-   "do_sample": true,
-   "eos_token_id": 2,
-   "max_length": 4096,
-   "pad_token_id": 0,
-   "temperature": 0.6,
-   "top_p": 0.9,
-   "transformers_version": "4.33.1"
- }
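For illustration only: the same sampling defaults expressed through the transformers GenerationConfig API, which is what this JSON file is deserialized into at load time.

from transformers import GenerationConfig

generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    max_length=4096,
    bos_token_id=1,
    eos_token_id=2,
    pad_token_id=0,
)
# model.generate(**inputs, generation_config=generation_config) would then sample with these defaults.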
model_hubs/Skywork-13B-Base-3T/modeling_skywork.py DELETED
@@ -1,911 +0,0 @@
- # Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
- # This code is built upon Huggingface's transformers repository.
-
- import math
- from typing import List, Optional, Tuple, Union
-
- import torch
- import torch.nn.functional as F
- import torch.utils.checkpoint
- from torch import nn
- from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-
- from transformers.activations import ACT2FN
- from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
- from transformers.modeling_utils import PreTrainedModel
- from transformers.utils import logging
- from .configuration_skywork import SkyworkConfig
-
-
- logger = logging.get_logger(__name__)
-
- _CONFIG_FOR_DOC = "SkyworkConfig"
-
-
- # Copied from transformers.models.bart.modeling_bart._make_causal_mask
- def _make_causal_mask(
-     input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
- ):
-     """
-     Make causal mask used for bi-directional self-attention.
-     """
-     bsz, tgt_len = input_ids_shape
-     mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
-     mask_cond = torch.arange(mask.size(-1), device=device)
-     mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
-     mask = mask.to(dtype)
-
-     if past_key_values_length > 0:
-         mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
-     return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
-
-
- # Copied from transformers.models.bart.modeling_bart._expand_mask
- def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
-     """
-     Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
-     """
-     bsz, src_len = mask.size()
-     tgt_len = tgt_len if tgt_len is not None else src_len
-
-     expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
-
-     inverted_mask = 1.0 - expanded_mask
-
-     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
-
-
- class SkyworkRMSNorm(nn.Module):
-     def __init__(self, hidden_size, eps=1e-6):
-         """
-         SkyworkRMSNorm is equivalent to T5LayerNorm
-         """
-         super().__init__()
-         self.weight = nn.Parameter(torch.ones(hidden_size))
-         self.variance_epsilon = eps
-
-     def forward(self, hidden_states):
-         input_dtype = hidden_states.dtype
-         hidden_states = hidden_states.to(torch.float32)
-         variance = hidden_states.pow(2).mean(-1, keepdim=True)
-         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-         return self.weight * hidden_states.to(input_dtype)
-
-
- class SkyworkRotaryEmbedding(torch.nn.Module):
-     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
-         super().__init__()
-
-         self.dim = dim
-         self.max_position_embeddings = max_position_embeddings
-         self.base = base
-         inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
-         self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-         # Build here to make `torch.jit.trace` work.
-         self._set_cos_sin_cache(
-             seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
-         )
-
-     def _set_cos_sin_cache(self, seq_len, device, dtype):
-         self.max_seq_len_cached = seq_len
-         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
-
-         freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-         # Different from paper, but it uses a different permutation in order to obtain the same calculation
-         emb = torch.cat((freqs, freqs), dim=-1)
-         self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
-         self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
-
-     def forward(self, x, seq_len=None):
-         # x: [bs, num_attention_heads, seq_len, head_size]
-         if seq_len > self.max_seq_len_cached:
-             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
-
-         return (
-             self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
-             self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
-         )
-
-
- class SkyworkLinearScalingRotaryEmbedding(SkyworkRotaryEmbedding):
-     """SkyworkRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
-
-     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
-         self.scaling_factor = scaling_factor
-         super().__init__(dim, max_position_embeddings, base, device)
-
-     def _set_cos_sin_cache(self, seq_len, device, dtype):
-         self.max_seq_len_cached = seq_len
-         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
-         t = t / self.scaling_factor
-
-         freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-         # Different from paper, but it uses a different permutation in order to obtain the same calculation
-         emb = torch.cat((freqs, freqs), dim=-1)
-         self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
-         self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
-
-
- class SkyworkDynamicNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
-     """SkyworkRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
-
-     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
-         self.scaling_factor = scaling_factor
-         super().__init__(dim, max_position_embeddings, base, device)
-
-     def _set_cos_sin_cache(self, seq_len, device, dtype):
-         self.max_seq_len_cached = seq_len
-
-         if seq_len > self.max_position_embeddings:
-             base = self.base * (
-                 (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
-             ) ** (self.dim / (self.dim - 2))
-             inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
-             self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
-
-         freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-         # Different from paper, but it uses a different permutation in order to obtain the same calculation
-         emb = torch.cat((freqs, freqs), dim=-1)
-         self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
-         self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
-
-
-
- class SkyworkNTKScalingRotaryEmbedding(torch.nn.Module):
-     def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=100, device=None):
-         super().__init__()
-
-         self.dim = dim
-         self.max_position_embeddings = max_position_embeddings
-         self.base = base * scaling_factor
-         inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
-         self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-         # Build here to make `torch.jit.trace` work.
-         self._set_cos_sin_cache(
-             seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
-         )
-
-     def _set_cos_sin_cache(self, seq_len, device, dtype):
-         self.max_seq_len_cached = seq_len
-         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
-         freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-         emb = torch.cat((freqs, freqs), dim=-1)
-         self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
-         self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
-
-     def forward(self, x, seq_len=None):
-         if seq_len > self.max_seq_len_cached:
-             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
-
-         return (
-             self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
-             self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
-         )
-
- def rotate_half(x):
-     """Rotates half the hidden dims of the input."""
-     x1 = x[..., : x.shape[-1] // 2]
-     x2 = x[..., x.shape[-1] // 2 :]
-     return torch.cat((-x2, x1), dim=-1)
-
-
- def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
-     # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
-     cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
-     sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
-     cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
-     sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
-     q_embed = (q * cos) + (rotate_half(q) * sin)
-     k_embed = (k * cos) + (rotate_half(k) * sin)
-     return q_embed, k_embed
-
-
- class SkyworkMLP(nn.Module):
-     def __init__(self, config):
-         super().__init__()
-         self.config = config
-         self.hidden_size = config.hidden_size
-         self.intermediate_size = config.intermediate_size
-         self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-         self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
-         self.act_fn = ACT2FN[config.hidden_act]
-
-     def forward(self, x):
-         if self.config.pretraining_tp > 1:
-             slice = self.intermediate_size // self.config.pretraining_tp
-             gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
-             up_proj_slices = self.up_proj.weight.split(slice, dim=0)
-             down_proj_slices = self.down_proj.weight.split(slice, dim=1)
-
-             gate_proj = torch.cat(
-                 [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
-             )
-             up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
-
-             intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
-             down_proj = [
-                 F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
-             ]
-             down_proj = sum(down_proj)
-         else:
-             down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-
-         return down_proj
-
-
- def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-     """
-     This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
-     num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
-     """
-     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-     if n_rep == 1:
-         return hidden_states
-     hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
-     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
- class SkyworkAttention(nn.Module):
-     """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-     def __init__(self, config: SkyworkConfig):
-         super().__init__()
-         self.config = config
-         self.hidden_size = config.hidden_size
-         self.num_heads = config.num_attention_heads
-         self.head_dim = self.hidden_size // self.num_heads
-         self.num_key_value_heads = config.num_key_value_heads
-         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
-         self.max_position_embeddings = config.max_position_embeddings
-         self.rope_theta = config.rope_theta
-
-         if (self.head_dim * self.num_heads) != self.hidden_size:
-             raise ValueError(
-                 f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
-                 f" and `num_heads`: {self.num_heads})."
-             )
-         self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
-         self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
-         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
-         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
-         self._init_rope()
-
-     def _init_rope(self):
-         if self.config.rope_scaling is None:
-             self.rotary_emb = SkyworkRotaryEmbedding(
-                 self.head_dim,
-                 max_position_embeddings=self.max_position_embeddings,
-                 base=self.rope_theta,
-             )
-         else:
-             scaling_type = self.config.rope_scaling["type"]
-             scaling_factor = self.config.rope_scaling["factor"]
-             if scaling_type == "linear":
-                 self.rotary_emb = SkyworkLinearScalingRotaryEmbedding(
-                     self.head_dim,
-                     max_position_embeddings=self.max_position_embeddings,
-                     scaling_factor=scaling_factor,
-                     base=self.rope_theta,
-                 )
-             elif scaling_type == "dynamic":
-                 self.rotary_emb = SkyworkDynamicNTKScalingRotaryEmbedding(
-                     self.head_dim,
-                     max_position_embeddings=self.max_position_embeddings,
-                     scaling_factor=scaling_factor,
-                     base=self.rope_theta,
-                 )
-             elif scaling_type == "ntk":
-                 self.rotary_emb = SkyworkNTKScalingRotaryEmbedding(
-                     self.head_dim,
-                     max_position_embeddings=self.max_position_embeddings,
-                     scaling_factor=scaling_factor,
-                     base=self.rope_theta,
-                 )
-             else:
-                 raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
-             print('-'*80)
-             print(f"USING COSTOM MODELING, scaling_type is {scaling_type}, scaling_factor is {scaling_factor}")
-
-     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
-
-     def forward(
-         self,
-         hidden_states: torch.Tensor,
-         attention_mask: Optional[torch.Tensor] = None,
-         position_ids: Optional[torch.LongTensor] = None,
-         past_key_value: Optional[Tuple[torch.Tensor]] = None,
-         output_attentions: bool = False,
-         use_cache: bool = False,
-     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-         bsz, q_len, _ = hidden_states.size()
-
-         if self.config.pretraining_tp > 1:
-             key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
-             query_slices = self.q_proj.weight.split(
-                 (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
-             )
-             key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
-             value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
-
-             query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
-             query_states = torch.cat(query_states, dim=-1)
-
-             key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
-             key_states = torch.cat(key_states, dim=-1)
-
-             value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
-             value_states = torch.cat(value_states, dim=-1)
-
-         else:
-             query_states = self.q_proj(hidden_states)
-             key_states = self.k_proj(hidden_states)
-             value_states = self.v_proj(hidden_states)
-
-         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-         kv_seq_len = key_states.shape[-2]
-         if past_key_value is not None:
-             kv_seq_len += past_key_value[0].shape[-2]
-         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-
-         if past_key_value is not None:
-             # reuse k, v, self_attention
-             key_states = torch.cat([past_key_value[0], key_states], dim=2)
-             value_states = torch.cat([past_key_value[1], value_states], dim=2)
-
-         past_key_value = (key_states, value_states) if use_cache else None
-
-         # repeat k/v heads if n_kv_heads < n_heads
-         key_states = repeat_kv(key_states, self.num_key_value_groups)
-         value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
-
-         if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
-             raise ValueError(
-                 f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
-                 f" {attn_weights.size()}"
-             )
-
-         if attention_mask is not None:
-             if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                 raise ValueError(
-                     f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                 )
-             attn_weights = attn_weights + attention_mask
-
-         # upcast attention to fp32
-         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
-         attn_output = torch.matmul(attn_weights, value_states)
-
-         if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
-             raise ValueError(
-                 f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
-                 f" {attn_output.size()}"
-             )
-
-         attn_output = attn_output.transpose(1, 2).contiguous()
-         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-
-         if self.config.pretraining_tp > 1:
-             attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
-             o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
-             attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
-         else:
-             attn_output = self.o_proj(attn_output)
-
-         if not output_attentions:
-             attn_weights = None
-
-         return attn_output, attn_weights, past_key_value
-
-
- class SkyworkDecoderLayer(nn.Module):
-     def __init__(self, config: SkyworkConfig):
-         super().__init__()
-         self.hidden_size = config.hidden_size
-         self.self_attn = SkyworkAttention(config=config)
-         self.mlp = SkyworkMLP(config)
-         self.input_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-         self.post_attention_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-     def forward(
-         self,
-         hidden_states: torch.Tensor,
-         attention_mask: Optional[torch.Tensor] = None,
-         position_ids: Optional[torch.LongTensor] = None,
-         past_key_value: Optional[Tuple[torch.Tensor]] = None,
-         output_attentions: Optional[bool] = False,
-         use_cache: Optional[bool] = False,
-     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-         """
-         Args:
-             hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-             attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
-                 `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-             output_attentions (`bool`, *optional*):
-                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                 returned tensors for more detail.
-             use_cache (`bool`, *optional*):
-                 If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-                 (see `past_key_values`).
-             past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
-         """
-
-         residual = hidden_states
-
-         hidden_states = self.input_layernorm(hidden_states)
-
-         # Self Attention
-         hidden_states, self_attn_weights, present_key_value = self.self_attn(
-             hidden_states=hidden_states,
-             attention_mask=attention_mask,
-             position_ids=position_ids,
-             past_key_value=past_key_value,
-             output_attentions=output_attentions,
-             use_cache=use_cache,
-         )
-         hidden_states = residual + hidden_states
-
-         # Fully Connected
-         residual = hidden_states
-         hidden_states = self.post_attention_layernorm(hidden_states)
-         hidden_states = self.mlp(hidden_states)
-         hidden_states = residual + hidden_states
-
-         outputs = (hidden_states,)
-
-         if output_attentions:
-             outputs += (self_attn_weights,)
-
-         if use_cache:
-             outputs += (present_key_value,)
-
-         return outputs
-
- class SkyworkPreTrainedModel(PreTrainedModel):
-     config_class = SkyworkConfig
-     base_model_prefix = "model"
-     supports_gradient_checkpointing = True
-     _no_split_modules = ["SkyworkDecoderLayer"]
-     _skip_keys_device_placement = "past_key_values"
-
-     def _init_weights(self, module):
-         std = self.config.initializer_range
-         if isinstance(module, nn.Linear):
-             module.weight.data.normal_(mean=0.0, std=std)
-             if module.bias is not None:
-                 module.bias.data.zero_()
-         elif isinstance(module, nn.Embedding):
-             module.weight.data.normal_(mean=0.0, std=std)
-             if module.padding_idx is not None:
-                 module.weight.data[module.padding_idx].zero_()
-
-     def _set_gradient_checkpointing(self, module, value=False):
-         if isinstance(module, SkyworkModel):
-             module.gradient_checkpointing = value
-
- class SkyworkModel(SkyworkPreTrainedModel):
-     """
-     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`SkyworkDecoderLayer`]
-
-     Args:
-         config: SkyworkConfig
-     """
-
-     def __init__(self, config: SkyworkConfig):
-         super().__init__(config)
-         self.padding_idx = config.pad_token_id
-         self.vocab_size = config.vocab_size
-
-         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-         self.layers = nn.ModuleList([SkyworkDecoderLayer(config) for _ in range(config.num_hidden_layers)])
-         self.norm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-         self.gradient_checkpointing = False
-         # Initialize weights and apply final processing
-         self.post_init()
-
-     def get_input_embeddings(self):
-         return self.embed_tokens
-
-     def set_input_embeddings(self, value):
-         self.embed_tokens = value
-
-     # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
-     def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
-         # create causal mask
-         # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-         combined_attention_mask = None
-         if input_shape[-1] > 1:
-             combined_attention_mask = _make_causal_mask(
-                 input_shape,
-                 inputs_embeds.dtype,
-                 device=inputs_embeds.device,
-                 past_key_values_length=past_key_values_length,
-             )
-
-         if attention_mask is not None:
-             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-             expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
-                 inputs_embeds.device
-             )
-             combined_attention_mask = (
-                 expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
-             )
-
-         return combined_attention_mask
-
-     def forward(
-         self,
-         input_ids: torch.LongTensor = None,
-         attention_mask: Optional[torch.Tensor] = None,
-         position_ids: Optional[torch.LongTensor] = None,
-         past_key_values: Optional[List[torch.FloatTensor]] = None,
-         inputs_embeds: Optional[torch.FloatTensor] = None,
-         use_cache: Optional[bool] = None,
-         output_attentions: Optional[bool] = None,
-         output_hidden_states: Optional[bool] = None,
-         return_dict: Optional[bool] = None,
-     ) -> Union[Tuple, BaseModelOutputWithPast]:
-         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-         output_hidden_states = (
-             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-         )
-         use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-         # retrieve input_ids and inputs_embeds
-         if input_ids is not None and inputs_embeds is not None:
-             raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
-         elif input_ids is not None:
-             batch_size, seq_length = input_ids.shape
-         elif inputs_embeds is not None:
-             batch_size, seq_length, _ = inputs_embeds.shape
-         else:
-             raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
-
-         seq_length_with_past = seq_length
-         past_key_values_length = 0
-
-         if past_key_values is not None:
-             past_key_values_length = past_key_values[0][0].shape[2]
-             seq_length_with_past = seq_length_with_past + past_key_values_length
-
-         if position_ids is None:
-             device = input_ids.device if input_ids is not None else inputs_embeds.device
-             position_ids = torch.arange(
-                 past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
-             )
-             position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
-         else:
-             position_ids = position_ids.view(-1, seq_length).long()
-
-         if inputs_embeds is None:
-             inputs_embeds = self.embed_tokens(input_ids)
-         # embed positions
-         if attention_mask is None:
-             attention_mask = torch.ones(
-                 (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
-             )
-         attention_mask = self._prepare_decoder_attention_mask(
-             attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
-         )
-
-         hidden_states = inputs_embeds
-
-         if self.gradient_checkpointing and self.training:
-             if use_cache:
-                 logger.warning_once(
-                     "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                 )
-                 use_cache = False
-
-         # decoder layers
-         all_hidden_states = () if output_hidden_states else None
-         all_self_attns = () if output_attentions else None
-         next_decoder_cache = () if use_cache else None
-
-         for idx, decoder_layer in enumerate(self.layers):
-             if output_hidden_states:
-                 all_hidden_states += (hidden_states,)
-
-             past_key_value = past_key_values[idx] if past_key_values is not None else None
-
-             if self.gradient_checkpointing and self.training:
-
-                 def create_custom_forward(module):
-                     def custom_forward(*inputs):
-                         # None for past_key_value
-                         return module(*inputs, past_key_value, output_attentions)
-
-                     return custom_forward
-
-                 layer_outputs = torch.utils.checkpoint.checkpoint(
-                     create_custom_forward(decoder_layer),
-                     hidden_states,
-                     attention_mask,
-                     position_ids,
-                 )
-             else:
-                 layer_outputs = decoder_layer(
-                     hidden_states,
-                     attention_mask=attention_mask,
-                     position_ids=position_ids,
-                     past_key_value=past_key_value,
-                     output_attentions=output_attentions,
-                     use_cache=use_cache,
-                 )
-
-             hidden_states = layer_outputs[0]
-
-             if use_cache:
-                 next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
-
-             if output_attentions:
-                 all_self_attns += (layer_outputs[1],)
-
-         hidden_states = self.norm(hidden_states)
-
-         # add hidden states from the last decoder layer
-         if output_hidden_states:
-             all_hidden_states += (hidden_states,)
-
-         next_cache = next_decoder_cache if use_cache else None
-         if not return_dict:
-             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
-         return BaseModelOutputWithPast(
-             last_hidden_state=hidden_states,
-             past_key_values=next_cache,
-             hidden_states=all_hidden_states,
-             attentions=all_self_attns,
-         )
-
-
- class SkyworkForCausalLM(SkyworkPreTrainedModel):
-     _tied_weights_keys = ["lm_head.weight"]
-
-     def __init__(self, config):
-         super().__init__(config)
-         self.model = SkyworkModel(config)
-         self.vocab_size = config.vocab_size
-         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
-         # Initialize weights and apply final processing
-         self.post_init()
-
-     def get_input_embeddings(self):
-         return self.model.embed_tokens
-
-     def set_input_embeddings(self, value):
-         self.model.embed_tokens = value
-
-     def get_output_embeddings(self):
-         return self.lm_head
-
-     def set_output_embeddings(self, new_embeddings):
-         self.lm_head = new_embeddings
-
-     def set_decoder(self, decoder):
-         self.model = decoder
-
-     def get_decoder(self):
-         return self.model
-
-     def forward(
-         self,
-         input_ids: torch.LongTensor = None,
-         attention_mask: Optional[torch.Tensor] = None,
-         position_ids: Optional[torch.LongTensor] = None,
-         past_key_values: Optional[List[torch.FloatTensor]] = None,
-         inputs_embeds: Optional[torch.FloatTensor] = None,
-         labels: Optional[torch.LongTensor] = None,
-         use_cache: Optional[bool] = None,
-         output_attentions: Optional[bool] = None,
-         output_hidden_states: Optional[bool] = None,
-         return_dict: Optional[bool] = None,
-     ) -> Union[Tuple, CausalLMOutputWithPast]:
-
-         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-         output_hidden_states = (
-             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-         )
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-         outputs = self.model(
-             input_ids=input_ids,
-             attention_mask=attention_mask,
-             position_ids=position_ids,
-             past_key_values=past_key_values,
-             inputs_embeds=inputs_embeds,
-             use_cache=use_cache,
-             output_attentions=output_attentions,
-             output_hidden_states=output_hidden_states,
-             return_dict=return_dict,
-         )
-
-         hidden_states = outputs[0]
-         if self.config.pretraining_tp > 1:
-             lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
-             logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
-             logits = torch.cat(logits, dim=-1)
-         else:
-             logits = self.lm_head(hidden_states)
-         logits = logits.float()
-
-         loss = None
-         if labels is not None:
-             # Shift so that tokens < n predict n
-             shift_logits = logits[..., :-1, :].contiguous()
-             shift_labels = labels[..., 1:].contiguous()
-             # Flatten the tokens
-             loss_fct = CrossEntropyLoss()
-             shift_logits = shift_logits.view(-1, self.config.vocab_size)
-             shift_labels = shift_labels.view(-1)
-             # Enable model parallelism
-             shift_labels = shift_labels.to(shift_logits.device)
-             loss = loss_fct(shift_logits, shift_labels)
-
-         if not return_dict:
-             output = (logits,) + outputs[1:]
-             return (loss,) + output if loss is not None else output
-
-         return CausalLMOutputWithPast(
-             loss=loss,
-             logits=logits,
-             past_key_values=outputs.past_key_values,
-             hidden_states=outputs.hidden_states,
-             attentions=outputs.attentions,
-         )
-
-     def prepare_inputs_for_generation(
-         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
-     ):
-         if past_key_values:
-             input_ids = input_ids[:, -1:]
-
-         position_ids = kwargs.get("position_ids", None)
-         if attention_mask is not None and position_ids is None:
-             # create position_ids on the fly for batch generation
-             position_ids = attention_mask.long().cumsum(-1) - 1
-             position_ids.masked_fill_(attention_mask == 0, 1)
-             if past_key_values:
-                 position_ids = position_ids[:, -1].unsqueeze(-1)
-
-         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-         if inputs_embeds is not None and past_key_values is None:
-             model_inputs = {"inputs_embeds": inputs_embeds}
-         else:
-             model_inputs = {"input_ids": input_ids}
-
-         model_inputs.update(
-             {
-                 "position_ids": position_ids,
-                 "past_key_values": past_key_values,
-                 "use_cache": kwargs.get("use_cache"),
-                 "attention_mask": attention_mask,
-             }
-         )
-         return model_inputs
-
-     @staticmethod
-     def _reorder_cache(past_key_values, beam_idx):
-         reordered_past = ()
-         for layer_past in past_key_values:
-             reordered_past += (
-                 tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
-             )
-         return reordered_past
-
-
- class SkyworkForSequenceClassification(SkyworkPreTrainedModel):
-     def __init__(self, config):
-         super().__init__(config)
-         self.num_labels = config.num_labels
-         self.model = SkyworkModel(config)
-         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
-
-         # Initialize weights and apply final processing
-         self.post_init()
-
-     def get_input_embeddings(self):
-         return self.model.embed_tokens
-
-     def set_input_embeddings(self, value):
-         self.model.embed_tokens = value
-
-     def forward(
-         self,
-         input_ids: torch.LongTensor = None,
-         attention_mask: Optional[torch.Tensor] = None,
-         position_ids: Optional[torch.LongTensor] = None,
-         past_key_values: Optional[List[torch.FloatTensor]] = None,
-         inputs_embeds: Optional[torch.FloatTensor] = None,
-         labels: Optional[torch.LongTensor] = None,
-         use_cache: Optional[bool] = None,
-         output_attentions: Optional[bool] = None,
-         output_hidden_states: Optional[bool] = None,
-         return_dict: Optional[bool] = None,
-     ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
-
-
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-         transformer_outputs = self.model(
-             input_ids,
-             attention_mask=attention_mask,
-             position_ids=position_ids,
-             past_key_values=past_key_values,
-             inputs_embeds=inputs_embeds,
-             use_cache=use_cache,
-             output_attentions=output_attentions,
-             output_hidden_states=output_hidden_states,
-             return_dict=return_dict,
-         )
-         hidden_states = transformer_outputs[0]
-         logits = self.score(hidden_states)
-
-         if input_ids is not None:
-             batch_size = input_ids.shape[0]
-         else:
-             batch_size = inputs_embeds.shape[0]
-
-         if self.config.pad_token_id is None and batch_size != 1:
-             raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
-         if self.config.pad_token_id is None:
-             sequence_lengths = -1
-         else:
-             if input_ids is not None:
-                 sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
-                     logits.device
-                 )
-             else:
-                 sequence_lengths = -1
-
-         pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
-
-         loss = None
-         if labels is not None:
-             labels = labels.to(logits.device)
-             if self.config.problem_type is None:
-                 if self.num_labels == 1:
-                     self.config.problem_type = "regression"
-                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
-                     self.config.problem_type = "single_label_classification"
-                 else:
-                     self.config.problem_type = "multi_label_classification"
-
-             if self.config.problem_type == "regression":
-                 loss_fct = MSELoss()
-                 if self.num_labels == 1:
-                     loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
-                 else:
-                     loss = loss_fct(pooled_logits, labels)
-             elif self.config.problem_type == "single_label_classification":
-                 loss_fct = CrossEntropyLoss()
-                 loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
-             elif self.config.problem_type == "multi_label_classification":
-                 loss_fct = BCEWithLogitsLoss()
-                 loss = loss_fct(pooled_logits, labels)
-         if not return_dict:
-             output = (pooled_logits,) + transformer_outputs[1:]
-             return ((loss,) + output) if loss is not None else output
-
-         return SequenceClassifierOutputWithPast(
-             loss=loss,
-             logits=pooled_logits,
-             past_key_values=transformer_outputs.past_key_values,
-             hidden_states=transformer_outputs.hidden_states,
-             attentions=transformer_outputs.attentions,
-         )
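Hypothetical usage sketch (not part of the commit): how SkyworkForCausalLM above would typically have been reached through the auto_map entries in the deleted config.json before the checkpoint shards were removed. The local path is illustrative.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "model_hubs/Skywork-13B-Base-3T"  # hypothetical; this directory is deleted by the commit
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in the deleted config.json
    trust_remote_code=True,      # resolves modeling_skywork.SkyworkForCausalLM via auto_map
)
inputs = tokenizer("Hello", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))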
model_hubs/Skywork-13B-Base-3T/pytorch_model-00001-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:69626f67345dd2378ea1155f152804fb4886b151f2e43ebe3b2d6f33c80e606e
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00002-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a7b651c6dde0c0a430a94dce24d3560bd07db9ed35f1f1cac9edd530e441b5f0
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00003-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:8359b7ecc78b02a619751c96f60aec6fee4a2595db3f36cd61a5391838fc7ce1
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00004-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:69c333cb8fcfbe365e3bdcd260e5ff91601da65662a5718d002639937ef3cefb
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00005-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4277ccd1d2175e039075cc6fe2b95e213a590e9eabd35ef26785b998b4f2ad84
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00006-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:596649ca586587c17956487cda102ce7ce3c5c950ad89ba1bb8c9ef9224b5a01
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00007-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:953fdae0c98a276f579647ab7595cf9548ec3e46cc433364ba23cfa9b2a77e0a
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00008-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9648394bc03bc87c8913c31893fbdc55dbbbfeaf7041fc4e5b4946469261f026
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00009-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:72c026f7373e85d29e17dd6501c0622da44211273993cc641da8551468a8063d
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00010-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9b1f53581597d14404ffd763f646f2a8346f89d3e84fd17f88ea0bc779bcd8cf
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00011-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4be0dc98193a0e3f421c7d2ebe1fa910ea62d70e9bb32b0f4cac7b69326c550c
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00012-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:94f369d5ba5c26dbf9b2b4ab27803068b220db8a38db5ac644eb24fa79a7b963
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00013-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b766e0e9ceb543f6ca5ebd5e4dac937bf58a7e14a3d13e1ae6d50fec820bfb8b
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00014-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a5806356cccd558f1d05643ba00e23db77a8c06a174bfa5831efcd8283582776
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00015-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:d198045292b6d3d0a78f907d67491541e4e5768feb9dfcde5dda18a7f8d8cdfa
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00016-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:db78b41b807f6c7b1bb6aa7372dbc14c9da9ce0cd4d7c4e89955cfa6e4400f0b
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00017-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:87ef520c42607a36a63f8c7e5e6513e46ccdcbaa077f6a7fa0a17f5663a19cae
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00018-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a43230a0381dc5d4a45f874376d6d1b7c24fb02dd6d9ad62dc934fa4bdbf22c9
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00019-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4ea20023ae3238034245a47e2d782fa43798074640af65d355d08b5e2c7a3968
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00020-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:18465d3d867b0c517d4dcdf61c8088e26a7a16c42d5d17e6efa1d685725d5028
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00021-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:977e108f2584ff9a4f846f3ec4b515c4e28216b1439df82adb7e611a452929a7
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00022-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7236496347fda53cf7d1c4daef8c380b6e42d1ec0a6edb44fd9d0e38b6db6419
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00023-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b17bedf66127af7638e660befbc5defd047024d8b63fcd590626af272e3c5cc1
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00024-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:e76c6c43337c94f45fd1b3f3365d14dfabbfd38cfc9074db75b18d9504052d67
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00025-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:250d260874617a3944545996ba6bd0291b5c748a6e8c603a235fb1f882a93b47
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00026-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:52023afeed136ba561793db69a90c23655f637175890e19080a01f62f87cdac9
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00027-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:54d3e82f5176315c470e4615fd5e1be85383351077fcceb684ab436f3e6796a1
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00028-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7aff27ef911125a991c6b58d8e809b319e847ff558cd12d49b025e06d3c5728e
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00029-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c1832c51bef229f9014fa518c5cff4dcc39723ab853c880335796f5740914c80
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00030-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4a72d1a732e9a99abf661f5e109c589635d1a86886bac7f2aa80a7b6c409f8be
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00031-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4114c138eb5a1bf68bd29a125c31e24201c3a5cf95ec3757c2b2f366b45befe6
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00032-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:31aaa207f4a4c1042868c0320cec86c8f7aed5ba0940c2552f9c6be88526398a
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00033-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:abf15bdd7289fabf72251cd45026e78b6bdd5c8a8c849d3ab4658521a9b5383e
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00034-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a7ce1cb13ef764d3da8c01e65a7cb12e9b78fc7410555441851ad05138cbfdbc
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00035-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b0c95670bea6f294572d6e0c7d6f410a378ed546c8beb239ee86d4b858574096
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00036-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:de7e7cc3ece84abbc58aa053644369e5efce366da96dfd94a70477eaacc4edcc
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00037-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:46c8254b6f12d8bb032f4cfa9c4596f06692eb307cfd2a617d7c4639282fdcf4
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00038-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:6784ebd555e163c58262116bd192a2bc2679d30311c8387fb8a652f9ae3d082b
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00039-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:30db78de4bdbedd9066f08a8f0fc4fa6558ec22503df34d289bcad5dee26dbb8
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00040-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:571fe808760c495bf1b97396ec9ef4ce3bd5c1003d692dfcb7d3ad7197023a67
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00041-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9ca91a4280c92688d97088bf2d3f1a0da6dcbed864f91f73ae7cec4bfc8496a0
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00042-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:572a6a020f42889d4e41596017022e880a4565cac9fa7b6072e6db21955dee78
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00043-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:59ebb7958ba6de1cd89e31ef83e6786eacc07d8de467234543a6356466e820e0
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00044-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:e3b364a668f019e0896567dfcde7ed9d131bef3b602b4bb9e932430820ca4101
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00045-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c1c4819c9218302cfe4ad737bc079eb75aff0c58f854e82d391de66b8ae8724c
- size 509630194
model_hubs/Skywork-13B-Base-3T/pytorch_model-00046-of-00053.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:80692b714d33ec725a47009bec70a296365667698ea8d8b49c48d49cf753f969
- size 509630194
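Note that each deleted .bin entry above is a Git LFS pointer (version/oid/size), not the tensor data itself. A back-of-the-envelope sketch of the storage referenced by the shards visible in this truncated view:

shard_size_bytes = 509_630_194  # "size" field of every pointer shown above
visible_shards = 46             # this view is limited to 50 files; the checkpoint has 53 shards in total
print(f"~{visible_shards * shard_size_bytes / 1e9:.1f} GB across the visible shards alone")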