chentianqi committed (verified)
Commit 2f7d2d1 · 1 parent: 8fc9f5f

Update README.md

Files changed (1):
1. README.md (+182 -14)
README.md CHANGED
@@ -49,10 +49,29 @@ This model has been 4-bit quantized Llada-8B-Base model with [GPTQModel](https:/

## Example:
```python
+
+ # Copyright 2024-2025 ModelCloud.ai
+ # Copyright 2024-2025 [email protected]
+ # Contact: [email protected], x.com/qubitium
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
import torch
from datasets import load_dataset
- from gptqmodel import GPTQModel, QuantizeConfig, BACKEND
- from transformers import AutoTokenizer, AutoModelForCausalLM
+ from gptqmodel import GPTQModel, QuantizeConfig, BACKEND
+ from gptqmodel.models.base import BaseGPTQModel
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
+ from gptqmodel.models.auto import MODEL_MAP
import torch.nn.functional as F
import numpy as np

@@ -60,6 +79,64 @@ import numpy as np



+ pretrained_model_id = 'GSAI-ML/LLaDA-8B-Base'  # or a local checkpoint path, e.g. '/home/chentianqi/model/GSAI-ML/LLaDA-8B-Base'
+ quantized_model_id = "FunAGI/LLaDA-8B-Base-gptqmodel-4bit"
+
+
+ class LladaGPTQ(BaseGPTQModel):
+     # Non-repeating modules at the root level (same level as `layers_node`, excluding `layers_node` itself).
+     base_modules = ["model.transformer.wte", "model.transformer.ln_f"]
+     pre_lm_head_norm_module = "model.transformer.ln_f"
+     lm_head = "model.transformer.ff_out"
+     # `model.transformer.blocks` is the node/module that holds all the repeating layers: the parent of all n layers.
+     layers_node = "model.transformer.blocks"
+     # Each repeating layer in `model.transformer.blocks` is of type `LLaDALlamaBlock`.
+     layer_type = "LLaDALlamaBlock"
+     # Internal modules of each `LLaDALlamaBlock`, listed in the order they are executed in the model's forward() code.
+     # Many models share the same order: attention (q, k, v) projections, attention output projection, then the MLP projections.
+     layer_modules = [
+         ["attn_out", "k_proj", "v_proj", "q_proj"],
+         ["ff_proj", "up_proj"],
+         ["ff_out"],
+     ]
+
+ MODEL_MAP["llada"] = LladaGPTQ
+
+ # os.makedirs(quantized_model_dir, exist_ok=True)
+ def get_wikitext2(tokenizer, nsamples, seqlen):
+     traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train").filter(
+         lambda x: len(x["text"]) >= seqlen)
+
+     return [tokenizer(example["text"]) for example in traindata.select(range(nsamples))]
+
+
+ @torch.no_grad()
+ def calculate_avg_ppl(model, tokenizer):
+     from gptqmodel.utils import Perplexity
+
+     ppl = Perplexity(
+         model=model,
+         tokenizer=tokenizer,
+         dataset_path="wikitext",
+         dataset_name="wikitext-2-raw-v1",
+         split="train",
+         text_column="text",
+     )
+
+     all_ppl = ppl.calculate(n_ctx=512, n_batch=512)
+
+     # average perplexity over all evaluated chunks
+     avg = sum(all_ppl) / len(all_ppl)
+
+     return avg
+
+
+ dynamic = {}
+
def add_gumbel_noise(logits, temperature):
    '''
    The Gumbel max is a method for sampling categorical distributions.
@@ -92,6 +169,82 @@ def get_num_transfer_tokens(mask_index, steps):

    return num_transfer_tokens

+ def forward_process(batch, prompt_index, mask_id):
+     b, l = batch.shape
+
+     target_len = (l - prompt_index.sum()).item()
+     k = torch.randint(1, target_len + 1, (), device=batch.device)
+
+     x = torch.round(torch.linspace(float(k), k + (b - 1) * (target_len / b), steps=b, device=batch.device)).long()
+     x = ((x - 1) % target_len) + 1
+     assert x.min() >= 1 and x.max() <= target_len
+
+     indices = torch.arange(target_len, device=batch.device).repeat(b, 1)
+     is_mask = indices < x.unsqueeze(1)
+     for i in range(b):
+         is_mask[i] = is_mask[i][torch.randperm(target_len)]
+
+     is_mask = torch.cat((torch.zeros(b, prompt_index.sum(), dtype=torch.bool, device=batch.device), is_mask), dim=1)
+     noisy_batch = torch.where(is_mask, mask_id, batch)
+
+     # Return the masked batch and the mask ratio
+     return noisy_batch, (x / target_len).unsqueeze(1).repeat(1, l)
+
+
+ def get_logits(model, batch, prompt_index, cfg_scale, mask_id):
+     if cfg_scale > 0.:
+         assert len(prompt_index) == batch.shape[1]
+         prompt_index = prompt_index.unsqueeze(0).repeat(batch.shape[0], 1)
+         un_batch = batch.clone()
+         un_batch[prompt_index] = mask_id
+         batch = torch.cat([batch, un_batch])
+
+     logits = model(batch).logits
+
+     if cfg_scale > 0.:
+         logits, un_logits = torch.chunk(logits, 2, dim=0)
+         logits = un_logits + (cfg_scale + 1) * (logits - un_logits)
+     return logits
+
+
+ @torch.no_grad()
+ def get_log_likelihood(model, prompt, answer, mc_num=128, batch_size=32, cfg_scale=0., mask_id=126336):
+     '''
+     Args:
+         model: Mask predictor.
+         prompt: A tensor of shape (l1).
+         answer: A tensor of shape (l2).
+         mc_num: Monte Carlo estimation times.
+             As detailed in Appendix B.5, since MMLU, CMMLU, and C-EVAL only require the likelihood of a single token,
+             a single Monte Carlo estimate is sufficient for these benchmarks. For all other benchmarks, we find that
+             128 Monte Carlo samples are adequate to produce stable results.
+         batch_size: Mini batch size.
+         cfg_scale: Unsupervised classifier-free guidance scale.
+         mask_id: The token id of [MASK] is 126336.
+     '''
+     seq = torch.concatenate([prompt, answer])[None, :]
+     seq = seq.repeat((batch_size, 1)).to(model.device)
+     prompt_index = torch.arange(seq.shape[1], device=model.device) < len(prompt)
+
+     loss_ = []
+     for _ in range(mc_num // batch_size):
+         perturbed_seq, p_mask = forward_process(seq, prompt_index, mask_id)
+         mask_index = perturbed_seq == mask_id
+
+         logits = get_logits(model, perturbed_seq, prompt_index, cfg_scale, mask_id)
+
+         loss = F.cross_entropy(logits[mask_index], seq[mask_index], reduction='none') / p_mask[mask_index]
+         loss = loss.sum() / batch_size
+
+         loss_.append(loss.item())
+
+     return -sum(loss_) / len(loss_)
+


@@ -166,32 +319,45 @@ def generate(model, prompt, steps=128, gen_length=128, block_length=128, tempera
    return x

def main():
-     quantized_model_id = "FunAGI/LLaDA-8B-Base-gptqmodel-4bit"
-     tokenizer = AutoTokenizer.from_pretrained(quantized_model_id, use_fast=False)
-
+     tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, use_fast=False)

+     # Calibration data and quantization config (kept for reference; the published 4-bit checkpoint is simply loaded below).
+     traindataset = get_wikitext2(tokenizer, nsamples=128, seqlen=1024)
+
+     quantize_config = QuantizeConfig(
+         dynamic=dynamic,
+         bits=4,          # quantize the model to 4-bit
+         group_size=128,  # it is recommended to set this value to 128
+         desc_act=True,
+         sym=False,
+     )
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
-     prompt = "Paul is at a train station and is waiting for his train. He isn't sure how long he needs to wait, but he knows that the fourth train scheduled to arrive at the station is the one he needs to get on. The first train is scheduled to arrive in 10 minutes, and this train will stay in the station for 20 minutes. The second train is to arrive half an hour after the first train leaves the station, and this second train will stay in the station for a quarter of the amount of time that the first train stayed in the station. The third train is to arrive an hour after the second train leaves the station, and this third train is to leave the station immediately after it arrives. The fourth train will arrive 20 minutes after the third train leaves, and this is the train Paul will board. In total, how long, in minutes, will Paul wait for his train?"
+     prompt = "Question: Lily can run 12 kilometers per hour for 4 hours. After that, she runs 6 kilometers per hour. How many kilometers can she run in 8 hours? The answer: "

-     # Add special tokens for the Instruct model. The Base model does not require the following two lines.
-     m = [{"role": "user", "content": prompt}, ]
-     prompt = tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
+     # Add special tokens for the Instruct model. The Base model does not require the following two lines.
+     # m = [{"role": "user", "content": prompt}, ]
+     # prompt = tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False)

    input_ids = tokenizer(prompt)['input_ids']
    input_ids = torch.tensor(input_ids).to(device).unsqueeze(0)

+     device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = GPTQModel.load(quantized_model_id, device=device, trust_remote_code=True)

-     steps = 256
-     out = generate(model, input_ids, steps=steps, gen_length=256, block_length=8, temperature=0., cfg_scale=0., remasking='low_confidence')
-     print("*" * 30 + f"Steps {steps}" + "*" * 30)
+     steps = 128
+     out = generate(model, input_ids, steps=steps, gen_length=128, block_length=32, temperature=0., cfg_scale=0., remasking='low_confidence')
+     print("*" * 30 + f"GPTQ-4bit Steps {steps}" + "*" * 30)
    print(input_ids.shape)
    print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0])
+     del model
+
+     # Load the unquantized FP16 base model for comparison.
+     model = AutoModel.from_pretrained(pretrained_model_id, trust_remote_code=True).cuda()

+     out = generate(model, input_ids, steps=steps, gen_length=128, block_length=32, temperature=0., cfg_scale=0., remasking='low_confidence')
+     print("*" * 30 + f"FP16 Steps {steps}" + "*" * 30)
+     print(input_ids.shape)
+     print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0])


if __name__ == "__main__":
@@ -205,4 +371,6 @@ if __name__ == "__main__":

    main()

+
+
  ```
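
For readers who only want to run the published 4-bit checkpoint (without re-quantizing or comparing against FP16), a stripped-down sketch of the inference path is below. It assumes the `generate` diffusion-sampling helper defined in the README example above is in scope, and it reuses the `FunAGI/LLaDA-8B-Base-gptqmodel-4bit` id, prompt, and sampling parameters from the diff.

```python
import torch
from transformers import AutoTokenizer
from gptqmodel import GPTQModel

# Quick-start sketch: load the published 4-bit checkpoint and sample from it.
# `generate` refers to the diffusion-sampling helper defined in the example above.
quantized_model_id = "FunAGI/LLaDA-8B-Base-gptqmodel-4bit"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(quantized_model_id, use_fast=False)
model = GPTQModel.load(quantized_model_id, device=device, trust_remote_code=True)

prompt = "Question: Lily can run 12 kilometers per hour for 4 hours. After that, she runs 6 kilometers per hour. How many kilometers can she run in 8 hours? The answer: "
input_ids = torch.tensor(tokenizer(prompt)["input_ids"], device=device).unsqueeze(0)

# 128 denoising steps over a 128-token generation window, in 32-token blocks, as in the example above.
out = generate(model, input_ids, steps=128, gen_length=128, block_length=32,
               temperature=0., cfg_scale=0., remasking='low_confidence')
print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0])
```

The same `generate` call also works for the unquantized base model loaded with `AutoModel.from_pretrained(..., trust_remote_code=True)`, as in the FP16 comparison at the end of `main()` above.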