mgoin committed · Commit 89ce8ac (verified) · Parent: 5a429b9

Create README.md

Files changed (1): README.md (+95 −0)
## Evaluation

Evaluated on GSM8K (5-shot) with lm-evaluation-harness, serving the model through vLLM with `tensor_parallel_size=2`:

```
vllm (pretrained=/home/mgoin/code/llm-compressor/examples/quantizing_moe/DeepSeek-Coder-V2-Lite-Instruct-FP8,tensor_parallel_size=2,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: auto
|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.7710|±  |0.0116|
|     |       |strict-match    |     5|exact_match|↑  |0.7582|±  |0.0118|
```
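The evaluation above was run by serving this checkpoint with vLLM. A minimal inference sketch, assuming the quantized weights are available locally under the `SAVE_DIR` produced by the creation script below (the sampling settings are illustrative, not from the original run):

```python
from vllm import LLM, SamplingParams

# Load the FP8 checkpoint; vLLM picks up the quantization config
# stored in the checkpoint itself.
llm = LLM(
    model="DeepSeek-Coder-V2-Lite-Instruct-FP8",  # assumed local path
    tensor_parallel_size=2,
    trust_remote_code=True,
)

params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["I love quantization because"], params)
print(outputs[0].outputs[0].text)
```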

## Creation

This checkpoint was produced with [llm-compressor](https://github.com/vllm-project/llm-compressor) using the script below:

```python
import torch
from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot

# Select a Mixture of Experts model for quantization.
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"

model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select the calibration dataset.
# It's recommended to use more calibration samples for MoE models so that
# every expert sees calibration data.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 2048
MAX_SEQUENCE_LENGTH = 2048


# Load the dataset and preprocess each example into chat-formatted text.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize the inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Define an llm-compressor recipe for FP8 W8A8 quantization.
# Since the MoE gate layers are sensitive to quantization, we add them to the
# ignore list so they remain at full precision.
recipe = [
    QuantizationModifier(
        targets="Linear",
        scheme="FP8",
        ignore=["lm_head", "re:.*mlp.gate$"],
    ),
]
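# Illustrative note (module names assumed from the DeepSeek-V2 architecture):
# "re:.*mlp.gate$" matches only names ending in "mlp.gate", i.e. the per-layer
# MoE router, e.g.
#   model.layers.1.mlp.gate                -> ignored, kept at full precision
# The expert projections do not match and are still quantized, e.g.
#   model.layers.1.mlp.experts.0.gate_proj -> quantized to FP8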
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8"

# Apply the recipe in one shot, calibrating on the tokenized dataset and
# saving the compressed model.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    save_compressed=True,
    output_dir=SAVE_DIR,
)
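# Note (assumed behavior): save_compressed=True writes the weights in the
# compressed-tensors format and records the quantization scheme in the model
# config, so the checkpoint can be loaded directly by vLLM.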
print("========== SAMPLE GENERATION ==============")
SAMPLE_INPUT = ["I love quantization because"]
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
output = model.generate(**inputs, max_length=50)
text_output = tokenizer.batch_decode(output)
print(text_output)
```
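The header line in the evaluation block above is the standard lm-evaluation-harness summary. A sketch of reproducing that run through the harness's Python API, assuming `lm_eval` and `vllm` are installed and the FP8 checkpoint is available at the local path used below:

```python
import lm_eval

# 5-shot GSM8K over the vLLM backend, mirroring the settings recorded in the
# results header above (tensor_parallel_size=2, batch_size=auto).
results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        "pretrained=DeepSeek-Coder-V2-Lite-Instruct-FP8,"
        "tensor_parallel_size=2,trust_remote_code=True"
    ),
    tasks=["gsm8k"],
    num_fewshot=5,
    batch_size="auto",
)
print(results["results"]["gsm8k"])
```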