Update README.md (#2)
- Update README.md (0d2b1b1d927f98d9fecbce2dd9df133927d053a0)
Co-authored-by: Scott Roy <[email protected]>
README.md CHANGED
@@ -10,33 +10,62 @@ tags:

We used the following code to get the quantized model:

```
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    TorchAoConfig,
)
from torchao.quantization.quant_api import (
    Int8DynamicActivationIntxWeightConfig,
)
from torchao.quantization.granularity import PerGroup
import torch

model_id = "microsoft/Phi-4-mini-instruct"

# 8dq4w: int8 dynamic activation quantization with int4 weights, group size 32
linear_config = Int8DynamicActivationIntxWeightConfig(
    weight_dtype=torch.int4,
    weight_granularity=PerGroup(32),
)
quantization_config = TorchAoConfig(quant_type=linear_config)
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Push to hub
USER_ID = "YOUR_USER_ID"
save_to = f"{USER_ID}/phi4-mini-8dq4w"
quantized_model.push_to_hub(save_to, safe_serialization=False)
tokenizer.push_to_hub(save_to)

# Manual testing
prompt = "Hey, are you conscious? Can you talk to me?"
messages = [
    {
        "role": "system",
        "content": "",
    },
    {"role": "user", "content": prompt},
]
templated_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print("Prompt:", prompt)
print("Templated prompt:", templated_prompt)
inputs = tokenizer(
    templated_prompt,
    return_tensors="pt",
).to("cuda")
generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
output_text = tokenizer.batch_decode(
    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print("Response:", output_text[0][len(prompt):])

# Save to disk
state_dict = quantized_model.state_dict()
torch.save(state_dict, "phi4-mini-8dq4w.pt")
```
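The pushed checkpoint can be reloaded later through the regular `transformers` API. A minimal sketch, assuming the placeholder `YOUR_USER_ID/phi4-mini-8dq4w` repository created above and a torchao installation on the loading side:

```
# Sketch: reload the quantized checkpoint pushed above.
# "YOUR_USER_ID/phi4-mini-8dq4w" is the placeholder repo id from the snippet above.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "YOUR_USER_ID/phi4-mini-8dq4w"
reloaded_model = AutoModelForCausalLM.from_pretrained(
    repo_id, torch_dtype="auto", device_map="auto"
)
reloaded_tokenizer = AutoTokenizer.from_pretrained(repo_id)
```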
@@ -57,12 +86,7 @@ from lm_eval.utils import (

```
    make_table,
)

lm_eval_model = lm_eval.models.huggingface.HFLM(pretrained=quantized_model, batch_size=8)
results = evaluator.simple_evaluate(
    lm_eval_model, tasks=["hellaswag"], device="cuda:0", batch_size="auto"
)
```
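Since `make_table` is imported from `lm_eval.utils`, the scores returned by `simple_evaluate` can be printed as a summary table. A small sketch using the `results` object from the snippet above:

```
# Render the hellaswag results returned by simple_evaluate as a table.
print(make_table(results))
```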
@@ -86,9 +110,12 @@ Exporting to ExecuTorch requires you clone and install [ExecuTorch](https://gith

## Convert quantized checkpoint to ExecuTorch's format
```
python -m executorch.examples.models.phi_4_mini.convert_weights phi4-mini-8dq4w.pt phi4-mini-8dq4w-converted.pt
```

## Export to an ExecuTorch *.pte with XNNPACK
```
PARAMS="executorch/examples/models/phi_4_mini/config.json"
python -m executorch.examples.models.llama.export_llama \
  --model "phi_4_mini" \
```
@@ -99,11 +126,13 @@ python -m executorch.examples.models.llama.export_llama \

```
  -X \
  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
  --output_name="phi4-mini-8dq4w.pte"
```

## Run model with pybindings
```
export TOKENIZER="/path/to/tokenizer.json"
export TOKENIZER_CONFIG="/path/to/tokenizer_config.json"
export PROMPT="<|system|><|end|><|user|>Hey, are you conscious? Can you talk to me?<|end|><|assistant|>"
python -m executorch.examples.models.llama.runner.native \
  --model phi_4_mini \
  --pte phi4-mini-8dq4w.pte \
```
@@ -113,4 +142,5 @@ python -m executorch.examples.models.llama.runner.native \

```
  --prompt "${PROMPT}" \
  --params "${PARAMS}" \
  --max_len 128 \
  --temperature 0
```