jerryzh168 and metascroy committed
Commit b92ceeb · verified · 1 Parent(s): dde60eb

Update README.md (#2)


- Update README.md (0d2b1b1d927f98d9fecbce2dd9df133927d053a0)


Co-authored-by: Scott Roy <[email protected]>

Files changed (1)
  1. README.md +49 -19
README.md CHANGED
@@ -10,33 +10,62 @@ tags:
We used the following code to get the quantized model:

```
-model_id = "microsoft/Phi-4-mini-instruct"
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
+    TorchAoConfig,
)
from torchao.quantization.quant_api import (
    Int8DynamicActivationIntxWeightConfig,
-    MappingType,
-    quantize_,
)
from torchao.quantization.granularity import PerGroup
import torch

-model = AutoModelForCausalLM.from_pretrained(
-    model_id, torch_dtype="auto", device_map="auto"
-)
+model_id = "microsoft/Phi-4-mini-instruct"
linear_config = Int8DynamicActivationIntxWeightConfig(
    weight_dtype=torch.int4,
    weight_granularity=PerGroup(32),
-    weight_mapping_type=MappingType.SYMMETRIC,
)
-quantize_(
-    model,
-    linear_config,
+quantization_config = TorchAoConfig(quant_type=linear_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+# Push to hub
+USER_ID = "YOUR_USER_ID"
+save_to = f"{USER_ID}/phi4-mini-8dq4w"
+quantized_model.push_to_hub(save_to, safe_serialization=False)
+tokenizer.push_to_hub(save_to)
+
+# Manual testing
+prompt = "Hey, are you conscious? Can you talk to me?"
+messages = [
+    {
+        "role": "system",
+        "content": "",
+    },
+    {"role": "user", "content": prompt},
+]
+templated_prompt = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
)
-state_dict = model.state_dict()
+print("Prompt:", prompt)
+print("Templated prompt:", templated_prompt)
+inputs = tokenizer(
+    templated_prompt,
+    return_tensors="pt",
+).to("cuda")
+generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
+output_text = tokenizer.batch_decode(
+    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print("Response:", output_text[0][len(prompt):])
+
+
+# Save to disk
+state_dict = quantized_model.state_dict()
torch.save(state_dict, "phi4-mini-8dq4w.pt")
```

@@ -57,12 +86,7 @@ from lm_eval.utils import (
    make_table,
)

-# model is after calling quantize_ as we do in the recipe
-# quantize_(
-#    model,
-#    linear_config,
-#)
-lm_eval_model = lm_eval.models.huggingface.HFLM(pretrained=model, batch_size=8)
+lm_eval_model = lm_eval.models.huggingface.HFLM(pretrained=quantized_model, batch_size=8)
results = evaluator.simple_evaluate(
    lm_eval_model, tasks=["hellaswag"], device="cuda:0", batch_size="auto"
)
@@ -86,9 +110,12 @@ Exporting to ExecuTorch requires you clone and install [ExecuTorch](https://gith


## Convert quantized checkpoint to ExecuTorch's format
+```
python -m executorch.examples.models.phi_4_mini.convert_weights phi4-mini-8dq4w.pt phi4-mini-8dq4w-converted.pt
+```

## Export to an ExecuTorch *.pte with XNNPACK
+```
PARAMS="executorch/examples/models/phi_4_mini/config.json"
python -m executorch.examples.models.llama.export_llama \
  --model "phi_4_mini" \
@@ -99,11 +126,13 @@ python -m executorch.examples.models.llama.export_llama \
  -X \
  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
  --output_name="phi4-mini-8dq4w.pte"
+```

## Run model with pybindings
+```
export TOKENIZER="/path/to/tokenizer.json"
export TOKENIZER_CONFIG="/path/to/tokenizer_config.json"
-export PROMPT="<|system|><|end|><|user|>What is in a california roll?<|end|><|assistant|>"
+export PROMPT="<|system|><|end|><|user|>Hey, are you conscious? Can you talk to me?<|end|><|assistant|>"
python -m executorch.examples.models.llama.runner.native \
  --model phi_4_mini \
  --pte phi4-mini-8dq4w.pte \
@@ -113,4 +142,5 @@ python -m executorch.examples.models.llama.runner.native \
  --prompt "${PROMPT}" \
  --params "${PARAMS}" \
  --max_len 128 \
-  --temperature 0
+  --temperature 0
+```
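
For reference beyond the diff itself: a minimal sketch of loading the quantized checkpoint that the updated recipe pushes to the Hub. It is not part of the README change; it assumes a transformers install with torchao support, reuses the placeholder repo id `YOUR_USER_ID/phi4-mini-8dq4w` from the recipe, and borrows the example prompt from the previous README revision.

```
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder repo id from the recipe above; substitute the actual Hub user/org.
model_id = "YOUR_USER_ID/phi4-mini-8dq4w"

# The checkpoint was pushed with a TorchAoConfig, so from_pretrained should
# restore the quantized weights directly (assumes torchao is installed).
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Smoke test with the prompt used in the previous README revision.
inputs = tokenizer("What is in a california roll?", return_tensors="pt").to(
    quantized_model.device
)
generated_ids = quantized_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
```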