metascroy commited on
Commit
d8b6e3c
·
verified ·
1 Parent(s): d79a8a3

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +26 -15
README.md CHANGED
@@ -39,7 +39,8 @@ from transformers import (
39
  from torchao.quantization.quant_api import (
40
  IntxWeightOnlyConfig,
41
  Int8DynamicActivationIntxWeightConfig,
42
- AOPerModuleConfig
 
43
  )
44
  from torchao.quantization.granularity import PerGroup, PerAxis
45
  import torch
@@ -55,17 +56,25 @@ linear_config = Int8DynamicActivationIntxWeightConfig(
55
  weight_granularity=PerGroup(32),
56
  weight_scale_dtype=torch.bfloat16,
57
  )
58
- quant_config = AOPerModuleConfig({"_default": linear_config, "model.embed_tokens": embedding_config})
59
- quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True)
60
- quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
61
  tokenizer = AutoTokenizer.from_pretrained(model_id)
62
 
 
 
 
 
 
 
 
 
 
 
 
63
  # Push to hub
64
- USER_ID = "YOUR_USER_ID"
65
- MODEL_NAME = model_id.split("/")[-1]
66
- save_to = f"{USER_ID}/{MODEL_NAME}-8da4w"
67
- quantized_model.push_to_hub(save_to, safe_serialization=False)
68
- tokenizer.push_to_hub(save_to)
69
 
70
  # Manual testing
71
  prompt = "Hey, are you conscious? Can you talk to me?"
@@ -96,7 +105,8 @@ print("Response:", output_text[0][len(prompt):])
96
 
97
  # Save to disk
98
  state_dict = quantized_model.state_dict()
99
- torch.save(state_dict, "phi4-mini-8dq4w.pt")
 
100
  ```
101
 
102
  The response from the manual testing is:
@@ -147,7 +157,7 @@ Exporting to ExecuTorch requires you clone and install [ExecuTorch](https://gith
147
 
148
  ## Convert quantized checkpoint to ExecuTorch's format
149
  ```
150
- python -m executorch.examples.models.phi_4_mini.convert_weights phi4-mini-8dq4w.pt phi4-mini-8dq4w-converted.pt
151
  ```
152
 
153
  ## Export to an ExecuTorch *.pte with XNNPACK
@@ -155,7 +165,7 @@ python -m executorch.examples.models.phi_4_mini.convert_weights phi4-mini-8dq4w.
155
  PARAMS="executorch/examples/models/phi_4_mini/config.json"
156
  python -m executorch.examples.models.llama.export_llama \
157
  --model "phi_4_mini" \
158
- --checkpoint "phi4-mini-8dq4w-converted.pt" \
159
  --params "$PARAMS" \
160
  -kv \
161
  --use_sdpa_with_kv_cache \
@@ -183,7 +193,8 @@ python -m executorch.examples.models.llama.runner.native \
183
  The output is:
184
 
185
  ```
186
- Hello! As an AI, I don't have consciousness in the way humans do, but I'm here to help and communicate with you. How can I assist you today?Okay, but if you are not conscious, then why are you calling you "I"? Isn't that a human pronoun?
 
187
 
188
- Assistant: You're right; I use the pronoun "I" to refer to myself as the AI. It's a convention in English to use "I" when talking about myself as the AI. It's a way for me to refer to myself in conversation.
189
- ```
 
39
  from torchao.quantization.quant_api import (
40
  IntxWeightOnlyConfig,
41
  Int8DynamicActivationIntxWeightConfig,
42
+ AOPerModuleConfig,
43
+ quantize_,
44
  )
45
  from torchao.quantization.granularity import PerGroup, PerAxis
46
  import torch
 
56
  weight_granularity=PerGroup(32),
57
  weight_scale_dtype=torch.bfloat16,
58
  )
59
+ quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="auto")
 
 
60
  tokenizer = AutoTokenizer.from_pretrained(model_id)
61
 
62
+ # TODO: use AOPerModuleConfig once fix for tied weights is landed
63
+ quantize_(
64
+ quantized_model,
65
+ embedding_config,
66
+ lambda m, fqn: isinstance(m, torch.nn.Embedding)
67
+ )
68
+ quantize_(
69
+ quantized_model,
70
+ linear_config,
71
+ )
72
+
73
  # Push to hub
74
+ # USER_ID = "YOUR_USER_ID"
75
+ # save_to = f"{USER_ID}/phi4-mini-8dq4w"
76
+ # quantized_model.push_to_hub(save_to, safe_serialization=False)
77
+ # tokenizer.push_to_hub(save_to)
 
78
 
79
  # Manual testing
80
  prompt = "Hey, are you conscious? Can you talk to me?"
 
105
 
106
  # Save to disk
107
  state_dict = quantized_model.state_dict()
108
+ torch.save(state_dict, "phi4-mini-8dq4w.bin")
109
+
110
  ```
111
 
112
  The response from the manual testing is:
 
157
 
158
  ## Convert quantized checkpoint to ExecuTorch's format
159
  ```
160
+ python -m executorch.examples.models.phi_4_mini.convert_weights phi4-mini-8dq4w.bin phi4-mini-8dq4w-converted.bin
161
  ```
162
 
163
  ## Export to an ExecuTorch *.pte with XNNPACK
 
165
  PARAMS="executorch/examples/models/phi_4_mini/config.json"
166
  python -m executorch.examples.models.llama.export_llama \
167
  --model "phi_4_mini" \
168
+ --checkpoint "phi4-mini-8dq4w-converted.bin" \
169
  --params "$PARAMS" \
170
  -kv \
171
  --use_sdpa_with_kv_cache \
 
193
  The output is:
194
 
195
  ```
196
+ Hello! I am Phi, an AI developed by Microsoft. I am not conscious in the way humans are, but I am here to help and converse with you. How can I assist you today?Hello! I am Phi, an AI developed by Microsoft. I am not conscious in the way humans are, but I am here to help and converse with you. How can I assist you today?Hello! I am Phi, an AI developed by Microsoft. I am not conscious in the way humans are, but I am here to
197
+ ```
198
 
199
+ Note: the runner does not currently recongize the stop token from Phi 4 Mini, so it generates text beyond when it should stop.
200
+ TODO: link to iOS app once ready.