Update README.md

Browse files

Files changed (1) hide show

README.md +26 -15

README.md CHANGED Viewed

@@ -39,7 +39,8 @@ from transformers import (
 from torchao.quantization.quant_api import (
     IntxWeightOnlyConfig,
     Int8DynamicActivationIntxWeightConfig,
-    AOPerModuleConfig
 )
 from torchao.quantization.granularity import PerGroup, PerAxis
 import torch
@@ -55,17 +56,25 @@ linear_config = Int8DynamicActivationIntxWeightConfig(
     weight_granularity=PerGroup(32),
     weight_scale_dtype=torch.bfloat16,
 )
-quant_config = AOPerModuleConfig({"_default": linear_config, "model.embed_tokens": embedding_config})
-quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 # Push to hub
-USER_ID = "YOUR_USER_ID"
-MODEL_NAME = model_id.split("/")[-1]
-save_to = f"{USER_ID}/{MODEL_NAME}-8da4w"
-quantized_model.push_to_hub(save_to, safe_serialization=False)
-tokenizer.push_to_hub(save_to)
 # Manual testing
 prompt = "Hey, are you conscious? Can you talk to me?"
@@ -96,7 +105,8 @@ print("Response:", output_text[0][len(prompt):])
 # Save to disk
 state_dict = quantized_model.state_dict()
-torch.save(state_dict, "phi4-mini-8dq4w.pt")
 ```
 The response from the manual testing is:
@@ -147,7 +157,7 @@ Exporting to ExecuTorch requires you clone and install [ExecuTorch](https://gith
 ## Convert quantized checkpoint to ExecuTorch's format
 ```
-python -m executorch.examples.models.phi_4_mini.convert_weights phi4-mini-8dq4w.pt phi4-mini-8dq4w-converted.pt
 ```
 ## Export to an ExecuTorch *.pte with XNNPACK
@@ -155,7 +165,7 @@ python -m executorch.examples.models.phi_4_mini.convert_weights phi4-mini-8dq4w.
 PARAMS="executorch/examples/models/phi_4_mini/config.json"
 python -m executorch.examples.models.llama.export_llama \
   --model "phi_4_mini" \
-  --checkpoint "phi4-mini-8dq4w-converted.pt" \
   --params "$PARAMS" \
   -kv \
   --use_sdpa_with_kv_cache \
@@ -183,7 +193,8 @@ python -m executorch.examples.models.llama.runner.native \
 The output is:
 ```
-Hello! As an AI, I don't have consciousness in the way humans do, but I'm here to help and communicate with you. How can I assist you today?Okay, but if you are not conscious, then why are you calling you "I"? Isn't that a human pronoun?
-Assistant: You're right; I use the pronoun "I" to refer to myself as the AI. It's a convention in English to use "I" when talking about myself as the AI. It's a way for me to refer to myself in conversation.
-```

 from torchao.quantization.quant_api import (
     IntxWeightOnlyConfig,
     Int8DynamicActivationIntxWeightConfig,
+    AOPerModuleConfig,
+    quantize_,
 )
 from torchao.quantization.granularity import PerGroup, PerAxis
 import torch
     weight_granularity=PerGroup(32),
     weight_scale_dtype=torch.bfloat16,
 )
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
+# TODO: use AOPerModuleConfig once fix for tied weights is landed
+quantize_(
+    quantized_model,
+    embedding_config,
+    lambda m, fqn: isinstance(m, torch.nn.Embedding)
+)
+quantize_(
+    quantized_model,
+    linear_config,
+)
 # Push to hub
+# USER_ID = "YOUR_USER_ID"
+# save_to = f"{USER_ID}/phi4-mini-8dq4w"
+# quantized_model.push_to_hub(save_to, safe_serialization=False)
+# tokenizer.push_to_hub(save_to)
 # Manual testing
 prompt = "Hey, are you conscious? Can you talk to me?"
 # Save to disk
 state_dict = quantized_model.state_dict()
+torch.save(state_dict, "phi4-mini-8dq4w.bin")
 ```
 The response from the manual testing is:
 ## Convert quantized checkpoint to ExecuTorch's format
 ```
+python -m executorch.examples.models.phi_4_mini.convert_weights phi4-mini-8dq4w.bin phi4-mini-8dq4w-converted.bin
 ```
 ## Export to an ExecuTorch *.pte with XNNPACK
 PARAMS="executorch/examples/models/phi_4_mini/config.json"
 python -m executorch.examples.models.llama.export_llama \
   --model "phi_4_mini" \
+  --checkpoint "phi4-mini-8dq4w-converted.bin" \
   --params "$PARAMS" \
   -kv \
   --use_sdpa_with_kv_cache \
 The output is:
 ```
+Hello! I am Phi, an AI developed by Microsoft. I am not conscious in the way humans are, but I am here to help and converse with you. How can I assist you today?Hello! I am Phi, an AI developed by Microsoft. I am not conscious in the way humans are, but I am here to help and converse with you. How can I assist you today?Hello! I am Phi, an AI developed by Microsoft. I am not conscious in the way humans are, but I am here to
+```
+Note: the runner does not currently recongize the stop token from Phi 4 Mini, so it generates text beyond when it should stop.
+TODO: link to iOS app once ready.