Update README.md

README.md (CHANGED)
@@ -25,6 +25,48 @@ pip install git+https://github.com/huggingface/transformers@main
 pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
 ```
 
+## Untie Embedding Weights
+Before quantization, we first need to untie the model, since we want to quantize both the input embedding and the unembedding (lm_head) layer, and their weights are tied:
+
+```
+from transformers import (
+    AutoModelForCausalLM,
+    AutoProcessor,
+    AutoTokenizer,
+    TorchAoConfig,
+)
+from torchao.quantization.quant_api import (
+    IntxWeightOnlyConfig,
+    Int8DynamicActivationIntxWeightConfig,
+    AOPerModuleConfig,
+)
+from torchao.quantization.granularity import PerGroup, PerAxis
+import torch
+
+model_id = "microsoft/Phi-4-mini-instruct"
+untied_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+print(untied_model)
+from transformers.modeling_utils import find_tied_parameters
+print("tied weights:", find_tied_parameters(untied_model))
+if getattr(untied_model.config.get_text_config(decoder=True), "tie_word_embeddings"):
+    setattr(untied_model.config.get_text_config(decoder=True), "tie_word_embeddings", False)
+
+untied_model._tied_weights_keys = []
+untied_model.lm_head.weight = torch.nn.Parameter(untied_model.lm_head.weight.clone())
+
+print("tied weights:", find_tied_parameters(untied_model))
+
+USER_ID = "YOUR_USER_ID"
+MODEL_NAME = model_id.split("/")[-1]
+save_to = f"{USER_ID}/{MODEL_NAME}-untied-weights"
+untied_model.push_to_hub(save_to)
+tokenizer.push_to_hub(save_to)
+```
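
(Note, not part of the commit: a quick sanity check, assuming the untied checkpoint pushed to `save_to` above, that the reloaded model really has no tied weights.)

```
from transformers import AutoModelForCausalLM
from transformers.modeling_utils import find_tied_parameters
import torch

# Sanity-check sketch (assumed, not from the README): reload the untied
# checkpoint and confirm lm_head no longer shares storage with the input
# embedding.
reloaded = AutoModelForCausalLM.from_pretrained(save_to, torch_dtype=torch.float32)
print("tied weights after reload:", find_tied_parameters(reloaded))  # expect: []
assert reloaded.lm_head.weight.data_ptr() != reloaded.get_input_embeddings().weight.data_ptr()
```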
+
+## Quantization
+
 We used the following code to get the quantized model:
 
 ```
@@ -43,7 +85,12 @@ from torchao.quantization.quant_api import (
 from torchao.quantization.granularity import PerGroup, PerAxis
 import torch
 
+# we start from the model with untied weights
 model_id = "microsoft/Phi-4-mini-instruct"
+USER_ID = "YOUR_USER_ID"
+MODEL_NAME = model_id.split("/")[-1]
+untied_model_id = f"{USER_ID}/{MODEL_NAME}-untied-weights"
+
 
 embedding_config = IntxWeightOnlyConfig(
     weight_dtype=torch.int8,
@@ -54,7 +101,7 @@ linear_config = Int8DynamicActivationIntxWeightConfig(
     weight_granularity=PerGroup(32),
     weight_scale_dtype=torch.bfloat16,
 )
-quantized_model = AutoModelForCausalLM.from_pretrained(
+quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, torch_dtype=torch.float32, device_map="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 # TODO: use AOPerModuleConfig once fix for tied weights is landed
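
The diff ends here; the code that actually applies `embedding_config` and `linear_config` is outside the changed range. For orientation, a minimal sketch of the usual torchao pattern for such a config pair, using `quantize_` with a module filter; this step is an assumption about usage, not code from this commit:

```
from torchao.quantization import quantize_

# Sketch (assumed, not from the diff): quantize nn.Embedding modules with
# the int8 weight-only config, then the nn.Linear layers (including the
# now-untied lm_head) with the dynamic-activation intx-weight config.
quantize_(
    quantized_model,
    embedding_config,
    filter_fn=lambda module, fqn: isinstance(module, torch.nn.Embedding),
)
quantize_(quantized_model, linear_config)
```

The `# TODO` above suggests this per-module wiring is meant to be replaced by a single `AOPerModuleConfig` once the tied-weights fix lands.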
|