jerryzh168 committed
Commit 0369ded · verified · 1 Parent(s): a9124af

Update README.md

Files changed (1)
  1. README.md +48 -1
README.md CHANGED
@@ -25,6 +25,48 @@ pip install git+https://github.com/huggingface/transformers@main
pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
```

+ ## Untie Embedding Weights
+ Before quantization, since we need to quantize both the input embedding and the unembedding (lm_head) layer, which are tied, we first need to untie the model:
+
+ ```
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoProcessor,
+     AutoTokenizer,
+     TorchAoConfig,
+ )
+ from torchao.quantization.quant_api import (
+     IntxWeightOnlyConfig,
+     Int8DynamicActivationIntxWeightConfig,
+     AOPerModuleConfig
+ )
+ from torchao.quantization.granularity import PerGroup, PerAxis
+ import torch
+
+ model_id = "microsoft/Phi-4-mini-instruct"
+ untied_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="auto")
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+ print(untied_model)
+ from transformers.modeling_utils import find_tied_parameters
+ print("tied weights:", find_tied_parameters(untied_model))
+ if getattr(untied_model.config.get_text_config(decoder=True), "tie_word_embeddings"):
+     setattr(untied_model.config.get_text_config(decoder=True), "tie_word_embeddings", False)
+
+ untied_model._tied_weights_keys = []
+ untied_model.lm_head.weight = torch.nn.Parameter(untied_model.lm_head.weight.clone())
+
+ print("tied weights:", find_tied_parameters(untied_model))
+
+ USER_ID = "YOUR_USER_ID"
+ MODEL_NAME = model_id.split("/")[-1]
+ save_to = f"{USER_ID}/{MODEL_NAME}-untied-weights"
+ untied_model.push_to_hub(save_to)
+ tokenizer.push_to_hub(save_to)
+ ```
+
+ ## Quantization
+
We used the following code to get the quantized model:

```
@@ -43,7 +85,12 @@ from torchao.quantization.quant_api import (
from torchao.quantization.granularity import PerGroup, PerAxis
import torch

+ # we start from the model with untied weights
model_id = "microsoft/Phi-4-mini-instruct"
+ USER_ID = "YOUR_USER_ID"
+ MODEL_NAME = model_id.split("/")[-1]
+ untied_model_id = f"{USER_ID}/{MODEL_NAME}-untied-weights"
+

embedding_config = IntxWeightOnlyConfig(
    weight_dtype=torch.int8,
@@ -54,7 +101,7 @@ linear_config = Int8DynamicActivationIntxWeightConfig(
    weight_granularity=PerGroup(32),
    weight_scale_dtype=torch.bfloat16,
)
- quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="auto")
+ quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, torch_dtype=torch.float32, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# TODO: use AOPerModuleConfig once fix for tied weights is landed
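
The hunks above show only fragments of the quantization script, so here is a minimal sketch of how the two configs could be applied end to end with torchao's `quantize_`. The `PerAxis(0)` embedding granularity, the `weight_dtype=torch.int4` in the linear config, the `filter_fn`-based dispatch (implied by the TODO, since `AOPerModuleConfig` is not used yet), and the `-8da4w` repo name are assumptions rather than part of the commit:

```
# Minimal sketch (not the exact script from the commit): apply the two configs
# shown in the diff above and push the result. Assumptions are marked below.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import quantize_
from torchao.quantization.quant_api import (
    IntxWeightOnlyConfig,
    Int8DynamicActivationIntxWeightConfig,
)
from torchao.quantization.granularity import PerGroup, PerAxis

model_id = "microsoft/Phi-4-mini-instruct"
USER_ID = "YOUR_USER_ID"
MODEL_NAME = model_id.split("/")[-1]
untied_model_id = f"{USER_ID}/{MODEL_NAME}-untied-weights"

embedding_config = IntxWeightOnlyConfig(
    weight_dtype=torch.int8,
    granularity=PerAxis(0),  # assumption: the diff truncates after weight_dtype
)
linear_config = Int8DynamicActivationIntxWeightConfig(
    weight_dtype=torch.int4,  # assumption: 8-bit dynamic activation, 4-bit weight
    weight_granularity=PerGroup(32),
    weight_scale_dtype=torch.bfloat16,
)

quantized_model = AutoModelForCausalLM.from_pretrained(
    untied_model_id, torch_dtype=torch.float32, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Quantize the input embedding with the weight-only config, then all nn.Linear
# modules (including lm_head) with the dynamic-activation config. The filter_fn
# dispatch is an assumption, standing in for AOPerModuleConfig until the
# tied-weights fix lands.
quantize_(
    quantized_model,
    embedding_config,
    filter_fn=lambda m, fqn: isinstance(m, torch.nn.Embedding),
)
quantize_(quantized_model, linear_config)

# torchao tensor-subclass weights currently need safe_serialization=False.
save_to = f"{USER_ID}/{MODEL_NAME}-8da4w"  # hypothetical repo name
quantized_model.push_to_hub(save_to, safe_serialization=False)
tokenizer.push_to_hub(save_to)
```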
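
As a quick sanity check, again a hedged sketch: reload the quantized checkpoint and run a short generation (the repo name and prompt are placeholders):

```
# Minimal sketch: load the quantized checkpoint pushed above and generate a
# short reply to confirm it works. Repo name and prompt are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

quantized_model_id = "YOUR_USER_ID/Phi-4-mini-instruct-8da4w"  # hypothetical
model = AutoModelForCausalLM.from_pretrained(
    quantized_model_id, torch_dtype="auto", device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(quantized_model_id)

messages = [{"role": "user", "content": "What is 15 * 17?"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

with torch.no_grad():
    output = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```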