File size: 974 Bytes
807ff23 f68d0d6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
---
tags:
- fp8
---
Quantized using AutoFP8 with this script:
```python
from transformers import AutoTokenizer
import auto_fp8
from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig
pretrained_model_dir = "ibm-granite/granite-20b-code-base"
quantized_model_dir = "granite-20b-code-base-FP8"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
# use some code to calibrate
import auto_fp8
tmp = auto_fp8.__file__.split('/')[:-1]
tmp.append('quantize.py')
seed_text_file = '/'.join(tmp)
with open(seed_text_file, "r") as f:
text = f.read()
examples = [text]
examples = tokenizer(examples, return_tensors="pt").to("cuda")
quantize_config = BaseQuantizeConfig(
quant_method="fp8",
activation_scheme="static",
ignore_patterns=["re:.*lm_head"],
)
model = AutoFP8ForCausalLM.from_pretrained(
pretrained_model_dir, quantize_config=quantize_config
)
model.quantize(examples)
model.save_quantized(quantized_model_dir)
``` |