Update README.md
README.md
CHANGED
@@ -47,9 +47,9 @@ You can reproduce the results above via `pip install lm-eval==0.4.3`
 First, install the dependencies:
 ```
 pip install git+https://github.com/mobiusml/hqq.git #master branch fix
-pip install bitblas
+pip install bitblas #if you use the bitblas backend
 ```
-Also, make sure you use at least torch `2.4.0` or the nightly build.
+Also, make sure you use at least torch `2.4.0` or the nightly build with at least CUDA 12.1.
 
 Then you can use the sample code below:
 ``` Python
@@ -65,7 +65,7 @@ from hqq.utils.generation_hf import HFGenerator
 #model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq' #no calib version
 model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq_calib' #calibrated version
 
-compute_dtype = torch.float16
+compute_dtype = torch.bfloat16 #bfloat16 for torchao_int4, float16 for bitblas
 cache_dir = '.'
 model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
@@ -77,8 +77,8 @@ patch_linearlayers(model, patch_add_quant_config, quant_config)
 ###################################################
 HQQLinear.set_backend(HQQBackend.PYTORCH)
 #prepare_for_inference(model) #default backend
-
-prepare_for_inference(model, backend="bitblas") #takes a while to init...
+prepare_for_inference(model, backend="torchao_int4")
+#prepare_for_inference(model, backend="bitblas") #takes a while to init...
 
 #Generate
 ###################################################
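The torch/CUDA requirement introduced above can be checked before installing anything else. A minimal sanity check using only stock PyTorch attributes (the printed values are illustrative, and the CUDA-device assumption is inferred from the CUDA 12.1 note rather than stated in the diff):

``` Python
import torch

# The README asks for torch >= 2.4.0 (or a nightly) built against CUDA >= 12.1
print(torch.__version__)   # e.g. '2.4.0'
print(torch.version.cuda)  # e.g. '12.1'

# The torchao_int4/bitblas backends are GPU kernels, so a CUDA device is assumed
assert torch.cuda.is_available(), "a CUDA GPU is expected for the quantized inference backends"
```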
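For readers applying this patch, the changed lines slot into the README's sample roughly as follows. This is a sketch, not the README verbatim: the import paths and the `HFGenerator` call mirror the usual hqq sample code but are assumptions here, the `patch_linearlayers(...)` quant-config step from the full README is omitted, and the prompt is made up.

``` Python
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import HQQLinear, HQQBackend
from hqq.utils.patching import prepare_for_inference
from hqq.utils.generation_hf import HFGenerator

model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq_calib' #calibrated version

compute_dtype = torch.bfloat16 #bfloat16 for torchao_int4, float16 for bitblas
cache_dir = '.'
model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)

#Set the backend: torchao_int4 pairs with bfloat16, bitblas with float16
HQQLinear.set_backend(HQQBackend.PYTORCH)
prepare_for_inference(model, backend="torchao_int4")
#prepare_for_inference(model, backend="bitblas") #takes a while to init...

#Generate (hypothetical prompt)
gen = HFGenerator(model, tokenizer, max_new_tokens=256, do_sample=False)
gen.generate("Write a short poem about quantization.", print_tokens=True)
```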