Update README.md (#7)
Browse files
- Update README.md (d5bd0a862826c50c2bc6a4d9bd337a3ecc9a9e91)
README.md
CHANGED
@@ -45,15 +45,23 @@ To run the inference on top of Llama 3.1 8B Instruct AWQ in INT4 precision, the
|
|
45 |
|
46 |
```python
|
47 |
import torch
|
48 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
49 |
|
50 |
model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
52 |
model = AutoModelForCausalLM.from_pretrained(
|
53 |
model_id,
|
54 |
torch_dtype=torch.float16,
|
55 |
low_cpu_mem_usage=True,
|
56 |
device_map="auto",
|
|
|
57 |
)
|
58 |
|
59 |
prompt = [
|
|
|
45 |
|
46 |
```python
|
47 |
import torch
|
48 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig
|
49 |
|
50 |
model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
|
51 |
+
|
52 |
+
quantization_config = AwqConfig(
|
53 |
+
bits=4,
|
54 |
+
fuse_max_seq_len=512, # Note: Update this as per your use-case
|
55 |
+
do_fuse=True,
|
56 |
+
)
|
57 |
+
|
58 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
59 |
model = AutoModelForCausalLM.from_pretrained(
|
60 |
model_id,
|
61 |
torch_dtype=torch.float16,
|
62 |
low_cpu_mem_usage=True,
|
63 |
device_map="auto",
|
64 |
+
quantization_config=quantization_config
|
65 |
)
|
66 |
|
67 |
prompt = [
|