Improve code snippet #3
by Xenova (HF staff) - opened

README.md CHANGED
````diff
@@ -92,7 +92,7 @@ Alternatively, one may want to run that via `AutoAWQ` even though it's built on
 
 ```python
 import torch
-from
+from awq import AutoAWQForCausalLM
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 model_id = "hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4"
@@ -102,18 +102,15 @@
 ]
 
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-inputs = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt").cuda()
-
 model = AutoAWQForCausalLM.from_pretrained(
   model_id,
   torch_dtype=torch.float16,
   low_cpu_mem_usage=True,
   device_map="auto",
-  fuse_layers=True,
 )
 
-
+inputs = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True).to('cuda')
+outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256)
 print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
 ```
 
````
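For reference, a fully assembled version of the updated snippet is sketched below. The chat messages in `prompt` are a hypothetical placeholder (the actual prompt contents sit outside the diff context above); everything else follows the changed lines.

```python
import torch
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_id = "hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4"

# Hypothetical placeholder messages; the real prompt contents are outside the diff context.
prompt = [
  {"role": "system", "content": "You are a helpful assistant."},
  {"role": "user", "content": "What's Deep Learning?"},
]

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the AWQ INT4 checkpoint, letting accelerate place the shards across available GPUs.
model = AutoAWQForCausalLM.from_pretrained(
  model_id,
  torch_dtype=torch.float16,
  low_cpu_mem_usage=True,
  device_map="auto",
)

# return_dict=True yields a BatchEncoding (input_ids + attention_mask) that can be
# moved to GPU with .to('cuda') and unpacked into generate() via **inputs.
inputs = tokenizer.apply_chat_template(
  prompt,
  tokenize=True,
  add_generation_prompt=True,
  return_tensors="pt",
  return_dict=True,
).to('cuda')

outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```

Compared with the old snippet, the tokenized prompt is built after the model is loaded, `apply_chat_template(..., return_dict=True)` returns a dict so `**inputs` also carries the attention mask into `generate`, the encoding is moved with `.to('cuda')` instead of calling `.cuda()` on a bare tensor, and `outputs` is actually produced by `model.generate` before being decoded.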