Update README.md
Browse files
README.md
CHANGED
@@ -60,13 +60,6 @@ prompt = [
|
|
60 |
]
|
61 |
|
62 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
63 |
-
tokenizer.pad_token_id = tokenizer.eos_token_id
|
64 |
-
tokenizer.padding_side = "left"
|
65 |
-
|
66 |
-
terminators = [
|
67 |
-
tokenizer.eos_token_id,
|
68 |
-
tokenizer.convert_tokens_to_ids("<|eot_id|>"),
|
69 |
-
]
|
70 |
|
71 |
inputs = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt").cuda()
|
72 |
|
@@ -77,12 +70,24 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
77 |
device_map="auto",
|
78 |
)
|
79 |
|
80 |
-
outputs = model.generate(
|
81 |
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
|
82 |
```
|
83 |
|
84 |
### AutoAWQ
|
85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
Alternatively, one may want to run the model via `AutoAWQ`, even though it is built on top of 🤗 `transformers`; using 🤗 `transformers` directly, as described above, remains the recommended approach.
|
87 |
|
88 |
```python
|
@@ -97,8 +102,6 @@ prompt = [
|
|
97 |
]
|
98 |
|
99 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
100 |
-
tokenizer.pad_token_id = tokenizer.eos_token_id
|
101 |
-
tokenizer.padding_side = "left"
|
102 |
|
103 |
inputs = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt").cuda()
|
104 |
|
@@ -110,7 +113,7 @@ model = AutoAWQForCausalLM.from_pretrained(
|
|
110 |
fuse_layers=True,
|
111 |
)
|
112 |
|
113 |
-
outputs = model.generate(
|
114 |
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
|
115 |
```
|
116 |
|
|
|
60 |
]
|
61 |
|
62 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
inputs = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt").cuda()
|
65 |
|
|
|
70 |
device_map="auto",
|
71 |
)
|
72 |
|
73 |
+
outputs = model.generate(inputs, do_sample=True, max_new_tokens=256)
|
74 |
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
|
75 |
```
|
76 |
|
77 |
### AutoAWQ
|
78 |
|
79 |
+
In order to run inference with Llama 3.1 405B Instruct AWQ in INT4, both `torch` and `autoawq` need to be installed as follows:
|
80 |
+
|
81 |
+
```bash
|
82 |
+
pip install "torch>=2.2.0,<2.3.0" autoawq --upgrade
|
83 |
+
```
|
84 |
+
|
85 |
+
Then, `transformers` needs to be installed at version 4.43.0 or higher, as follows:
|
86 |
+
|
87 |
+
```bash
|
88 |
+
pip install "transformers[accelerate]>=4.43.0" --upgrade
|
89 |
+
```
|
90 |
+
|
91 |
Alternatively, one may want to run the model via `AutoAWQ`, even though it is built on top of 🤗 `transformers`; using 🤗 `transformers` directly, as described above, remains the recommended approach.
|
92 |
|
93 |
```python
|
|
|
102 |
]
|
103 |
|
104 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
|
|
|
|
105 |
|
106 |
inputs = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt").cuda()
|
107 |
|
|
|
113 |
fuse_layers=True,
|
114 |
)
|
115 |
|
116 |
+
outputs = model.generate(inputs, do_sample=True, max_new_tokens=256)
|
117 |
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
|
118 |
```
|
119 |
|