Improve code snippet

#3
opened by Xenova (HF staff)
Files changed (1)
  1. README.md +3 -6
README.md CHANGED
@@ -92,7 +92,7 @@ Alternatively, one may want to run that via `AutoAWQ` even though it's built on
 
 ```python
 import torch
-from autoawq import AutoAWQForCausalLM
+from awq import AutoAWQForCausalLM
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 model_id = "hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4"
@@ -102,18 +102,15 @@ prompt = [
 ]
 
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-inputs = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt").cuda()
-
 model = AutoAWQForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
     low_cpu_mem_usage=True,
     device_map="auto",
-    fuse_layers=True,
 )
 
-outputs = model.generate(inputs, do_sample=True, max_new_tokens=256)
+inputs = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True).to('cuda')
+outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256)
 print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
 ```
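
For reference, here is a sketch of how the snippet reads after this change, reconstructed from the hunks above: the import comes from `awq` rather than `autoawq`, `fuse_layers=True` is dropped from `from_pretrained`, tokenization happens after the model is loaded, and `generate` receives both `input_ids` and `attention_mask` via `**inputs`. The contents of `prompt` are elided in this diff, so the chat messages below are placeholders.

```python
import torch
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_id = "hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4"

# Placeholder chat messages: the actual prompt contents are not shown in this diff.
prompt = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is deep learning?"},
]

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the pre-quantized AWQ checkpoint, placing layers across available devices.
model = AutoAWQForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
)

# Tokenize after the model is loaded; return_dict=True yields input_ids plus
# attention_mask, and the whole BatchEncoding is moved to the GPU.
inputs = tokenizer.apply_chat_template(
    prompt,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to("cuda")

outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```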