Abhinav Kulkarni
committed on
Commit
•
60bf695
1
Parent(s):
6553ed5
Updated README
Browse files
README.md
CHANGED
@@ -43,6 +43,7 @@ git clone https://github.com/mit-han-lab/llm-awq \
|
|
43 |
```
|
44 |
|
45 |
```python
|
|
|
46 |
import torch
|
47 |
from awq.quantize.quantizer import real_quantize_model_weight
|
48 |
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextStreamer
|
@@ -84,6 +85,7 @@ prompt = f'''What is the difference between nuclear fusion and fission?
|
|
84 |
###Response:'''
|
85 |
|
86 |
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
|
|
|
87 |
output = model.generate(
|
88 |
inputs=input_ids,
|
89 |
temperature=0.7,
|
@@ -93,6 +95,9 @@ output = model.generate(
|
|
93 |
repetition_penalty=1.1,
|
94 |
eos_token_id=tokenizer.eos_token_id,
|
95 |
streamer=streamer)
|
|
|
|
|
|
|
96 |
```
|
97 |
|
98 |
## Evaluation
|
|
|
43 |
```
|
44 |
|
45 |
```python
|
46 |
+
import time
|
47 |
import torch
|
48 |
from awq.quantize.quantizer import real_quantize_model_weight
|
49 |
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextStreamer
|
|
|
85 |
###Response:'''
|
86 |
|
87 |
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
|
88 |
+
t1 = time.time()
|
89 |
output = model.generate(
|
90 |
inputs=input_ids,
|
91 |
temperature=0.7,
|
|
|
95 |
repetition_penalty=1.1,
|
96 |
eos_token_id=tokenizer.eos_token_id,
|
97 |
streamer=streamer)
|
98 |
+
t2 = time.time()
|
99 |
+
print("*"*80)
|
100 |
+
print(f"Generated {num_tokens/(t2-t1):.2f} token/s; {(t2-t1)*1000/num_tokens:.2f} ms/token")
|
101 |
```
|
102 |
|
103 |
## Evaluation
|