codefuse-admin committed
Commit cffbce0 · 1 parent: d38f5d1
Update README.md

README.md CHANGED
@@ -31,7 +31,7 @@ After undergoing 4-bit quantization, the CodeFuse-CodeLlama-34B-4bits model can
 
 🔥🔥🔥 2023-09-26 We are pleased to announce the release of the 4-bit quantized version of CodeFuse-CodeLlama-34B. Despite the quantization process, the model still achieves a remarkable 73.8% accuracy (greedy decoding) on the HumanEval pass@1 metric.
 
-🔥🔥🔥 2023-09-11 CodeFuse-CodeLlama34B has …
+🔥🔥🔥 2023-09-11 CodeFuse-CodeLlama-34B has achieved 74.4% pass@1 (greedy decoding) on HumanEval, which is the SOTA result for open-sourced LLMs at present.
 
 <br>
@@ -124,24 +124,22 @@ pip install -r requirements.txt
 import os
 import torch
 import time
-from …
-from auto_gptq import AutoGPTQForCausalLM
+from transformers import AutoTokenizer
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-def load_model_tokenizer(…
+def load_model_tokenizer(model_name_or_local_path):
     """
-    Load model and tokenizer based on the given model name or local path of downloaded model.
+    Load model and tokenizer based on the given model name or local path of the downloaded model.
     """
-    tokenizer = AutoTokenizer.from_pretrained(…
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_local_path,
                                               trust_remote_code=True,
                                               use_fast=False,
                                               legacy=False)
     tokenizer.padding_side = "left"
-    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<unk>")
-    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("</s>")
 
-    model = AutoGPTQForCausalLM.from_quantized(…
+    model = AutoGPTQForCausalLM.from_quantized(model_name_or_local_path,
                                                inject_fused_attention=False,
                                                inject_fused_mlp=False,
                                                use_cuda_fp16=True,
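For reference, the added lines of this hunk assemble into the loader below. This is a sketch, not the verbatim README snippet: the lines truncated in the diff view are filled in from the visible arguments, the trailing `disable_exllama`/`device_map` arguments are taken from the identical snippet later in this diff, and `legacy=False` assumes a transformers version whose slow Llama tokenizer accepts that flag.

```python
import os

from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

os.environ["TOKENIZERS_PARALLELISM"] = "false"

def load_model_tokenizer(model_name_or_local_path):
    """Load the quantized model and its tokenizer from a hub name or local path."""
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_local_path,
                                              trust_remote_code=True,
                                              use_fast=False,
                                              legacy=False)
    # Left padding keeps prompts flush against the generated tokens
    # when batching with a decoder-only model.
    tokenizer.padding_side = "left"

    # The commit drops the manual pad/eos-token overrides; the special-token
    # ids now come from the tokenizer config shipped with the checkpoint.
    model = AutoGPTQForCausalLM.from_quantized(model_name_or_local_path,
                                               inject_fused_attention=False,
                                               inject_fused_mlp=False,
                                               use_cuda_fp16=True,
                                               disable_exllama=False,
                                               device_map='auto')
    return model, tokenizer
```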
@@ -153,7 +151,7 @@ def load_model_tokenizer(model_path):
 
 def inference(model, tokenizer, prompt):
     """
-    Uset the given model and tokenizer to generate an answer for the …
+    Use the given model and tokenizer to generate an answer for the specified prompt.
     """
     st = time.time()
     prompt = prompt if prompt.endswith('\n') else f'{prompt}\n'
@@ -181,11 +179,10 @@ def inference(model, tokenizer, prompt):
 
 
 if __name__ == "__main__":
-    …
-    …
+    model_name_or_local_path = '<Model name (i.e. codefuse-ai/CodeFuse-CodeLlama-34B-4bits) or local path of the downloaded model>'
     prompt = 'Please write a QuickSort program in Python'
 
-    model, tokenizer = load_model_tokenizer(…
+    model, tokenizer = load_model_tokenizer(model_name_or_local_path)
     inference(model, tokenizer, prompt)
 ```
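A side note on `tokenizer.padding_side = "left"` in the loader above: with a decoder-only model, right padding would leave pad tokens between the prompt and its continuation when prompts are batched. The sketch below is hypothetical usage, not from the README; the prompts and generation parameters are illustrative, and `model`/`tokenizer` are assumed to come from `load_model_tokenizer()` above.

```python
# Hypothetical batched usage; assumes `model` and `tokenizer` were
# returned by load_model_tokenizer() above.
prompts = [
    'Please write a QuickSort program in Python\n',
    'Please write a binary search function in Python\n',
]
inputs = tokenizer(prompts, return_tensors='pt', padding=True).to(model.device)
# With left padding every sequence ends at the same position, so the model
# generates directly after the last real prompt token of each sequence.
generated_ids = model.generate(**inputs,
                               max_new_tokens=256,
                               eos_token_id=tokenizer.eos_token_id,
                               pad_token_id=tokenizer.pad_token_id)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
```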
@@ -319,29 +316,27 @@ pip install -r requirements.txt
 import os
 import torch
 import time
-from …
-from auto_gptq import AutoGPTQForCausalLM
+from transformers import AutoTokenizer
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-def load_model_tokenizer(…
+def load_model_tokenizer(model_name_or_local_path):
     """
     Load model and tokenizer based on the given model name or local path of the downloaded model.
     """
-    tokenizer = AutoTokenizer.from_pretrained(…
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_local_path,
                                               trust_remote_code=True,
                                               use_fast=False,
                                               legacy=False)
     tokenizer.padding_side = "left"
-    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<unk>")
-    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("</s>")
 
-    model = AutoGPTQForCausalLM.from_quantized(…
+    model = AutoGPTQForCausalLM.from_quantized(model_name_or_local_path,
                                                inject_fused_attention=False,
                                                inject_fused_mlp=False,
                                                use_cuda_fp16=True,
                                                disable_exllama=False,
-                                               device_map='auto'  # …
+                                               device_map='auto'  # Support multi-GPUs
     )
     return model, tokenizer
 
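Two of the `from_quantized` arguments in this hunk do the heavy lifting: `disable_exllama=False` keeps the exllama kernels enabled for the 4-bit matrix multiplications, and `device_map='auto'` hands weight placement to accelerate so the checkpoint can be sharded across however many GPUs are visible, as the inline comment notes.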
@@ -366,7 +361,7 @@ def inference(model, tokenizer, prompt):
         do_sample=True,
         max_new_tokens=512,
         eos_token_id=tokenizer.eos_token_id,
-        pad_token_id=tokenizer.pad_token_id …
+        pad_token_id=tokenizer.pad_token_id
     )
     print(f'generated tokens num is {len(generated_ids[0][input_ids.size(1):])}')
     outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
@@ -376,11 +371,10 @@ def inference(model, tokenizer, prompt):
 
 
 if __name__ == "__main__":
-    …
-    …
+    model_name_or_local_path = '<Model name (i.e. codefuse-ai/CodeFuse-CodeLlama-34B-4bits) or local path of the downloaded model>'
     prompt = 'Please implement a quick sort algorithm in Python'
 
-    model, tokenizer = load_model_tokenizer(…
+    model, tokenizer = load_model_tokenizer(model_name_or_local_path)
     inference(model, tokenizer, prompt)
 ```