jeevavijay10 committed on
Commit
c09cb0e
·
1 Parent(s): 281252e

change codet5p-770m

Browse files
Files changed (3) hide show
  1. app-autogptq.py +70 -0
  2. app.py +8 -43
  3. requirements.txt +2 -1
app-autogptq.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import gradio as gr
3
+ from transformers import AutoTokenizer, pipeline, logging
4
+ from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
5
+
6
+ model_name_or_path = "TheBloke/WizardCoder-Guanaco-15B-V1.1-GPTQ"
7
+ model_basename = "gptq_model-4bit-128g"
8
+
9
+ use_triton = False
10
+
11
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
12
+
13
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
14
+
15
+ quantize_config = BaseQuantizeConfig(
16
+ bits=4, # quantize model to 4-bit
17
+ group_size=128, # it is recommended to set the value to 128
18
+ desc_act=False, # setting this to False can significantly speed up inference, but perplexity may be slightly worse
19
+ )
20
+
21
+ model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
22
+ model_basename=model_basename,
23
+ use_safetensors=True,
24
+ trust_remote_code=False,
25
+ device=device,
26
+ use_triton=use_triton,
27
+ quantize_config=quantize_config,
28
+ cache_dir="models/"
29
+ )
30
+
31
+ """
32
+ To download from a specific branch, use the revision parameter, as in this example:
33
+
34
+ model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
35
+ revision="gptq-4bit-32g-actorder_True",
36
+ model_basename=model_basename,
37
+ use_safetensors=True,
38
+ trust_remote_code=False,
39
+ device="cuda:0",
40
+ quantize_config=None)
41
+ """
42
+
43
+
44
+ def code_gen(text):
45
+ logging.set_verbosity(logging.CRITICAL)
46
+
47
+ print("*** Pipeline:")
48
+ pipe = pipeline(
49
+ "text-generation",
50
+ model=model,
51
+ tokenizer=tokenizer,
52
+ max_new_tokens=124,
53
+ temperature=0.7,
54
+ top_p=0.95,
55
+ repetition_penalty=1.15
56
+ )
57
+
58
+ response = pipe(text)
59
+ print(response)
60
+
61
+ return response[0]['generated_text']
62
+
63
+
64
+ iface = gr.Interface(fn=code_gen,
65
+ inputs=gr.inputs.Textbox(
66
+ label="Input Source Code"),
67
+ outputs="text",
68
+ title="Code Generation")
69
+
70
+ iface.launch()
app.py CHANGED
@@ -1,57 +1,22 @@
1
  import torch
2
  import gradio as gr
3
- from transformers import AutoTokenizer, pipeline, logging
4
- from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
5
 
6
- model_name_or_path = "TheBloke/WizardCoder-Guanaco-15B-V1.1-GPTQ"
7
- model_basename = "gptq_model-4bit-128g"
8
 
9
- use_triton = False
10
 
11
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
12
-
13
- tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
14
-
15
- model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
16
- model_basename=model_basename,
17
- use_safetensors=True,
18
- trust_remote_code=False,
19
- device=device,
20
- use_triton=use_triton,
21
- quantize_config=None,
22
- cache_dir="models/"
23
- )
24
-
25
- """
26
- To download from a specific branch, use the revision parameter, as in this example:
27
-
28
- model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
29
- revision="gptq-4bit-32g-actorder_True",
30
- model_basename=model_basename,
31
- use_safetensors=True,
32
- trust_remote_code=False,
33
- device="cuda:0",
34
- quantize_config=None)
35
- """
36
 
37
 
38
  def code_gen(text):
39
- # input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
40
- # output = model.generate(
41
- # inputs=input_ids, temperature=0.7, max_new_tokens=124)
42
- # print(tokenizer.decode(output[0]))
43
-
44
- # Inference can also be done using transformers' pipeline
45
-
46
- # Prevent printing spurious transformers error when using pipeline with AutoGPTQ
47
  logging.set_verbosity(logging.CRITICAL)
48
 
49
  print("*** Pipeline:")
50
  pipe = pipeline(
51
- "text-generation",
52
- model=model,
53
- tokenizer=tokenizer,
54
- max_new_tokens=124,
55
  temperature=0.7,
56
  top_p=0.95,
57
  repetition_penalty=1.15
@@ -59,7 +24,7 @@ def code_gen(text):
59
 
60
  response = pipe(text)
61
  print(response)
62
-
63
  return response[0]['generated_text']
64
 
65
 
 
1
  import torch
2
  import gradio as gr
3
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, logging
 
4
 
 
 
5
 
6
+ checkpoint = "Salesforce/codet5p-770m"
7
 
8
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
9
+ model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, cache_dir="models/")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
  def code_gen(text):
 
 
 
 
 
 
 
 
13
  logging.set_verbosity(logging.CRITICAL)
14
 
15
  print("*** Pipeline:")
16
  pipe = pipeline(
17
+ model=checkpoint,
18
+ # tokenizer=tokenizer,
19
+ max_new_tokens=64,
 
20
  temperature=0.7,
21
  top_p=0.95,
22
  repetition_penalty=1.15
 
24
 
25
  response = pipe(text)
26
  print(response)
27
+
28
  return response[0]['generated_text']
29
 
30
 
requirements.txt CHANGED
@@ -2,4 +2,5 @@ transformers
2
  # tiktoken
3
  torch
4
  torchvision
5
- auto-gptq
 
 
2
  # tiktoken
3
  torch
4
  torchvision
5
+ auto-gptq
6
+ bitsandbytes