import torch
import gradio as gr
from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

model_name_or_path = "TheBloke/WizardCoder-Guanaco-15B-V1.1-GPTQ"
model_basename = "gptq_model-4bit-128g"
use_triton = False
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
quantize_config = BaseQuantizeConfig(
    bits=4,          # quantize the model to 4-bit precision
    group_size=128,  # 128 is the recommended group size
    desc_act=False,  # False significantly speeds up inference, at a slight cost in perplexity
)
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=False,
    device=device,
    use_triton=use_triton,
    quantize_config=quantize_config,
    cache_dir="models/",
)
""" | |
To download from a specific branch, use the revision parameter, as in this example: | |
model = AutoGPTQForCausalLM.from_quantized(model_name_or_path, | |
revision="gptq-4bit-32g-actorder_True", | |
model_basename=model_basename, | |
use_safetensors=True, | |
trust_remote_code=False, | |
device="cuda:0", | |
quantize_config=None) | |
""" | |
def code_gen(text):
    logging.set_verbosity(logging.CRITICAL)
    print("*** Pipeline:")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=124,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.15,
    )
    response = pipe(text)
    print(response)
    return response[0]["generated_text"]
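"""
A possible refactor (a sketch, not part of the original Space): code_gen above
rebuilds the pipeline on every request. Constructing it once at module scope,
reusing the same model and tokenizer objects, avoids that per-call cost:

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=124,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15,
)

def code_gen(text):
    return pipe(text)[0]["generated_text"]
"""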
iface = gr.Interface(
    fn=code_gen,
    inputs=gr.Textbox(label="Input Source Code"),  # gr.inputs.Textbox is removed in current Gradio; gr.Textbox is the supported form
    outputs="text",
    title="Code Generation",
)
iface.launch()
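# Optional launch settings, shown as a sketch (these are standard Gradio
# parameters; Hugging Face Spaces supplies its own host and port, so they
# mainly matter for local runs):
# iface.launch(server_name="0.0.0.0", server_port=7860, share=False)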