How to run it ?
I have tried this:
#############################################################################
from transformers import AutoTokenizer, pipeline, logging,AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import argparse
quantized_model_dir = "/media/galaxy/guanaco/guanaco-65B-GPTQ"
model_basename = "Guanaco-65B-GPTQ-4bit.act-order"
use_triton = False
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=True)
quantize_config = BaseQuantizeConfig(
bits=4,
group_size=128,
desc_act=False
)
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir,
use_safetensors=True,
model_basename=model_basename,
device="cuda:0",
use_triton=use_triton,
quantize_config=quantize_config)
Prevent printing spurious transformers error when using pipeline with AutoGPTQ
logging.set_verbosity(logging.CRITICAL)
prompt = "Tell me about AI"
prompt_template=f'''### Human: {prompt}
Assistant:'''
print("*** Pipeline:")
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tok,
max_new_tokens=512,
temperature=0.7,
top_p=0.95,
repetition_penalty=1.15
)
print(pipe(prompt_template)[0]['generated_text'])
print("\n\n*** Generate:")
input_ids = tok(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
print(tokenizer.decode(output[0]))
#############################################################################
but got an error :
Exception: data did not match any variant of untagged enum PyNormalizerTypeWrapper at line 49
column 3
I am using an 4x 40G station , any suggestion ? or it will be great if a demo python file could be provided ,thank you .