import torch
from peft import PeftModel
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from llama_condense_monkey_patch import replace_llama_with_condense

## Config device params & load the base model plus the QLoRA adapter.
peft_model_id = "mingkuan/longchat-7b-qlora-customer-support"
base_model_id = "lmsys/longchat-7b-16k"

## LongChat uses condensed rotary embeddings; apply the monkey patch
## with the model's condense ratio before loading weights.
config = AutoConfig.from_pretrained(base_model_id)
replace_llama_with_condense(config.rope_condense_ratio)

tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False)

## 4-bit NF4 quantization with double quantization, so the 7B model
## fits on a single consumer GPU.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    return_dict=True,
    trust_remote_code=True,
    quantization_config=nf4_config,
    torch_dtype=torch.float16,
    device_map="auto",
)

## Attach the fine-tuned LoRA adapter on top of the quantized base model.
model = PeftModel.from_pretrained(model, peft_model_id)


def generate_prompt(query):
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
You are a customer support assistant that can extract user request intent and category, and then provide appropriate answers.
If the user input is related to customer support domain, please try to generate a json string that contains extracted category and intent, and the proper response.
If user input is unrelated to customer support domain, please try to answer it in natural language.

Example run:
Input: Would it be possible to cancel the order I made?
Output: "Category": "ORDER", "Intent": "cancel_order", "Answer": "Sure, I definitely can help you with that. Can you provide me your order number for the cancelation?"

### Input:
{query}
"""


def getLLMResponse(prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
    output = model.generate(
        inputs=input_ids,
        do_sample=True,  ## required for temperature to take effect
        temperature=0.5,
        max_new_tokens=256,
    )
    ## Decode only the newly generated tokens, omitting the user input part.
    response = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return response


query = "help me to setup a new shipping address?"
response = getLLMResponse(generate_prompt(query))
print(f"\nUserInput:{query}\n\nLLM:\n{response}\n\n")