from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from llama_condense_monkey_patch import replace_llama_with_condense
from peft import PeftModel
import torch

# Fine-tuned LoRA adapter for customer support, and the base model it was trained from
peft_model_id = "mingkuan/longchat-7b-qlora-customer-support"
base_model_id = "lmsys/longchat-7b-16k"

# LongChat reaches its 16k context via RoPE condensation; apply the monkey patch
# with the condense ratio stored in the model config before loading the weights.
config = AutoConfig.from_pretrained(base_model_id)
replace_llama_with_condense(config.rope_condense_ratio)

tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False)

kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}

# 4-bit NF4 quantization (the QLoRA recipe) for loading the base model
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    return_dict=True,
    trust_remote_code=True,
    quantization_config=nf4_config,  # load_in_4bit is already set inside nf4_config
    **kwargs
)
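
# Optional sanity check: report how much memory the 4-bit base model occupies
print(f"Base model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")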

# Attach the fine-tuned LoRA adapter weights on top of the quantized base model
model = PeftModel.from_pretrained(model, peft_model_id)
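# Inference only: put the model in eval mode (disables dropout)
model.eval()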


# Wrap the user query in the Alpaca-style instruction template
def generate_prompt(query):
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
You are a customer support assistant that can extract user request intent and category, and then provide appropriate answers. If the user input is related to customer support domain, please try to generate a json string that contains extracted category and intent, and the proper response.
If user input is unrelated to customer support domain, please try to answer it in natural language.

Example run:
Input: Would it be possible to cancel the order I made?
Output: "Category": "ORDER", "Intent": "cancel_order", "Answer": "Sure, I definitely can help you with that. Can you provide me your order number for the cancelation?"

### Input:
{query}

"""


def getLLMResponse(prompt):
    device = "cuda"
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
    # do_sample=True is required for temperature to have any effect
    output = model.generate(inputs=input_ids, do_sample=True, temperature=0.5, max_new_tokens=256)
    # Decode only the newly generated tokens, skipping the echoed prompt
    generated = output[0][input_ids.shape[1]:]
    response = tokenizer.decode(generated, skip_special_tokens=True)
    return response


# Run a sample customer-support query end to end
query = 'help me to setup a new shipping address?'
response = getLLMResponse(generate_prompt(query))
print(f'\nUserInput:{query}\n\nLLM:\n{response}\n\n')
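
# A couple of additional (hypothetical) queries to exercise the intent/category extraction
for q in ['what payment options do you accept?', 'I want a refund for my last order']:
    print(f'\nUserInput:{q}\n\nLLM:\n{getLLMResponse(generate_prompt(q))}\n\n')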