经测试，此版本的效果较好😀

I used a 50k Chinese dataset, which combines alpaca_chinese_instruction_dataset with the Chinese conversation data from the sharegpt-90k dataset. I fine-tuned the model for 3 epochs on a single 4090 GPU with cutoff_len=1024.

Use in Python:

from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import PeftModel
import torch


# Load the base LLaMA-7B tokenizer and 8-bit weights, then attach the LoRA
# adapter on top of the base model.
tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")

model = LlamaForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(
    model,
    "Laurie/lora-instruct-chat-50k-cn-en",
    torch_dtype=torch.float16,
    device_map={"": 0},
)
# Inference only: make sure dropout (including LoRA dropout) is disabled.
model.eval()

# NOTE: the model itself is deliberately NOT moved with `model.to(device)`.
# It was loaded with `load_in_8bit=True` and a `device_map`, so it is already
# placed on the target device, and transformers rejects `.to()` on 8-bit
# models. Only the input tensors need to be moved.
device = "cuda" if torch.cuda.is_available() else "cpu"

inputs = tokenizer("什么是自然语言处理？", return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        # Pass the mask explicitly so generation does not have to guess
        # which positions are padding.
        attention_mask=inputs["attention_mask"],
        max_new_tokens=129,
    )

# batch_decode accepts the generated tensor directly; no numpy round-trip
# is needed.
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))