---
library_name: peft
base_model: meta-llama/Llama-2-70b-chat-hf
---

# Chadgpt Llama2 70b conversation

Running this model requires a minimum of 36 GB of VRAM.

## Colab Example

https://colab.research.google.com/drive/10ZBNDK3lRn_IdPgSFQIO2f2UphP1MiVi?usp=sharing

## Install Prerequisites

```bash
!pip install peft
!pip install transformers
!pip install bitsandbytes
!pip install accelerate
```

## Log in with a Hugging Face Token

```python
# You need a Hugging Face token that can access Llama 2
from huggingface_hub import notebook_login
notebook_login()
```

## Download Model

```python
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

device = "cuda" if torch.cuda.is_available() else "cpu"

# Quantize the base model to 4-bit NF4 with double quantization so it fits in GPU memory
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

peft_model_id = "danjie/Chadgpt-Llama2-70b-conversation"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, device_map='cuda', quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the LoRA adapter on top of the quantized base model
model = PeftModel.from_pretrained(model, peft_model_id)
```

## Inference

```python
# Run this cell to start a new conversation
conversation_history = []

def format_conversation(conversation: list[str]) -> str:
    formatted_conversation = ""

    # Check if the conversation has more than two turns
    if len(conversation) > 2:
        # Process all but the last two turns
        for i in range(len(conversation) - 2):
            if i % 2 == 0:
                formatted_conversation += "" + conversation[i] + "\n"
            else:
                formatted_conversation += "" + conversation[i] + "\n"

    # Process the last two turns
    if len(conversation) >= 2:
        formatted_conversation += "" + conversation[-2] + "\n"
        formatted_conversation += "" + conversation[-1]

    return formatted_conversation

def talk_with_llm(chat: str) -> str:
    # Append the user turn and an empty placeholder for the model's reply
    conversation_history.append(chat)
    conversation_history.append("")
    conversation = format_conversation(conversation_history)

    # Encode and move tensors onto the GPU if applicable
    encoded_input = tokenizer(conversation, return_tensors='pt')
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

    output = model.generate(**encoded_input, max_new_tokens=256)
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Keep only the newly generated text and record it as the model's turn
    response = response[len(conversation):]
    conversation_history.pop()
    conversation_history.append(response)

    return response
```
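
As a quick sanity check after running the cells above, the assistant can be queried as sketched below. The prompts are only illustrative (not from the model card), and the replies will vary from run to run; each call appends both turns to `conversation_history`, so later calls continue the same conversation.

```python
# Hypothetical example prompts; any text works here.
print(talk_with_llm("Tell me about llamas."))

# A follow-up question reuses the accumulated conversation history.
print(talk_with_llm("Can they be kept as pets?"))

# To start over, clear the history (or rerun the inference cell).
conversation_history.clear()
```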