|
--- |
|
license: unlicense |
|
--- |
|
Running OPT-6.7B with LoRA adapters locally on Windows! |
|
|
|
# bitsandbytes |
|
|
|
I needed to get bitsandbytes working in my venv: |
|
I replaced `C:\Users\user\Desktop\test\peft\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py` with the `main.py` provided in this repository. |
|
I also added a .dll file at this location: C:\Users\user\Desktop\test\peft\venv\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda116.dll |
|
|
|
|
|
|
|
# Training Script |
|
|
|
The script below is adapted from the Hugging Face PEFT example notebook (https://github.com/huggingface/peft/commit/df0e1fb59266c9903ddd6dbfe7339bcd2068d150). |
|
|
|
``` |
|
#load |
|
|
|
|
|
import os |
|
os.environ["CUDA_VISIBLE_DEVICES"]="0" |
|
import torch |
|
import torch.nn as nn |
|
import bitsandbytes as bnb |
|
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM |
|
|
|
# Load the base checkpoint with 8-bit weights; `device_map='auto'` leaves
# layer placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-6.7b",
    device_map='auto',
    load_in_8bit=True,
)

# The tokenizer must come from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-6.7b")
|
|
|
|
|
#post-processing |
|
|
|
# Freeze every base-model weight; the adapters are added and trained later.
for weight in model.parameters():
    weight.requires_grad = False
    if weight.ndim != 1:
        continue
    # 1-D parameters (e.g. layernorm) are small, so keep them in fp32 for stability.
    weight.data = weight.data.float()

# Gradient checkpointing reduces the number of stored activations.
model.gradient_checkpointing_enable()
# NOTE(review): presumably needed so gradients still flow through the frozen
# base model when checkpointing is on - confirm against the transformers docs.
model.enable_input_require_grads()
|
|
|
class CastOutputToFloat(nn.Sequential):
    """Sequential container whose output is always returned as ``torch.float32``."""

    def forward(self, x):
        """Run the wrapped modules on ``x`` and cast the result to fp32."""
        result = super().forward(x)
        return result.to(torch.float32)
|
# Wrap the existing LM head so the model's final output comes back as float32.
base_lm_head = model.lm_head
model.lm_head = CastOutputToFloat(base_lm_head)
|
|
|
# apply lora |
|
|
|
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; each parameter must provide
            ``numel()`` and a ``requires_grad`` attribute.

    A model without any parameters is reported as 0.0% trainable instead of
    raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio: an empty model would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )
|
|
|
# apply lora 2 |
|
|
|
from peft import LoraConfig, get_peft_model |
|
|
|
# LoRA settings: rank-16 adapters attached to the modules named
# `q_proj` and `v_proj`, with no bias terms trained.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the frozen base model with the adapters, then report how much of it trains.
model = get_peft_model(model, config)
print_trainable_parameters(model)
|
|
|
# training |
|
|
|
import transformers |
|
from datasets import load_dataset |
|
data = load_dataset("Abirate/english_quotes")
# Tokenize the `quote` column in batches.
data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)

training_args = transformers.TrainingArguments(
    output_dir='outputs',
    max_steps=200,
    warmup_steps=100,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    fp16=True,
    logging_steps=1,
)
# mlm=False selects the language-modeling collator's non-masked mode.
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    data_collator=collator,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

# push to huggingface txtloras
model.push_to_hub("Yoshiii/opt-6.7b-lora", use_auth_token=True)
|
|
|
|
|
# inference |
|
|
|
# Training leaves the model in train mode (dropout active) with the KV cache
# switched off; undo both so generation is repeatable and not needlessly slow.
model.eval()
model.config.use_cache = True

# Put the prompt tensors on the same device as the model before generating.
batch = tokenizer("Two things are infinite: ", return_tensors='pt').to(model.device)

with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))
|
``` |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Inference (loading this repo's LoRA from the Hugging Face Hub) |
|
|
|
``` |
|
import torch |
|
from peft import PeftModel, PeftConfig |
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
|
peft_model_id = "Yoshiii/opt-6.7b-lora"

# The adapter config names the base checkpoint to load underneath the LoRA.
config = PeftConfig.from_pretrained(peft_model_id)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map='auto',
    load_in_8bit=True,
    return_dict=True,
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)
|
|
|
|
|
# Make sure the model is in eval mode so dropout layers are inactive while generating.
model.eval()

# Put the prompt tensors on the same device as the model before generating.
batch = tokenizer("Two things are infinite: ", return_tensors='pt').to(model.device)

with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))
|
``` |
|
|
|
Two things are infinite: the universe and human stupidity; and I'm not sure about the universe. -Albert Einstein I'm not sure about the universe either. |
|
|
|
|
|
This output resembles the training data. If you run the model without applying the LoRA, the output will usually look worse. If you retrain the LoRA, be aware that the new LoRA will not produce the same outputs, even if you use the same settings. |
|
Inference should usually be deterministic when using the same LoRA, or when running without a LoRA. |
|
|
|
|
|
|