# basicmood / format_data.py
# MoodChartAI's picture
# Upload 2 files
# 41d7246 verified
from peft import PeftModel
import pandas as pd
import shelve
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from transformers import AutoModelForCausalLM
import torch
from datasets import load_dataset, Dataset
import datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
#model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
moodb = shelve.open('mood.db')
happy, sad = moodb['happy'][1].split('\n'), moodb['sad'][1].split('\n')
for i, h in enumerate(happy):
happy[i] = "Prompt:"+h+"Completion: You're feeling happy"
for i, s in enumerate(sad):
sad[i] = "Prompt:"+s+"Completion: You're feeling sad"
happy = list(zip(happy, ["You're happy" for d in range(len(happy))]))
sad = list(zip(sad, ["You're sad" for d in range(len(sad))]))
data = sad+happy
#print(data)
df = pd.DataFrame(data, columns=['Prompt', 'Completion'])
#print(df)
def tokenize(sample):
    """Tokenize the ``Prompt`` field of *sample* with the module-level tokenizer.

    The text is padded (``padding=True``) and truncated to at most 512
    tokens. The tokenizer's output is returned unchanged.
    """
    return tokenizer(
        sample['Prompt'],
        padding=True,
        truncation=True,
        max_length=512,
    )
# Wrap the DataFrame in a datasets.Dataset, then tokenize it in batches.
# All original columns are dropped so only the tokenizer's output remains.
data = Dataset.from_pandas(df)
tokenized_data = data.map(
    tokenize,
    batched=True,
    desc="Tokenizing data",
    remove_columns=data.column_names,
)
# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized base model; device_map={"": 0} maps the whole model to
# device 0.
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-neo-1.3B",
    device_map={"": 0},
    trust_remote_code=True,
    quantization_config=bnb_config,
)

# LoRA adapters on the attention projections. GPT-Neo exposes separate
# q_proj / k_proj / v_proj / out_proj linear layers and has no fused "Wqkv"
# layer, so listing "Wqkv" would leave the query/key/value projections
# without adapters.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model so that only the LoRA adapter weights are trained.
model = get_peft_model(model, lora_config)
# Training hyperparameters.
# NOTE(review): both max_steps and num_train_epochs are set; per the
# TrainingArguments documentation max_steps takes precedence -- confirm
# which of the two is actually intended.
training_arguments = TrainingArguments(
    output_dir="Multi-lingual-finetuned-med-text",
    # Batching
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    # Optimisation schedule
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    max_steps=55550,
    num_train_epochs=1,
    # Checkpointing and logging
    save_strategy="epoch",
    logging_steps=1000,
)

# Causal-LM collation (mlm=False): no masked-language-modelling objective.
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_data,
    data_collator=collator,
)
trainer.train()

# Disabled post-training step: load a saved adapter checkpoint onto the model
# and merge it into the base weights.
#peft_model = PeftModel.from_pretrained(model, "/root/projects/Multi-lingual-finetuned-med-text/checkpoint-10/", from_transformers=True)
#model = peft_model.merge_and_unload()
# model