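"""Supervised fine-tuning (SFT) of GPT-J-6B on the "Dahoas/full-hh-rlhf"
Helpful & Harmless dialogue dataset with trlx.

Evaluation generations are scored with the reward model returned by
`create_reward_fn` from the companion `ppo_hh` script.
"""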

import json
import sys

from datasets import load_dataset
from ppo_hh import create_reward_fn

import trlx
from trlx.data.default_configs import (
    ModelConfig,
    OptimizerConfig,
    SchedulerConfig,
    SFTConfig,
    TokenizerConfig,
    TrainConfig,
    TRLConfig,
)

default_config = TRLConfig(
    train=TrainConfig(
        seq_length=1024,
        epochs=100,
        total_steps=10000,
        batch_size=4,
        checkpoint_interval=10000,
        eval_interval=1000,
        pipeline="PromptPipeline",
        trainer="AccelerateSFTTrainer",
        checkpoint_dir="checkpoints/sft_hh",
    ),
    # Fine-tune the full model (num_layers_unfrozen=-1 unfreezes every layer).
    model=ModelConfig(model_path="EleutherAI/gpt-j-6B", num_layers_unfrozen=-1),
    # Left-truncate long prompts so the most recent dialogue turns are kept
    # within seq_length.
    tokenizer=TokenizerConfig(tokenizer_path="EleutherAI/gpt-j-6B", truncation_side="left"),
    optimizer=OptimizerConfig(name="adamw", kwargs=dict(lr=1e-6, betas=(0.9, 0.95), eps=1.0e-8, weight_decay=1.0e-6)),
    scheduler=SchedulerConfig(name="cosine_annealing", kwargs=dict(T_max=100000000, eta_min=1e-6)),
    method=SFTConfig(
        name="sftconfig",
        gen_kwargs=dict(max_new_tokens=128, top_k=20, top_p=1.0, do_sample=True),
    ),
)


def preprocess(sample):
    # Concatenate the dialogue prompt with the human-preferred ("chosen")
    # completion to form a single training string for SFT.
    sample["chosen_sample"] = sample["prompt"] + sample["chosen"]
    return sample
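# Sketch of the resulting field (the record below is illustrative, not an
# actual dataset entry):
#   preprocess({"prompt": "Human: Hi\n\nAssistant:", "chosen": " Hello!"})
#   -> adds "chosen_sample": "Human: Hi\n\nAssistant: Hello!"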


def main(hparams={}):
    # Merge any command-line overrides into the default config.
    config = TRLConfig.update(default_config, hparams)

    dataset = load_dataset("Dahoas/full-hh-rlhf").map(preprocess)
    reward_fn = create_reward_fn()

    trlx.train(
        config=config,
        samples=dataset["train"]["chosen_sample"],
        eval_prompts=dataset["test"]["prompt"][:280],
        # Score evaluation generations with the reward model.
        metric_fn=lambda **kwargs: {"reward": reward_fn(**kwargs)},
        # Stop generation when the model begins a new dialogue turn.
        stop_sequences=["Human:", "human:", "Assistant:", "assistant:"],
    )
if __name__ == "__main__":
hparams = {} if len(sys.argv) == 1 else json.loads(sys.argv[1])
main(hparams)
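# Example invocation with config overrides (the JSON keys mirror the nested
# TRLConfig fields above; the filename is assumed):
#   python sft_hh.py '{"train": {"total_steps": 200, "batch_size": 2}}'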