# Data paths and options when using EleutherAI cluster
#
# NOTE: this file is YAML-style (a flow mapping with `#` comments), not strict JSON.
# Every comment must end at a line break; do not collapse the file onto one line,
# or the leading `#` will comment out the whole mapping.
{
  # Dataset paths for the train / validation / test splits.
  # You may include multiple distinct datasets in each list if desired.
  "train_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_text_document"],
  "valid_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_val_text_document"],
  "test_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_test_text_document"],

  # If using multiple datasets, provide weights for them to be sampled with:
  # "train_data_weights": [1.0, 2.0],
  # "test_data_weights": [2.0, 1.0],
  # "valid_data_weights": [0.5, 0.4],

  # If you would like the code to create val and test datasets from your
  # training set, use the following instead of the three *_data_paths keys above.
  # "split" determines the relative size of train, val, and test.
  # (The value must be quoted: a bare 995,4,1 would be split at the commas
  # inside this flow mapping. String form assumed here -- confirm against the consumer.)
  # "split": "995,4,1",
  # "data_path": "/mnt/ssd-1/data/enwik8/enwik8_text_document",

  # Tokenizer files (GPT-2 vocab and merges, judging by the file names).
  "vocab_file": "/mnt/ssd-1/data/gpt2-vocab.json",
  "merge_file": "/mnt/ssd-1/data/gpt2-merges.txt",

  # Checkpoint locations; "save" and "load" point at the same directory.
  "save": "/mnt/ssd-1/checkpoints",
  "load": "/mnt/ssd-1/checkpoints",

  # Logging output locations.
  "tensorboard_dir": "/mnt/ssd-1/tensorboard",
  "log_dir": "/mnt/ssd-1/logs",

  # Weights & Biases team / project / group identifiers.
  "wandb_team": "eleutherai",
  "wandb_project": "neox",
  "wandb_group": "example"
}