
Train

Environment

cd scripts
python -m venv venv
source venv/bin/activate
pip install -U -r requirements.in
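
requirements.in itself is not shown here; a plausible minimal set, inferred from the tools used in the rest of this walkthrough (exact contents and pins may differ):

# requirements.in (sketch; the actual file may differ)
litgpt[all]
litdata
tokenizers
transformers
safetensors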

Tokenizer

python -B train_tokenizer.py
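
train_tokenizer.py is not reproduced here; a minimal sketch of training a byte-level BPE tokenizer with the Hugging Face tokenizers library (vocabulary size, special tokens, and file paths are illustrative assumptions):

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

# byte-level BPE; vocab size and special tokens are assumptions
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
trainer = BpeTrainer(vocab_size=32768, special_tokens=['<|endoftext|>'])

# train on a placeholder corpus and save the result for the later steps
tokenizer.train(['corpus.txt'], trainer)
tokenizer.save('../tokenizer.json')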

Dataset

python -B prepare_pretrain_dataset.py
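
prepare_pretrain_dataset.py is likewise not reproduced; a hedged sketch of how such a script typically packs token ids with litdata's optimize (input files, tokenizer path, and chunk size are assumptions, and the exact optimize arguments vary across litdata versions):

from functools import partial

import torch
from litdata import optimize, TokensLoader
from tokenizers import Tokenizer

def tokenize_fn(path, tokenizer=None):
    # yield one tensor of token ids per input file; litdata packs
    # consecutive tensors into fixed-size chunks on disk
    with open(path, encoding='utf-8') as f:
        text = f.read()
    yield torch.tensor(tokenizer.encode(text).ids, dtype=torch.long)

if __name__ == '__main__':
    tokenizer = Tokenizer.from_file('../tokenizer.json')  # assumed path
    optimize(
        fn=partial(tokenize_fn, tokenizer=tokenizer),
        inputs=['corpus.txt'],  # placeholder input list
        output_dir='../pretrain-data/',
        chunk_size=(2048 + 1) * 1024,  # tokens per chunk; a multiple of the block size
        item_loader=TokensLoader(),
    )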
To sanity-check the prepared dataset, count the available training blocks:

from litdata import StreamingDataset, TokensLoader

dataset = StreamingDataset(
  input_dir='../pretrain-data/',
  # one extra token per block, so each block yields 2048 inputs
  # plus 2048 shifted next-token targets
  item_loader=TokensLoader(block_size=2048 + 1),
)

print(len(dataset))
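
The same dataset pairs with StreamingDataLoader for batched iteration; a quick sketch (batch size and worker count are arbitrary):

from litdata import StreamingDataset, StreamingDataLoader, TokensLoader

dataset = StreamingDataset(
  input_dir='../pretrain-data/',
  item_loader=TokensLoader(block_size=2048 + 1),
)
loader = StreamingDataLoader(dataset, batch_size=4, num_workers=2)

for batch in loader:
  print(batch.shape)  # (4, 2049) tensor of token ids
  break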

Model

Pretrain

Pretrain the model:

litgpt pretrain --config ./pretrain-model.yaml
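
pretrain-model.yaml is not reproduced in this section; a rough sketch of its likely shape, assuming litgpt's standard pretrain config fields (every value below is an illustrative assumption, not the model's actual setting):

# sketch only; the real settings live in ./pretrain-model.yaml
model_config:
  block_size: 2048          # assumed: matches the TokensLoader block size above
  vocab_size: 32768         # assumed tokenizer vocabulary size
  n_layer: 12               # assumed depth
  n_head: 12                # assumed attention heads
  n_embd: 768               # assumed hidden size
out_dir: out/pretrain
precision: bf16-mixed
tokenizer_dir: ../          # assumed location of tokenizer.json
data:
  class_path: litgpt.data.LitData
  init_args:
    data_path: ../pretrain-data/
train:
  global_batch_size: 512    # assumed
  micro_batch_size: 4       # assumed
  max_seq_length: 2048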
Convert the checkpoint to a Hugging Face transformers layout, then place the transformers-style config.json next to both copies of the weights:

litgpt convert_from_litgpt out/pretrain/final/ out/converted_model
cp config.json out/pretrain/final/
cp config.json out/converted_model/
Finally, re-save the converted PyTorch state dict as safetensors:

import torch
from safetensors.torch import save_file

# load the converted checkpoint on CPU and write it back out as safetensors
state_dict = torch.load('out/converted_model/model.pth', map_location='cpu')
save_file(state_dict, 'out/converted_model/model.safetensors')
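
With config.json and model.safetensors in place, the checkpoint should load directly with transformers; a quick hedged check (assumes the architecture named in config.json is one transformers supports):

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('out/converted_model/')
print(sum(p.numel() for p in model.parameters()))  # parameter count sanity check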

Evaluate

A quick first pass over six common benchmarks:

litgpt evaluate --tasks 'hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge' --out_dir 'evaluate-quick/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
| Tasks | Version | Filter | n-shot | Metric | Value | | Stderr |
|---|---|---|---|---|---|---|---|
| arc_challenge | 1 | none | 0 | acc | 0.1962 | ± | 0.0116 |
| | | none | 0 | acc_norm | 0.2304 | ± | 0.0123 |
| gsm8k | 3 | flexible-extract | 5 | exact_match | 0.0144 | ± | 0.0033 |
| | | strict-match | 5 | exact_match | 0.0015 | ± | 0.0011 |
| hellaswag | 1 | none | 0 | acc | 0.2631 | ± | 0.0044 |
| | | none | 0 | acc_norm | 0.2758 | ± | 0.0045 |
| mmlu | 2 | none | | acc | 0.2473 | ± | 0.0036 |
| - humanities | 2 | none | | acc | 0.2351 | ± | 0.0062 |
| - formal_logic | 1 | none | 0 | acc | 0.2857 | ± | 0.0404 |
| - high_school_european_history | 1 | none | 0 | acc | 0.2667 | ± | 0.0345 |
| - high_school_us_history | 1 | none | 0 | acc | 0.2696 | ± | 0.0311 |
| - high_school_world_history | 1 | none | 0 | acc | 0.2110 | ± | 0.0266 |
| - international_law | 1 | none | 0 | acc | 0.1653 | ± | 0.0339 |
| - jurisprudence | 1 | none | 0 | acc | 0.2870 | ± | 0.0437 |
| - logical_fallacies | 1 | none | 0 | acc | 0.2331 | ± | 0.0332 |
| - moral_disputes | 1 | none | 0 | acc | 0.2283 | ± | 0.0226 |
| - moral_scenarios | 1 | none | 0 | acc | 0.2425 | ± | 0.0143 |
| - philosophy | 1 | none | 0 | acc | 0.2186 | ± | 0.0235 |
| - prehistory | 1 | none | 0 | acc | 0.2099 | ± | 0.0227 |
| - professional_law | 1 | none | 0 | acc | 0.2314 | ± | 0.0108 |
| - world_religions | 1 | none | 0 | acc | 0.2632 | ± | 0.0338 |
| - other | 2 | none | | acc | 0.2485 | ± | 0.0078 |
| - business_ethics | 1 | none | 0 | acc | 0.2600 | ± | 0.0441 |
| - clinical_knowledge | 1 | none | 0 | acc | 0.2528 | ± | 0.0267 |
| - college_medicine | 1 | none | 0 | acc | 0.2254 | ± | 0.0319 |
| - global_facts | 1 | none | 0 | acc | 0.2700 | ± | 0.0446 |
| - human_aging | 1 | none | 0 | acc | 0.2377 | ± | 0.0286 |
| - management | 1 | none | 0 | acc | 0.2816 | ± | 0.0445 |
| - marketing | 1 | none | 0 | acc | 0.2692 | ± | 0.0291 |
| - medical_genetics | 1 | none | 0 | acc | 0.2600 | ± | 0.0441 |
| - miscellaneous | 1 | none | 0 | acc | 0.2350 | ± | 0.0152 |
| - nutrition | 1 | none | 0 | acc | 0.2549 | ± | 0.0250 |
| - professional_accounting | 1 | none | 0 | acc | 0.2801 | ± | 0.0268 |
| - professional_medicine | 1 | none | 0 | acc | 0.2610 | ± | 0.0267 |
| - virology | 1 | none | 0 | acc | 0.1807 | ± | 0.0300 |
| - social sciences | 2 | none | | acc | 0.2658 | ± | 0.0080 |
| - econometrics | 1 | none | 0 | acc | 0.1930 | ± | 0.0371 |
| - high_school_geography | 1 | none | 0 | acc | 0.2172 | ± | 0.0294 |
| - high_school_government_and_politics | 1 | none | 0 | acc | 0.3212 | ± | 0.0337 |
| - high_school_macroeconomics | 1 | none | 0 | acc | 0.2923 | ± | 0.0231 |
| - high_school_microeconomics | 1 | none | 0 | acc | 0.3025 | ± | 0.0298 |
| - high_school_psychology | 1 | none | 0 | acc | 0.2752 | ± | 0.0191 |
| - human_sexuality | 1 | none | 0 | acc | 0.2290 | ± | 0.0369 |
| - professional_psychology | 1 | none | 0 | acc | 0.2386 | ± | 0.0172 |
| - public_relations | 1 | none | 0 | acc | 0.2636 | ± | 0.0422 |
| - security_studies | 1 | none | 0 | acc | 0.3143 | ± | 0.0297 |
| - sociology | 1 | none | 0 | acc | 0.2338 | ± | 0.0299 |
| - us_foreign_policy | 1 | none | 0 | acc | 0.2600 | ± | 0.0441 |
| - stem | 2 | none | | acc | 0.2464 | ± | 0.0077 |
| - abstract_algebra | 1 | none | 0 | acc | 0.2500 | ± | 0.0435 |
| - anatomy | 1 | none | 0 | acc | 0.2148 | ± | 0.0355 |
| - astronomy | 1 | none | 0 | acc | 0.1908 | ± | 0.0320 |
| - college_biology | 1 | none | 0 | acc | 0.2569 | ± | 0.0365 |
| - college_chemistry | 1 | none | 0 | acc | 0.2700 | ± | 0.0446 |
| - college_computer_science | 1 | none | 0 | acc | 0.3500 | ± | 0.0479 |
| - college_mathematics | 1 | none | 0 | acc | 0.2700 | ± | 0.0446 |
| - college_physics | 1 | none | 0 | acc | 0.2745 | ± | 0.0444 |
| - computer_security | 1 | none | 0 | acc | 0.3000 | ± | 0.0461 |
| - conceptual_physics | 1 | none | 0 | acc | 0.2766 | ± | 0.0292 |
| - electrical_engineering | 1 | none | 0 | acc | 0.2345 | ± | 0.0353 |
| - elementary_mathematics | 1 | none | 0 | acc | 0.2566 | ± | 0.0225 |
| - high_school_biology | 1 | none | 0 | acc | 0.2226 | ± | 0.0237 |
| - high_school_chemistry | 1 | none | 0 | acc | 0.2217 | ± | 0.0292 |
| - high_school_computer_science | 1 | none | 0 | acc | 0.2000 | ± | 0.0402 |
| - high_school_mathematics | 1 | none | 0 | acc | 0.2370 | ± | 0.0259 |
| - high_school_physics | 1 | none | 0 | acc | 0.2517 | ± | 0.0354 |
| - high_school_statistics | 1 | none | 0 | acc | 0.2685 | ± | 0.0302 |
| - machine_learning | 1 | none | 0 | acc | 0.1786 | ± | 0.0364 |
| truthfulqa_mc2 | 2 | none | 0 | acc | 0.4668 | ± | 0.0161 |
| winogrande | 1 | none | 0 | acc | 0.5012 | ± | 0.0141 |

| Groups | Version | Filter | n-shot | Metric | Value | | Stderr |
|---|---|---|---|---|---|---|---|
| mmlu | 2 | none | | acc | 0.2473 | ± | 0.0036 |
| - humanities | 2 | none | | acc | 0.2351 | ± | 0.0062 |
| - other | 2 | none | | acc | 0.2485 | ± | 0.0078 |
| - social sciences | 2 | none | | acc | 0.2658 | ± | 0.0080 |
| - stem | 2 | none | | acc | 0.2464 | ± | 0.0077 |
Broader evaluation suites (Open LLM Leaderboard, BIG-bench Hard, MMLU and MMLU-Pro, commonsense reasoning, multilingual, math, and long-context QA):

litgpt evaluate --tasks 'leaderboard' --out_dir 'evaluate-leaderboard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'bbh_zeroshot,bbh_fewshot,bbh_cot_fewshot,bbh_cot_zeroshot' --out_dir 'evaluate-bigbenchhard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'mmlu,mmlu_pro' --out_dir 'evaluate-mmlu/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'arc_challenge,boolq,gpqa,hellaswag,openbookqa,piqa,truthfulqa_mc2,winogrande' --out_dir 'evaluate-reasoning/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'mmlu_multilingual,mgsm' --out_dir 'evaluate-multilinguals/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'gsm8k,mathqa' --out_dir 'evaluate-math/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'qasper' --out_dir 'evaluate-long/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/