
Train

Environment

cd scripts
python -m venv venv
source venv/bin/activate
pip install -U -r requirements.in
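
requirements.in itself is not shown here; a plausible minimal set, inferred from the tools used in the rest of this walkthrough (exact contents and pins may differ):

# requirements.in (sketch; the actual file may differ)
litgpt[all]
litdata
tokenizers
transformers
safetensors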

Tokenizer

python -B train_tokenizer.py
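
train_tokenizer.py is not reproduced here; a minimal sketch of training a byte-level BPE tokenizer with the Hugging Face tokenizers library (vocabulary size, special tokens, and file paths are illustrative assumptions):

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

# byte-level BPE; vocab size and special tokens are assumptions
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
trainer = BpeTrainer(vocab_size=32768, special_tokens=['<|endoftext|>'])

# train on a placeholder corpus and save the result for the later steps
tokenizer.train(['corpus.txt'], trainer)
tokenizer.save('../tokenizer.json')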

Dataset

python -B prepare_pretrain_dataset.py
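
prepare_pretrain_dataset.py is likewise not reproduced; a hedged sketch of how such a script typically packs token ids with litdata's optimize (input files, tokenizer path, and chunk size are assumptions, and the exact optimize arguments vary across litdata versions):

from functools import partial

import torch
from litdata import optimize, TokensLoader
from tokenizers import Tokenizer

def tokenize_fn(path, tokenizer=None):
    # yield one tensor of token ids per input file; litdata packs
    # consecutive tensors into fixed-size chunks on disk
    with open(path, encoding='utf-8') as f:
        text = f.read()
    yield torch.tensor(tokenizer.encode(text).ids, dtype=torch.long)

if __name__ == '__main__':
    tokenizer = Tokenizer.from_file('../tokenizer.json')  # assumed path
    optimize(
        fn=partial(tokenize_fn, tokenizer=tokenizer),
        inputs=['corpus.txt'],  # placeholder input list
        output_dir='../pretrain-data/',
        chunk_size=(2048 + 1) * 1024,  # tokens per chunk; a multiple of the block size
        item_loader=TokensLoader(),
    )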
To sanity-check the prepared dataset, count the available training blocks:

from litdata import StreamingDataset, TokensLoader

dataset = StreamingDataset(
  input_dir='../pretrain-data/',
  # one extra token per block, so each block yields 2048 inputs
  # plus 2048 shifted next-token targets
  item_loader=TokensLoader(block_size=2048 + 1),
)

print(len(dataset))
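
The same dataset pairs with StreamingDataLoader for batched iteration; a quick sketch (batch size and worker count are arbitrary):

from litdata import StreamingDataset, StreamingDataLoader, TokensLoader

dataset = StreamingDataset(
  input_dir='../pretrain-data/',
  item_loader=TokensLoader(block_size=2048 + 1),
)
loader = StreamingDataLoader(dataset, batch_size=4, num_workers=2)

for batch in loader:
  print(batch.shape)  # (4, 2049) tensor of token ids
  break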

Model

Pretrain

Pretrain the model:

litgpt pretrain --config ./pretrain-model.yaml
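
pretrain-model.yaml is not reproduced in this section; a rough sketch of its likely shape, assuming litgpt's standard pretrain config fields (every value below is an illustrative assumption, not the model's actual setting):

# sketch only; the real settings live in ./pretrain-model.yaml
model_config:
  block_size: 2048          # assumed: matches the TokensLoader block size above
  vocab_size: 32768         # assumed tokenizer vocabulary size
  n_layer: 12               # assumed depth
  n_head: 12                # assumed attention heads
  n_embd: 768               # assumed hidden size
out_dir: out/pretrain
precision: bf16-mixed
tokenizer_dir: ../          # assumed location of tokenizer.json
data:
  class_path: litgpt.data.LitData
  init_args:
    data_path: ../pretrain-data/
train:
  global_batch_size: 512    # assumed
  micro_batch_size: 4       # assumed
  max_seq_length: 2048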
Convert the checkpoint to a Hugging Face transformers layout, then place the transformers-style config.json next to both copies of the weights:

litgpt convert_from_litgpt out/pretrain/final/ out/converted_model
cp config.json out/pretrain/final/
cp config.json out/converted_model/
Finally, re-save the converted PyTorch state dict as safetensors:

import torch
from safetensors.torch import save_file

# load the converted checkpoint on CPU and write it back out as safetensors
state_dict = torch.load('out/converted_model/model.pth', map_location='cpu')
save_file(state_dict, 'out/converted_model/model.safetensors')
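
With config.json and model.safetensors in place, the checkpoint should load directly with transformers; a quick hedged check (assumes the architecture named in config.json is one transformers supports):

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('out/converted_model/')
print(sum(p.numel() for p in model.parameters()))  # parameter count sanity check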

Evaluate

A quick first pass over six common benchmarks:

litgpt evaluate --tasks 'hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge' --out_dir 'evaluate-quick/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
| Tasks | Version | Filter | n-shot | Metric | Value | | Stderr |
|---|---|---|---|---|---|---|---|
| arc_challenge | 1 | none | 0 | acc | 0.1962 | ± | 0.0116 |
| | | none | 0 | acc_norm | 0.2304 | ± | 0.0123 |
| gsm8k | 3 | flexible-extract | 5 | exact_match | 0.0144 | ± | 0.0033 |
| | | strict-match | 5 | exact_match | 0.0015 | ± | 0.0011 |
| hellaswag | 1 | none | 0 | acc | 0.2631 | ± | 0.0044 |
| | | none | 0 | acc_norm | 0.2758 | ± | 0.0045 |
| mmlu | 2 | none | | acc | 0.2473 | ± | 0.0036 |
| - humanities | 2 | none | | acc | 0.2351 | ± | 0.0062 |
| - formal_logic | 1 | none | 0 | acc | 0.2857 | ± | 0.0404 |
| - high_school_european_history | 1 | none | 0 | acc | 0.2667 | ± | 0.0345 |
| - high_school_us_history | 1 | none | 0 | acc | 0.2696 | ± | 0.0311 |
| - high_school_world_history | 1 | none | 0 | acc | 0.2110 | ± | 0.0266 |
| - international_law | 1 | none | 0 | acc | 0.1653 | ± | 0.0339 |
| - jurisprudence | 1 | none | 0 | acc | 0.2870 | ± | 0.0437 |
| - logical_fallacies | 1 | none | 0 | acc | 0.2331 | ± | 0.0332 |
| - moral_disputes | 1 | none | 0 | acc | 0.2283 | ± | 0.0226 |
| - moral_scenarios | 1 | none | 0 | acc | 0.2425 | ± | 0.0143 |
| - philosophy | 1 | none | 0 | acc | 0.2186 | ± | 0.0235 |
| - prehistory | 1 | none | 0 | acc | 0.2099 | ± | 0.0227 |
| - professional_law | 1 | none | 0 | acc | 0.2314 | ± | 0.0108 |
| - world_religions | 1 | none | 0 | acc | 0.2632 | ± | 0.0338 |
| - other | 2 | none | | acc | 0.2485 | ± | 0.0078 |
| - business_ethics | 1 | none | 0 | acc | 0.2600 | ± | 0.0441 |
| - clinical_knowledge | 1 | none | 0 | acc | 0.2528 | ± | 0.0267 |
| - college_medicine | 1 | none | 0 | acc | 0.2254 | ± | 0.0319 |
| - global_facts | 1 | none | 0 | acc | 0.2700 | ± | 0.0446 |
| - human_aging | 1 | none | 0 | acc | 0.2377 | ± | 0.0286 |
| - management | 1 | none | 0 | acc | 0.2816 | ± | 0.0445 |
| - marketing | 1 | none | 0 | acc | 0.2692 | ± | 0.0291 |
| - medical_genetics | 1 | none | 0 | acc | 0.2600 | ± | 0.0441 |
| - miscellaneous | 1 | none | 0 | acc | 0.2350 | ± | 0.0152 |
| - nutrition | 1 | none | 0 | acc | 0.2549 | ± | 0.0250 |
| - professional_accounting | 1 | none | 0 | acc | 0.2801 | ± | 0.0268 |
| - professional_medicine | 1 | none | 0 | acc | 0.2610 | ± | 0.0267 |
| - virology | 1 | none | 0 | acc | 0.1807 | ± | 0.0300 |
| - social sciences | 2 | none | | acc | 0.2658 | ± | 0.0080 |
| - econometrics | 1 | none | 0 | acc | 0.1930 | ± | 0.0371 |
| - high_school_geography | 1 | none | 0 | acc | 0.2172 | ± | 0.0294 |
| - high_school_government_and_politics | 1 | none | 0 | acc | 0.3212 | ± | 0.0337 |
| - high_school_macroeconomics | 1 | none | 0 | acc | 0.2923 | ± | 0.0231 |
| - high_school_microeconomics | 1 | none | 0 | acc | 0.3025 | ± | 0.0298 |
| - high_school_psychology | 1 | none | 0 | acc | 0.2752 | ± | 0.0191 |
| - human_sexuality | 1 | none | 0 | acc | 0.2290 | ± | 0.0369 |
| - professional_psychology | 1 | none | 0 | acc | 0.2386 | ± | 0.0172 |
| - public_relations | 1 | none | 0 | acc | 0.2636 | ± | 0.0422 |
| - security_studies | 1 | none | 0 | acc | 0.3143 | ± | 0.0297 |
| - sociology | 1 | none | 0 | acc | 0.2338 | ± | 0.0299 |
| - us_foreign_policy | 1 | none | 0 | acc | 0.2600 | ± | 0.0441 |
| - stem | 2 | none | | acc | 0.2464 | ± | 0.0077 |
| - abstract_algebra | 1 | none | 0 | acc | 0.2500 | ± | 0.0435 |
| - anatomy | 1 | none | 0 | acc | 0.2148 | ± | 0.0355 |
| - astronomy | 1 | none | 0 | acc | 0.1908 | ± | 0.0320 |
| - college_biology | 1 | none | 0 | acc | 0.2569 | ± | 0.0365 |
| - college_chemistry | 1 | none | 0 | acc | 0.2700 | ± | 0.0446 |
| - college_computer_science | 1 | none | 0 | acc | 0.3500 | ± | 0.0479 |
| - college_mathematics | 1 | none | 0 | acc | 0.2700 | ± | 0.0446 |
| - college_physics | 1 | none | 0 | acc | 0.2745 | ± | 0.0444 |
| - computer_security | 1 | none | 0 | acc | 0.3000 | ± | 0.0461 |
| - conceptual_physics | 1 | none | 0 | acc | 0.2766 | ± | 0.0292 |
| - electrical_engineering | 1 | none | 0 | acc | 0.2345 | ± | 0.0353 |
| - elementary_mathematics | 1 | none | 0 | acc | 0.2566 | ± | 0.0225 |
| - high_school_biology | 1 | none | 0 | acc | 0.2226 | ± | 0.0237 |
| - high_school_chemistry | 1 | none | 0 | acc | 0.2217 | ± | 0.0292 |
| - high_school_computer_science | 1 | none | 0 | acc | 0.2000 | ± | 0.0402 |
| - high_school_mathematics | 1 | none | 0 | acc | 0.2370 | ± | 0.0259 |
| - high_school_physics | 1 | none | 0 | acc | 0.2517 | ± | 0.0354 |
| - high_school_statistics | 1 | none | 0 | acc | 0.2685 | ± | 0.0302 |
| - machine_learning | 1 | none | 0 | acc | 0.1786 | ± | 0.0364 |
| truthfulqa_mc2 | 2 | none | 0 | acc | 0.4668 | ± | 0.0161 |
| winogrande | 1 | none | 0 | acc | 0.5012 | ± | 0.0141 |

| Groups | Version | Filter | n-shot | Metric | Value | | Stderr |
|---|---|---|---|---|---|---|---|
| mmlu | 2 | none | | acc | 0.2473 | ± | 0.0036 |
| - humanities | 2 | none | | acc | 0.2351 | ± | 0.0062 |
| - other | 2 | none | | acc | 0.2485 | ± | 0.0078 |
| - social sciences | 2 | none | | acc | 0.2658 | ± | 0.0080 |
| - stem | 2 | none | | acc | 0.2464 | ± | 0.0077 |
Broader evaluation suites (Open LLM Leaderboard, BIG-bench Hard, MMLU and MMLU-Pro, commonsense reasoning, multilingual, math, and long-context QA):

litgpt evaluate --tasks 'leaderboard' --out_dir 'evaluate-leaderboard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'bbh_zeroshot,bbh_fewshot,bbh_cot_fewshot,bbh_cot_zeroshot' --out_dir 'evaluate-bigbenchhard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'mmlu,mmlu_pro' --out_dir 'evaluate-mmlu/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'arc_challenge,boolq,gpqa,hellaswag,openbookqa,piqa,truthfulqa_mc2,winogrande' --out_dir 'evaluate-reasoning/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'mmlu_multilingual,mgsm' --out_dir 'evaluate-multilinguals/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'gsm8k,mathqa' --out_dir 'evaluate-math/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'qasper' --out_dir 'evaluate-long/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/