# Train
## Environment
```bash
cd scripts
python -m venv venv
source venv/bin/activate
pip install -U -r requirements.in
```
## Tokenizer
```bash
python -B train_tokenizer.py
```
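The script's output location is not documented here; as a quick, illustrative round-trip check (the `../tokenizer/tokenizer.json` path is a hypothetical placeholder, assuming the script saves a Hugging Face-style tokenizer file):

```python
from tokenizers import Tokenizer

# Hypothetical output path -- point this at wherever train_tokenizer.py saves its files.
tokenizer = Tokenizer.from_file('../tokenizer/tokenizer.json')

ids = tokenizer.encode('Hello, world!').ids
print(ids)
print(tokenizer.decode(ids))
```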
## Dataset
```bash
python -B prepare_pretrain_dataset.py
```
```python
from litdata import StreamingDataset, StreamingDataLoader, TokensLoader

# Sanity-check the prepared pretraining data: count the available
# (2048 + 1)-token blocks.
dataset = StreamingDataset(
    input_dir='../pretrain-data/',
    item_loader=TokensLoader(block_size=2048 + 1),
)

print(len(dataset))
```
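The imported `StreamingDataLoader` can also be used to iterate over the prepared blocks in batches; a minimal sketch (the batch size and worker count below are arbitrary):

```python
from litdata import StreamingDataset, StreamingDataLoader, TokensLoader

dataset = StreamingDataset(
    input_dir='../pretrain-data/',
    item_loader=TokensLoader(block_size=2048 + 1),
)

# Batches are tensors of token ids with shape (batch_size, 2048 + 1).
dataloader = StreamingDataLoader(dataset, batch_size=4, num_workers=2)

for batch in dataloader:
    print(batch.shape)
    break
```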
## Model
### Pretrain
```bash
litgpt pretrain --config ./pretrain-model.yaml
```
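Before converting, the final checkpoint can be sanity-checked with a short generation (assuming the installed litgpt version exposes the Python `LLM` API):

```python
from litgpt import LLM

# Load the final pretraining checkpoint and generate a short continuation.
llm = LLM.load('out/pretrain/final/')
print(llm.generate('Once upon a time', max_new_tokens=32))
```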
```bash
litgpt convert_from_litgpt out/pretrain/final/ out/converted_model
cp config.json out/pretrain/final/
cp config.json out/converted_model/
```
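The converted PyTorch state dict can then be re-saved in the safetensors format: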
```python
import torch
from safetensors.torch import save_file

# Load the converted PyTorch weights and re-save them as safetensors.
state_dict = torch.load('out/converted_model/model.pth', map_location='cpu')
save_file(state_dict, 'out/converted_model/model.safetensors')
```
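A quick check that the exported directory loads outside litgpt (assuming the architecture described in `config.json` is supported by the installed `transformers` version):

```python
from transformers import AutoModelForCausalLM

# from_pretrained picks up config.json and model.safetensors from the directory.
model = AutoModelForCausalLM.from_pretrained('out/converted_model/')
print(model.num_parameters())
```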
## Evaluate
```bash
litgpt evaluate --tasks 'hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge' --out_dir 'evaluate-quick/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
|---------------------------------------|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|arc_challenge | 1|none | 0|acc |↑ |0.1962|± |0.0116|
| | |none | 0|acc_norm |↑ |0.2304|± |0.0123|
|gsm8k | 3|flexible-extract| 5|exact_match|↑ |0.0144|± |0.0033|
| | |strict-match | 5|exact_match|↑ |0.0015|± |0.0011|
|hellaswag | 1|none | 0|acc |↑ |0.2631|± |0.0044|
| | |none | 0|acc_norm |↑ |0.2758|± |0.0045|
|mmlu | 2|none | |acc |↑ |0.2473|± |0.0036|
| - humanities | 2|none | |acc |↑ |0.2351|± |0.0062|
| - formal_logic | 1|none | 0|acc |↑ |0.2857|± |0.0404|
| - high_school_european_history | 1|none | 0|acc |↑ |0.2667|± |0.0345|
| - high_school_us_history | 1|none | 0|acc |↑ |0.2696|± |0.0311|
| - high_school_world_history | 1|none | 0|acc |↑ |0.2110|± |0.0266|
| - international_law | 1|none | 0|acc |↑ |0.1653|± |0.0339|
| - jurisprudence | 1|none | 0|acc |↑ |0.2870|± |0.0437|
| - logical_fallacies | 1|none | 0|acc |↑ |0.2331|± |0.0332|
| - moral_disputes | 1|none | 0|acc |↑ |0.2283|± |0.0226|
| - moral_scenarios | 1|none | 0|acc |↑ |0.2425|± |0.0143|
| - philosophy | 1|none | 0|acc |↑ |0.2186|± |0.0235|
| - prehistory | 1|none | 0|acc |↑ |0.2099|± |0.0227|
| - professional_law | 1|none | 0|acc |↑ |0.2314|± |0.0108|
| - world_religions | 1|none | 0|acc |↑ |0.2632|± |0.0338|
| - other | 2|none | |acc |↑ |0.2485|± |0.0078|
| - business_ethics | 1|none | 0|acc |↑ |0.2600|± |0.0441|
| - clinical_knowledge | 1|none | 0|acc |↑ |0.2528|± |0.0267|
| - college_medicine | 1|none | 0|acc |↑ |0.2254|± |0.0319|
| - global_facts | 1|none | 0|acc |↑ |0.2700|± |0.0446|
| - human_aging | 1|none | 0|acc |↑ |0.2377|± |0.0286|
| - management | 1|none | 0|acc |↑ |0.2816|± |0.0445|
| - marketing | 1|none | 0|acc |↑ |0.2692|± |0.0291|
| - medical_genetics | 1|none | 0|acc |↑ |0.2600|± |0.0441|
| - miscellaneous | 1|none | 0|acc |↑ |0.2350|± |0.0152|
| - nutrition | 1|none | 0|acc |↑ |0.2549|± |0.0250|
| - professional_accounting | 1|none | 0|acc |↑ |0.2801|± |0.0268|
| - professional_medicine | 1|none | 0|acc |↑ |0.2610|± |0.0267|
| - virology | 1|none | 0|acc |↑ |0.1807|± |0.0300|
| - social sciences | 2|none | |acc |↑ |0.2658|± |0.0080|
| - econometrics | 1|none | 0|acc |↑ |0.1930|± |0.0371|
| - high_school_geography | 1|none | 0|acc |↑ |0.2172|± |0.0294|
| - high_school_government_and_politics| 1|none | 0|acc |↑ |0.3212|± |0.0337|
| - high_school_macroeconomics | 1|none | 0|acc |↑ |0.2923|± |0.0231|
| - high_school_microeconomics | 1|none | 0|acc |↑ |0.3025|± |0.0298|
| - high_school_psychology | 1|none | 0|acc |↑ |0.2752|± |0.0191|
| - human_sexuality | 1|none | 0|acc |↑ |0.2290|± |0.0369|
| - professional_psychology | 1|none | 0|acc |↑ |0.2386|± |0.0172|
| - public_relations | 1|none | 0|acc |↑ |0.2636|± |0.0422|
| - security_studies | 1|none | 0|acc |↑ |0.3143|± |0.0297|
| - sociology | 1|none | 0|acc |↑ |0.2338|± |0.0299|
| - us_foreign_policy | 1|none | 0|acc |↑ |0.2600|± |0.0441|
| - stem | 2|none | |acc |↑ |0.2464|± |0.0077|
| - abstract_algebra | 1|none | 0|acc |↑ |0.2500|± |0.0435|
| - anatomy | 1|none | 0|acc |↑ |0.2148|± |0.0355|
| - astronomy | 1|none | 0|acc |↑ |0.1908|± |0.0320|
| - college_biology | 1|none | 0|acc |↑ |0.2569|± |0.0365|
| - college_chemistry | 1|none | 0|acc |↑ |0.2700|± |0.0446|
| - college_computer_science | 1|none | 0|acc |↑ |0.3500|± |0.0479|
| - college_mathematics | 1|none | 0|acc |↑ |0.2700|± |0.0446|
| - college_physics | 1|none | 0|acc |↑ |0.2745|± |0.0444|
| - computer_security | 1|none | 0|acc |↑ |0.3000|± |0.0461|
| - conceptual_physics | 1|none | 0|acc |↑ |0.2766|± |0.0292|
| - electrical_engineering | 1|none | 0|acc |↑ |0.2345|± |0.0353|
| - elementary_mathematics | 1|none | 0|acc |↑ |0.2566|± |0.0225|
| - high_school_biology | 1|none | 0|acc |↑ |0.2226|± |0.0237|
| - high_school_chemistry | 1|none | 0|acc |↑ |0.2217|± |0.0292|
| - high_school_computer_science | 1|none | 0|acc |↑ |0.2000|± |0.0402|
| - high_school_mathematics | 1|none | 0|acc |↑ |0.2370|± |0.0259|
| - high_school_physics | 1|none | 0|acc |↑ |0.2517|± |0.0354|
| - high_school_statistics | 1|none | 0|acc |↑ |0.2685|± |0.0302|
| - machine_learning | 1|none | 0|acc |↑ |0.1786|± |0.0364|
|truthfulqa_mc2 | 2|none | 0|acc |↑ |0.4668|± |0.0161|
|winogrande                             |      1|none            |     0|acc        |↑  |0.5012|±  |0.0141|

| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
|------------------|------:|------|------|------|---|-----:|---|-----:|
|mmlu | 2|none | |acc |↑ |0.2473|± |0.0036|
| - humanities | 2|none | |acc |↑ |0.2351|± |0.0062|
| - other | 2|none | |acc |↑ |0.2485|± |0.0078|
| - social sciences| 2|none | |acc |↑ |0.2658|± |0.0080|
| - stem | 2|none | |acc |↑ |0.2464|± |0.0077|
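Further task suites can be evaluated with the same command, one output directory per suite: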
```bash
litgpt evaluate --tasks 'leaderboard' --out_dir 'evaluate-leaderboard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
```bash
litgpt evaluate --tasks 'bbh_zeroshot,bbh_fewshot,bbh_cot_fewshot,bbh_cot_zeroshot' --out_dir 'evaluate-bigbenchhard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
```bash
litgpt evaluate --tasks 'mmlu,mmlu_pro' --out_dir 'evaluate-mmlu/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
```bash
litgpt evaluate --tasks 'arc_challenge,boolq,gpqa,hellaswag,openbookqa,piqa,truthfulqa_mc2,winogrande' --out_dir 'evaluate-reasoning/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
```bash
litgpt evaluate --tasks 'mmlu_multilingual,mgsm' --out_dir 'evaluate-multilinguals/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
```bash
litgpt evaluate --tasks 'gsm8k,mathqa' --out_dir 'evaluate-math/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
```bash
litgpt evaluate --tasks 'qasper' --out_dir 'evaluate-long/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
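To compare runs, the per-suite metrics can be collected from the output directories; a sketch, assuming each `--out_dir` ends up containing a `results.json` in the lm-evaluation-harness format (the exact file name and layout depend on the installed litgpt/lm-eval versions):

```python
import json
from pathlib import Path

# Print the headline metrics from every evaluation output directory.
for results_file in sorted(Path('.').glob('evaluate-*/results.json')):
    data = json.loads(results_file.read_text())
    for task, metrics in data.get('results', {}).items():
        print(results_file.parent.name, task, metrics)
```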