mtasic85 committed
Commit 193a28c · 1 Parent(s): 8432ba4

pretrain model

added_tokens.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "<|endoftext|>": 32000,
+   "<|assistant|>": 32001,
+   "<|placeholder1|>": 32002,
+   "<|placeholder2|>": 32003,
+   "<|placeholder3|>": 32004,
+   "<|placeholder4|>": 32005,
+   "<|system|>": 32006,
+   "<|end|>": 32007,
+   "<|placeholder5|>": 32008,
+   "<|placeholder6|>": 32009,
+   "<|user|>": 32010,
+   "<tools>": 32011,
+   "</tools>": 32012,
+   "<tool_call>": 32013,
+   "</tool_call>": 32014,
+   "<tool_response>": 32015,
+   "</tool_response>": 32016,
+   "<think>": 32017,
+   "</think>": 32018
+ }
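These special tokens extend a 32,000-token base vocabulary with chat, tool-calling, and reasoning markers (the pretrain config below pads the vocabulary to 32,064). A minimal sketch to verify they resolve to the expected IDs, assuming a local checkout of this repo:

```python
# Sketch only; assumes this repository is checked out at '.'.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('.', trust_remote_code=True, use_fast=True)

# Each added token should map back to its ID from added_tokens.json.
for token in ('<|endoftext|>', '<|user|>', '<think>', '</think>'):
    print(token, '->', tokenizer.convert_tokens_to_ids(token))  # e.g. <think> -> 32017
```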
misc/logo.jpg ADDED
scripts/backup/base_instruct_datasets.py ADDED
@@ -0,0 +1,124 @@
+ roles_map = {
+     'system': 'system',
+     'user': 'user',
+     'human': 'user',
+     'assistant': 'assistant',
+     'gpt': 'assistant',
+     'AI': 'assistant',
+ }
+
+
+ core_instruct_datasets = [
+     #
+     # general instructs
+     #
+     # 1.48 GB, 1,420,909
+     # mlabonne/open-perfectblend
+     #     meta-math/MetaMathQA 395,000
+     #     openbmb/UltraInteract_sft 288,579
+     #     HuggingFaceH4/ultrachat_200k 207,865
+     #     microsoft/orca-math-word-problems-200k 200,035
+     #     HuggingFaceH4/ultrafeedback_binarized 187,405
+     #     theblackcat102/evol-codealpaca-v1 111,272
+     #     Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
+     #     mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
+     *[
+         {'kind': 'instruct', 'path': 'mlabonne/open-perfectblend', 'split': f'train[{i}%:{i + 10}%]', 'field': 'conversations', 'transform': lambda msgs: [
+             {'role': roles_map[m['from']], 'content': m['value']}
+             for m in msgs
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 1.41 GB, 939,343
+     # allenai/tulu-3-sft-mixture
+     #     CoCoNot (ODC-BY-1.0), 10,983 prompts (Brahman et al., 2024)
+     #     FLAN v2 via ai2-adapt-dev/flan_v2_converted, 89,982 prompts (Longpre et al., 2023)
+     #     No Robots (CC-BY-NC-4.0), 9,500 prompts (Rajani et al. 2023)
+     #     OpenAssistant Guanaco (Apache 2.0), 7,132 prompts (Kopf et al., 2024)
+     #     Tulu 3 Persona MATH (ODC-BY-1.0), 149,960 prompts
+     #     Tulu 3 Persona GSM (ODC-BY-1.0), 49,980 prompts
+     #     Tulu 3 Persona Python (ODC-BY-1.0), 34,999 prompts
+     #     Tulu 3 Persona Algebra (ODC-BY-1.0), 20,000 prompts
+     #     Tulu 3 Persona IF (ODC-BY-1.0), 29,980 prompts
+     #     NuminaMath-TIR (Apache 2.0), 64,312 prompts (Beeching et al. 2024)
+     #     Tulu 3 WildGuardMix (Apache 2.0), 50,000 prompts (Han et al., 2024)
+     #     Tulu 3 WildJailbreak (ODC-BY-1.0), 50,000 prompts (Wildteaming, 2024)
+     #     Tulu 3 Hardcoded (CC-BY-4.0), 240 prompts
+     #     Aya (Apache 2.0), 100,000 prompts (Singh et al., 2024)
+     #     WildChat GPT-4 (ODC-BY-1.0), 100,000 prompts (Zhao et al., 2024)
+     #     TableGPT (MIT), 5,000 prompts (Zha et al., 2023)
+     #     SciRIFF (ODC-BY-1.0), 10,000 prompts (Wadden et al., 2024)
+     #     Evol CodeAlpaca (Apache 2.0), 107,276 prompts (Luo et al., 2023)
+     *[
+         {'kind': 'instruct', 'path': 'allenai/tulu-3-sft-mixture', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # multilingual instructs
+     #
+     # 2.48 GB, 5,808,694
+     # rombodawg/Everything_Instruct_Multilingual
+     #     Science:
+     #         antiven0m/physical-reasoning-dpoScience
+     #         LawalAfeez/science-dataset
+     #     Social media:
+     #         Kyle1668/AG-Tweets
+     #         euclaise/reddit-instruct-curated
+     #     General Knowledge:
+     #         NousResearch/CharacterCodex_Characters
+     #         jstet/quotes-500k_Famous_Quotes
+     #         FronkonGames/steam-games-dataset_Video_Games
+     #         totuta_youtube_subs_howto100M_HowTo
+     #     Multi-lingual:
+     #         Amani27/massive_translation_dataset
+     #         udmurtNLP/udmurt-russian-english-labse
+     #         grosenthal/latin_english
+     #         msarmi9/korean-english-multitarget-ted-talks-task
+     #         HaiderSultanArc/MT-Urdu-English_Translate
+     #         Garsa3112/ChineseEnglishTranslationDataset
+     #     Cooking:
+     #         andrewsiah/se_cooking_preference_sft
+     #         Hieu-Phamkaggle/food_recipes
+     #     Writing:
+     #         shahules786/PoetryFoundationData
+     #         euclaise/writingprompts
+     #         qwedsacf/ivypanda-essaysEssay
+     #     Medicine:
+     #         keivalya/MedQuad-MedicalQnADataset
+     #         nuvocare/MSD
+     #     History:
+     #         ambrosfitz10k/history_data_v4
+     #     Law:
+     #         dzunggg/legal-qa-v1
+     #     Role-Play:
+     #         roleplay4/fun_CoupleRP
+     #         Undi95andrijdavid/roleplay-conversation-sharegpt
+     #     News:
+     #         RealTimeData/bbc_news_alltime
+     #     Coding: (rombodawg/code_bagel)
+     #         layoric/tiny-codes-alpaca
+     #         glaiveai/glaive-code-assistant-v3
+     #         ajibawa-2023/Code-290k-ShareGPT
+     #         chargoddard/commitpack-ft-instruct-rated
+     #         iamtarun/code_instructions_120k_alpaca
+     #         ise-uiuc/Magicoder-Evol-Instruct-110K
+     #         cognitivecomputations/dolphin-coder
+     #         nickrosh/Evol-Instruct-Code-80k-v1
+     #         coseal/CodeUltraFeedback_binarized
+     #         CyberNative/Code_Vulnerability_Security_DPO
+     #     Math: (rombodawg/code_bagel)
+     #         TIGER-Lab/MathInstruct
+     #     Function calling: (rombodawg/code_bagel)
+     #         glaiveai/glaive-function-calling-v2
+     #     General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
+     #         teknium/OpenHermes-2.5
+     *[
+         {'kind': 'instruct', 'path': 'rombodawg/Everything_Instruct_Multilingual', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             {'role': 'system', 'content': r['instruction']},
+             {'role': 'user', 'content': r['input']},
+             {'role': 'assistant', 'content': r['output']},
+         ]}
+         for i in range(0, 100, 10)
+     ],
+ ]
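Every entry above is a plain config dict: path/split/field address the raw rows, and transform normalizes ShareGPT-style `{'from', 'value'}` turns into `{'role', 'content'}` messages via roles_map. A minimal sketch of how one expanded 10% slice resolves, assuming only the datasets library:

```python
# Sketch of how one expanded entry is consumed; 'train[0%:10%]' is standard
# Hugging Face percent-based split slicing.
from datasets import load_dataset

config = {'path': 'mlabonne/open-perfectblend', 'split': 'train[0%:10%]', 'field': 'conversations'}
dataset = load_dataset(path=config['path'], split=config['split'])

msgs = dataset[0][config['field']]  # ShareGPT-style [{'from': ..., 'value': ...}, ...]
roles_map = {'system': 'system', 'human': 'user', 'gpt': 'assistant'}  # subset for illustration
print([{'role': roles_map[m['from']], 'content': m['value'][:40]} for m in msgs])
```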
scripts/backup/base_reason_datasets.py ADDED
@@ -0,0 +1,79 @@
+ roles_map = {
+     'system': 'system',
+     'user': 'user',
+     'human': 'user',
+     'assistant': 'assistant',
+     'gpt': 'assistant',
+     'AI': 'assistant',
+ }
+
+ R1_SYSTEM_PROMPT = '''\
+ You are an AI assistant.
+
+ Your primary directive is to provide well-reasoned, structured, and extensively detailed responses.
+
+ Formatting Requirements:
+ - Always structure your replies using: <think>{reasoning}</think>{answer}
+ - The <think></think> block should contain at least six reasoning steps when applicable.
+ - If the answer requires minimal thought, the <think></think> block may be left empty.
+ - The user does not see the <think></think> section. Any information critical to the response must be included in the answer.
+ - If you notice that you have engaged in circular reasoning or repetition, immediately terminate {reasoning} with a </think> and proceed to the {answer}.
+
+ Response Guidelines:
+ - Detailed and Structured: Use rich Markdown formatting for clarity and readability.
+ - Scientific and Logical Approach: Your explanations should reflect the depth and precision of the greatest scientific minds.
+ - Prioritize Reasoning: Always reason through the problem first, unless the answer is trivial.
+ - Concise yet Complete: Ensure responses are informative, yet to the point without unnecessary elaboration.
+ - Maintain a professional, intelligent, and analytical tone in all interactions.'''
+
+ core_reason_datasets = [
+     #
+     # math reason
+     #
+     # 8.43 GB, 450,258
+     *[
+         {'kind': 'instruct', 'path': 'open-r1/OpenR1-Math-220k', 'data_dir': 'data', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages', 'transform': lambda msgs: [
+             {'role': roles_map[m['from']], 'content': m['value']}
+             for m in msgs
+         ]}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # general reason
+     #
+     # 3.55 GB, 227,914
+     *[
+         {'kind': 'instruct', 'path': 'open-thoughts/OpenThoughts-114k', 'data_dir': 'data', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             {'role': 'system', 'content': r['system']}
+         ] + [
+             {'role': roles_map[m['from']], 'content': m['value']}
+             for m in r['conversations']
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 3.98 GB, 814,334
+     # 300k
+     *[
+         {'kind': 'instruct', 'path': 'cognitivecomputations/dolphin-r1', 'data_files': 'dolphin-r1-reasoning-deepseek.jsonl', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             {'role': 'system', 'content': R1_SYSTEM_PROMPT},
+             *r['messages'],
+             {'role': 'assistant', 'content': '<think>\n' + (r.get('reasoning') or '') + '\n</think>\n' + (r.get('answer') or '')},
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 300k
+     *[
+         {'kind': 'instruct', 'path': 'cognitivecomputations/dolphin-r1', 'data_files': 'dolphin-r1-reasoning-flash.jsonl', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             {'role': 'system', 'content': R1_SYSTEM_PROMPT},
+             *r['messages'],
+             {'role': 'assistant', 'content': '<think>\n' + (r.get('reasoning') or '') + '\n</think>\n' + (r.get('answer') or '')},
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 21.1 MB, 1,000
+     {'kind': 'instruct', 'path': 'simplescaling/s1K-1.1', 'split': 'train', 'transform': lambda r: [
+         {'role': 'user', 'content': r.get('question') or ''},
+         {'role': 'assistant', 'content': '<think>\n' + (r.get('deepseek_thinking_trajectory') or '') + '\n</think>\n' + (r.get('solution') or '')},
+     ]}
+ ]
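The dolphin-r1 entries rebuild the assistant turn so the reasoning trace sits inside the `<think>` tags registered in added_tokens.json. A self-contained illustration of that rebuild (the sample row is invented):

```python
# The assistant-turn rebuild used for the dolphin-r1 entries above; the row is made up.
transform = lambda r: {
    'role': 'assistant',
    'content': '<think>\n' + (r.get('reasoning') or '') + '\n</think>\n' + (r.get('answer') or ''),
}

row = {'reasoning': 'Two plus two is four.', 'answer': '4'}
print(transform(row)['content'])
# <think>
# Two plus two is four.
# </think>
# 4
```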
scripts/backup/cpt_base_datasets.py ADDED
@@ -0,0 +1,58 @@
+ cpt_base_datasets = [
+     #
+     # stem
+     #
+     # 1.44 GB, 63,357
+     {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': 'train', 'format': lambda n: n['abstract']},
+     {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': 'train', 'format': lambda n: n['markdown']},
+
+     #
+     # code
+     #
+     # 1.62 GB, 1,632,309
+     # Python, TypeScript, JavaScript, Ruby, Julia, Rust, C++, Bash, Java, C#, and Go; SQL, Cypher
+     {'kind': 'base', 'path': 'nampdn-ai/tiny-codes', 'split': 'train', 'format': '{prompt} {response}'},
+
+     #
+     # misc
+     #
+     # 472 KB, 5,034
+     {'kind': 'base', 'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},
+
+     #
+     # multilingual
+     #
+     # 742 MB, 321,697
+     *[
+         {'kind': 'base', 'path': 'data-silence/sumnews', 'split': split, 'format': lambda n: n[field]}
+         for split in ['train', 'test']
+         for field in ['title', 'resume', 'news']
+     ],
+     # 193 MB, 1,141,967
+     *[
+         {'kind': 'base', 'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train', 'format': lambda n: n['text']}
+         for name in [
+             'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
+             'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
+             'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
+             'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
+             'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
+             'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
+             'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
+             'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
+             'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
+             'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
+             'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
+             'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
+             'zh-Hans', 'zh-Hant', 'zu',
+         ]
+     ],
+
+     #
+     # general knowledge
+     #
+     # 3.18 GB, 1,010,500 - uncompressed 6GB
+     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'train', 'format': lambda n: n['text']},
+     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
+     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
+ ]
scripts/backup/cpt_base_model.py ADDED
@@ -0,0 +1,101 @@
+ from unsloth import FastLanguageModel
+ import torch
+ from transformers import AutoTokenizer
+
+ max_seq_length = 4096
+ dtype = torch.bfloat16
+ load_in_4bit = True
+ model_name = '../out/pretrain-base'
+ output_dir = '../out/cpt-base'
+
+ model, tokenizer = FastLanguageModel.from_pretrained(
+     model_name=model_name,
+     max_seq_length=max_seq_length,
+     dtype=dtype,
+     load_in_4bit=load_in_4bit,
+ )
+
+ print('Ignoring the tokenizer loaded by FastLanguageModel.from_pretrained; using AutoTokenizer.from_pretrained instead')
+ tokenizer = AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)
+
+ print(f'{model=}')
+ print(f'{tokenizer=}')
+
+ model = FastLanguageModel.get_peft_model(
+     model,
+     r=64,  # 128, # Choose any number > 0! Suggested: 8, 16, 32, 64, 128
+     target_modules=[
+         "q_proj", "k_proj", "v_proj", "o_proj",
+         "gate_proj", "up_proj", "down_proj",
+         "embed_tokens", "lm_head",
+     ],  # Add embed_tokens and lm_head for continual pretraining
+     lora_alpha=16,
+     lora_dropout=0,  # Supports any, but = 0 is optimized
+     bias='none',  # Supports any, but = "none" is optimized
+     # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+     use_gradient_checkpointing='unsloth',  # True or "unsloth" for very long context
+     random_state=23,
+     use_rslora=True,  # We support rank stabilized LoRA
+     loftq_config=None,  # And LoftQ
+ )
+
+ print(f'{model=}')
+
+ from datasets import concatenate_datasets
+ from cpt_base_datasets import cpt_base_datasets
+ from cpt_instruct_datasets import cpt_instruct_datasets
+ from unsloth_utils import load_text_dataset, load_chat_dataset
+
+ core_datasets = []
+
+ for dataset_config in cpt_base_datasets:
+     dataset = load_text_dataset(tokenizer, **dataset_config)
+     print(f'{dataset=}')
+     core_datasets.append(dataset)
+
+ # for dataset_config in cpt_instruct_datasets:
+ #     dataset = load_chat_dataset(tokenizer, **dataset_config)
+ #     print(f'{dataset=}')
+ #     core_datasets.append(dataset)
+
+ final_dataset = concatenate_datasets(core_datasets)
+ print(f'{final_dataset=}')
+
+
+ from trl import SFTTrainer
+ from transformers import TrainingArguments
+ from unsloth import is_bfloat16_supported
+ from unsloth import UnslothTrainer, UnslothTrainingArguments
+
+
+ trainer = UnslothTrainer(
+     model=model,
+     tokenizer=tokenizer,
+     train_dataset=final_dataset,
+     dataset_text_field='text',
+     max_seq_length=max_seq_length,
+     dataset_num_proc=32,
+
+     args=UnslothTrainingArguments(
+         per_device_train_batch_size=8,
+         gradient_accumulation_steps=8,
+
+         warmup_ratio=0.1,
+         num_train_epochs=1,
+
+         learning_rate=5e-5,
+         embedding_learning_rate=5e-6,
+
+         fp16=not is_bfloat16_supported(),
+         bf16=is_bfloat16_supported(),
+         logging_steps=1,
+         optim='adamw_8bit',
+         weight_decay=0.01,
+         lr_scheduler_type='cosine',
+         seed=23,
+         output_dir=output_dir,
+         report_to='wandb',
+     ),
+ )
+
+ trainer_stats = trainer.train()
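With per_device_train_batch_size=8 and gradient_accumulation_steps=8, each optimizer step sees an effective batch of 64 sequences (up to 64 × 4096 = 262,144 tokens) per device. What happens after trainer.train() is not shown in this script; a hedged sketch of persisting the adapter with standard PEFT-style save calls:

```python
# Sketch only; the save path is illustrative. save_pretrained on a PEFT model
# writes just the LoRA adapter weights, and the tokenizer is saved beside it.
model.save_pretrained(f'{output_dir}/lora-adapter')
tokenizer.save_pretrained(f'{output_dir}/lora-adapter')
```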
scripts/backup/cpt_instruct_datasets.py ADDED
@@ -0,0 +1,119 @@
+ import json
+
+
+ roles_map = {
+     'system': 'system',
+     'user': 'user',
+     'human': 'user',
+     'assistant': 'assistant',
+     'gpt': 'assistant',
+     'AI': 'assistant',
+ }
+
+
+ cpt_instruct_datasets = [
+     #
+     # general instructs
+     #
+     # 1.48 GB, 1,420,909
+     # mlabonne/open-perfectblend
+     #     meta-math/MetaMathQA 395,000
+     #     openbmb/UltraInteract_sft 288,579
+     #     HuggingFaceH4/ultrachat_200k 207,865
+     #     microsoft/orca-math-word-problems-200k 200,035
+     #     HuggingFaceH4/ultrafeedback_binarized 187,405
+     #     theblackcat102/evol-codealpaca-v1 111,272
+     #     Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
+     #     mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
+     {'kind': 'instruct', 'path': 'mlabonne/open-perfectblend', 'split': 'train', 'field': 'conversations', 'transform': lambda msgs: [
+         {'role': roles_map[m['from']], 'content': m['value']}
+         for m in msgs
+     ]},
+
+     # 1.41 GB, 939,343
+     # allenai/tulu-3-sft-mixture
+     #     CoCoNot (ODC-BY-1.0), 10,983 prompts (Brahman et al., 2024)
+     #     FLAN v2 via ai2-adapt-dev/flan_v2_converted, 89,982 prompts (Longpre et al., 2023)
+     #     No Robots (CC-BY-NC-4.0), 9,500 prompts (Rajani et al. 2023)
+     #     OpenAssistant Guanaco (Apache 2.0), 7,132 prompts (Kopf et al., 2024)
+     #     Tulu 3 Persona MATH (ODC-BY-1.0), 149,960 prompts
+     #     Tulu 3 Persona GSM (ODC-BY-1.0), 49,980 prompts
+     #     Tulu 3 Persona Python (ODC-BY-1.0), 34,999 prompts
+     #     Tulu 3 Persona Algebra (ODC-BY-1.0), 20,000 prompts
+     #     Tulu 3 Persona IF (ODC-BY-1.0), 29,980 prompts
+     #     NuminaMath-TIR (Apache 2.0), 64,312 prompts (Beeching et al. 2024)
+     #     Tulu 3 WildGuardMix (Apache 2.0), 50,000 prompts (Han et al., 2024)
+     #     Tulu 3 WildJailbreak (ODC-BY-1.0), 50,000 prompts (Wildteaming, 2024)
+     #     Tulu 3 Hardcoded (CC-BY-4.0), 240 prompts
+     #     Aya (Apache 2.0), 100,000 prompts (Singh et al., 2024)
+     #     WildChat GPT-4 (ODC-BY-1.0), 100,000 prompts (Zhao et al., 2024)
+     #     TableGPT (MIT), 5,000 prompts (Zha et al., 2023)
+     #     SciRIFF (ODC-BY-1.0), 10,000 prompts (Wadden et al., 2024)
+     #     Evol CodeAlpaca (Apache 2.0), 107,276 prompts (Luo et al., 2023)
+     {'kind': 'instruct', 'path': 'allenai/tulu-3-sft-mixture', 'split': 'train', 'field': 'messages'},
+
+     #
+     # multilingual instructs
+     #
+     # 2.48 GB, 5,808,694
+     # rombodawg/Everything_Instruct_Multilingual
+     #     Science:
+     #         antiven0m/physical-reasoning-dpoScience
+     #         LawalAfeez/science-dataset
+     #     Social media:
+     #         Kyle1668/AG-Tweets
+     #         euclaise/reddit-instruct-curated
+     #     General Knowledge:
+     #         NousResearch/CharacterCodex_Characters
+     #         jstet/quotes-500k_Famous_Quotes
+     #         FronkonGames/steam-games-dataset_Video_Games
+     #         totuta_youtube_subs_howto100M_HowTo
+     #     Multi-lingual:
+     #         Amani27/massive_translation_dataset
+     #         udmurtNLP/udmurt-russian-english-labse
+     #         grosenthal/latin_english
+     #         msarmi9/korean-english-multitarget-ted-talks-task
+     #         HaiderSultanArc/MT-Urdu-English_Translate
+     #         Garsa3112/ChineseEnglishTranslationDataset
+     #     Cooking:
+     #         andrewsiah/se_cooking_preference_sft
+     #         Hieu-Phamkaggle/food_recipes
+     #     Writing:
+     #         shahules786/PoetryFoundationData
+     #         euclaise/writingprompts
+     #         qwedsacf/ivypanda-essaysEssay
+     #     Medicine:
+     #         keivalya/MedQuad-MedicalQnADataset
+     #         nuvocare/MSD
+     #     History:
+     #         ambrosfitz10k/history_data_v4
+     #     Law:
+     #         dzunggg/legal-qa-v1
+     #     Role-Play:
+     #         roleplay4/fun_CoupleRP
+     #         Undi95andrijdavid/roleplay-conversation-sharegpt
+     #     News:
+     #         RealTimeData/bbc_news_alltime
+     #     Coding: (rombodawg/code_bagel)
+     #         layoric/tiny-codes-alpaca
+     #         glaiveai/glaive-code-assistant-v3
+     #         ajibawa-2023/Code-290k-ShareGPT
+     #         chargoddard/commitpack-ft-instruct-rated
+     #         iamtarun/code_instructions_120k_alpaca
+     #         ise-uiuc/Magicoder-Evol-Instruct-110K
+     #         cognitivecomputations/dolphin-coder
+     #         nickrosh/Evol-Instruct-Code-80k-v1
+     #         coseal/CodeUltraFeedback_binarized
+     #         CyberNative/Code_Vulnerability_Security_DPO
+     #     Math: (rombodawg/code_bagel)
+     #         TIGER-Lab/MathInstruct
+     #     Function calling: (rombodawg/code_bagel)
+     #         glaiveai/glaive-function-calling-v2
+     #     General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
+     #         teknium/OpenHermes-2.5
+     {'kind': 'instruct', 'path': 'rombodawg/Everything_Instruct_Multilingual', 'split': 'train', 'transform': lambda r: [
+         {'role': 'system', 'content': r['instruction']},
+         {'role': 'user', 'content': r['input']},
+         {'role': 'assistant', 'content': r['output']},
+     ]},
+ ]
scripts/backup/merge-core-into-base.yaml ADDED
@@ -0,0 +1,100 @@
+ slices:
+   # 1
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 2
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 3
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 4
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 5
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 6
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 7
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 8
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 9
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 10
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 11
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 12
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 13
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 14
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 15
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 16
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 17
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 18
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 19
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 20
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 21
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 22
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 23
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+   # 24
+   - sources:
+       - model: ../out/pretrain-core-converted/
+         layer_range: [0, 1]
+
+ merge_method: passthrough
+ dtype: bfloat16
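All 24 slices point at the same single layer of ../out/pretrain-core-converted/, so the passthrough merge stacks one pretrained layer into a 24-layer model. Since the file is purely repetitive, a short generator reproduces it; nothing beyond the schema already visible above is assumed:

```python
# Regenerates merge-core-into-base.yaml from the values in the file above.
MODEL = '../out/pretrain-core-converted/'
N_SLICES = 24

lines = ['slices:']
for i in range(N_SLICES):
    lines += [
        f'  # {i + 1}',
        '  - sources:',
        f'      - model: {MODEL}',
        '        layer_range: [0, 1]',
    ]
lines += ['', 'merge_method: passthrough', 'dtype: bfloat16']

with open('merge-core-into-base.yaml', 'w') as f:
    f.write('\n'.join(lines) + '\n')
```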
scripts/backup/prepare_pretrain_base_datasets.py ADDED
@@ -0,0 +1,59 @@
+ from functools import partial
+
+ from litgpt.tokenizer import Tokenizer
+ from litdata import optimize, TokensLoader, StreamingDataset
+ from transformers import AutoTokenizer
+
+ from utils import tokenize_fn
+ from pretrain_base_datasets import pretrain_base_datasets
+ from pretrain_instruct_datasets import pretrain_instruct_datasets
+ from pretrain_reflection_datasets import pretrain_reflection_datasets
+ from pretrain_reasoning_datasets import pretrain_reasoning_datasets
+
+
+ #
+ # optimize datasets
+ #
+ for i, (block_size, subchunk_size) in enumerate([(4097, 4000)]):
+     chunk_size = block_size * subchunk_size
+     output_dir = f'../pretrain-base-data-{i}-{block_size}-{subchunk_size}'
+
+     outputs = optimize(
+         fn=partial(
+             tokenize_fn,
+             hf_tokenizer=AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True),
+             tokenizer=Tokenizer('..'),
+         ),
+         inputs=(
+             pretrain_base_datasets +
+             pretrain_instruct_datasets +
+             pretrain_reflection_datasets +
+             pretrain_reasoning_datasets
+         ),
+         output_dir=output_dir,
+         chunk_size=chunk_size,  # Number of tokens to store per chunk. This is roughly 64 MB of tokens per chunk.
+         num_workers=32,
+         reorder_files=False,
+         ## This is important to inform LitData that we are encoding a contiguous 1D array (tokens).
+         ## LitData skips storing metadata for each sample, e.g. all the tokens are concatenated to form one large tensor.
+         # item_loader=TokensLoader(block_size=block_size),
+     )
+
+ #
+ # total number of chunks in datasets
+ #
+ for i, (block_size, subchunk_size) in enumerate([(4097, 4000)]):
+     chunk_size = block_size * subchunk_size
+     input_dir = f'../pretrain-base-data-{i}-{block_size}-{subchunk_size}'
+
+     dataset = StreamingDataset(
+         input_dir=input_dir,
+         item_loader=TokensLoader(block_size=block_size),
+     )
+
+     print(f'{i=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')
+
+     # total_tokens = sum(len(data) for data in dataset)
+     # print(f'Total number of tokens in the optimized dataset {input_dir!r} is {total_tokens}')
+     total_tokens = len(dataset) * block_size
+     print(f'Total number of tokens in the optimized dataset {input_dir!r} is {total_tokens}')
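The chunk geometry here is 4097 × 4000 = 16,388,000 tokens per chunk, which is the "roughly 64 MB" at 4 bytes per token. The odd block_size of 4097 is presumably max_seq_length + 1, the usual next-token-prediction layout; a tiny sketch of that assumption:

```python
# Assumption: one extra token per block lets the input and its shifted target
# both be cut from the same 4097-token window.
block = list(range(4097))   # stands in for one tokenized block
inputs, targets = block[:-1], block[1:]
assert len(inputs) == len(targets) == 4096
```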
scripts/backup/pretrain_base_datasets.py ADDED
@@ -0,0 +1,107 @@
+ pretrain_base_datasets = [
+     #
+     # multilingual
+     #
+     # 3.17 GB, 2,226,907
+     *[
+         {'kind': 'base', 'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
+         for i in range(0, 100, 10)
+     ],
+     # 1.64 GB, 1,001,000
+     *[
+         {'kind': 'base', 'path': 'distily/c4_multilingual_1M', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
+         for i in range(0, 100, 10)
+     ],
+     # 3.8 GB, 19,454,996
+     *[
+         {'kind': 'base', 'path': 'sentence-transformers/parallel-sentences-wikimatrix', 'data_dir': 'all', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['non_english']}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # general knowledge
+     #
+     # 65.1 MB, 7,819
+     {'kind': 'base', 'path': 'Sketched33/Cities_Wikipedia_Information', 'format': lambda n: n['wikipedia_content']},
+     # 135 MB, 1,795
+     {'kind': 'base', 'path': 'open-phi/textbooks', 'format': lambda n: n['markdown']},
+     # 631 MB, 111,048
+     {'kind': 'base', 'path': 'open-phi/programming_books_llama', 'format': lambda n: n['markdown']},
+
+     #
+     # misc
+     #
+     # 472 KB, 5,034
+     {'kind': 'base', 'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},
+
+     #
+     # math
+     #
+     # 12.6 GB, 14M rows
+     *[
+         {'kind': 'base', 'path': 'nvidia/OpenMathInstruct-2', 'split': f'train[{i}%:{i + 10}%]', 'format': '{problem} {generated_solution} {expected_answer}'}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # stem
+     #
+     # 1.44 GB, 63,357
+     *[
+         {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['abstract']}
+         for i in range(0, 100, 10)
+     ],
+     *[
+         {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['markdown']}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # code
+     #
+     # 7.81 GB, ~2,804,025
+     *[
+         {'kind': 'base', 'path': 'rombodawg/code_bagel_hermes-2.5', 'split': f'train[{i}%:{i + 10}%]', 'format': '{input} {output}'}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # multilingual
+     #
+     # 742 MB, 321,697
+     *[
+         {'kind': 'base', 'path': 'data-silence/sumnews', 'split': split, 'format': lambda n: n[field]}
+         for split in ['train', 'test']
+         for field in ['title', 'resume', 'news']
+     ],
+     # 193 MB, 1,141,967
+     *[
+         {'kind': 'base', 'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train', 'format': lambda n: n['text']}
+         for name in [
+             'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
+             'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
+             'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
+             'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
+             'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
+             'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
+             'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
+             'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
+             'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
+             'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
+             'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
+             'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
+             'zh-Hans', 'zh-Hant', 'zu',
+         ]
+     ],
+
+     #
+     # general knowledge
+     #
+     # 3.18 GB, 1,010,500 - uncompressed 6GB
+     *[
+         {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
+         for i in range(0, 100, 10)
+     ],
+     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
+     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
+ ]
scripts/backup/pretrain_instruct_datasets.py ADDED
@@ -0,0 +1,198 @@
+ import json
+
+
+ roles_map = {
+     'system': 'system',
+     'user': 'user',
+     'human': 'user',
+     'assistant': 'assistant',
+     'gpt': 'assistant',
+     'AI': 'assistant',
+ }
+
+
+ pretrain_instruct_datasets = [
+     #
+     # general instructs
+     #
+     # 138 MB, 205,568
+     {'kind': 'instruct', 'path': 'CohereForAI/aya_dataset', 'transform': lambda r: [
+         {'role': 'user', 'content': r['inputs']},
+         {'role': 'assistant', 'content': r['targets']},
+     ]},
+
+     # ~3 GB, 4,976,850
+     *[
+         {'kind': 'instruct', 'path': 'saillab/taco-datasets', 'data_dir': name, 'split': 'train', 'transform': lambda r: [
+             {'role': 'system', 'content': r['instruction']},
+             {'role': 'user', 'content': r['input']},
+             {'role': 'assistant', 'content': r['output']},
+         ]}
+         for name in [
+             'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
+             'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
+         ]
+     ],
+
+     # 1.48 GB, 1,420,909
+     # mlabonne/open-perfectblend
+     #     meta-math/MetaMathQA 395,000
+     #     openbmb/UltraInteract_sft 288,579
+     #     HuggingFaceH4/ultrachat_200k 207,865
+     #     microsoft/orca-math-word-problems-200k 200,035
+     #     HuggingFaceH4/ultrafeedback_binarized 187,405
+     #     theblackcat102/evol-codealpaca-v1 111,272
+     #     Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
+     #     mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
+     *[
+         {'kind': 'instruct', 'path': 'mlabonne/open-perfectblend', 'split': f'train[{i}%:{i + 10}%]', 'field': 'conversations', 'transform': lambda msgs: [
+             {'role': roles_map[m['from']], 'content': m['value']}
+             for m in msgs
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 4.58 GB, 1,752,473
+     # arcee-ai/The-Tome
+     #     - arcee-ai/infini-instruct-top-500k (BAAI/Infinity-Instruct)
+     #     - TIGER-Lab/WebInstructSub (top-500k) - IGNORE
+     #     - jondurbin/airoboros-3.2
+     #     - gardner/glaive-function-calling-v2-sharegpt
+     #     - arcee-ai/reasoning-sharegpt (SkunkworksAI/reasoning-0.01)
+     #     - arcee-ai/self-instruct-sharegpt (bigcode/self-oss-instruct-sc2-exec-filter-50k)
+     #     - cognitivecomputations/ultrainteract_trajectories_sharegpt
+     #     - cognitivecomputations/SystemChat-2.0
+     #     - arcee-ai/qwen2-72b-magpie-en
+     *[
+         {'kind': 'instruct', 'path': 'arcee-ai/The-Tome', 'split': f'train[{i}%:{i + 10}%]', 'field': 'conversations', 'transform': lambda msgs: [
+             {'role': roles_map[m['from']], 'content': m['value']}
+             for m in msgs
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 2.48 GB, 5,808,694
+     # rombodawg/Everything_Instruct_Multilingual
+     #     Science:
+     #         antiven0m/physical-reasoning-dpoScience
+     #         LawalAfeez/science-dataset
+     #     Social media:
+     #         Kyle1668/AG-Tweets
+     #         euclaise/reddit-instruct-curated
+     #     General Knowledge:
+     #         NousResearch/CharacterCodex_Characters
+     #         jstet/quotes-500k_Famous_Quotes
+     #         FronkonGames/steam-games-dataset_Video_Games
+     #         totuta_youtube_subs_howto100M_HowTo
+     #     Multi-lingual:
+     #         Amani27/massive_translation_dataset
+     #         udmurtNLP/udmurt-russian-english-labse
+     #         grosenthal/latin_english
+     #         msarmi9/korean-english-multitarget-ted-talks-task
+     #         HaiderSultanArc/MT-Urdu-English_Translate
+     #         Garsa3112/ChineseEnglishTranslationDataset
+     #     Cooking:
+     #         andrewsiah/se_cooking_preference_sft
+     #         Hieu-Phamkaggle/food_recipes
+     #     Writing:
+     #         shahules786/PoetryFoundationData
+     #         euclaise/writingprompts
+     #         qwedsacf/ivypanda-essaysEssay
+     #     Medicine:
+     #         keivalya/MedQuad-MedicalQnADataset
+     #         nuvocare/MSD
+     #     History:
+     #         ambrosfitz10k/history_data_v4
+     #     Law:
+     #         dzunggg/legal-qa-v1
+     #     Role-Play:
+     #         roleplay4/fun_CoupleRP
+     #         Undi95andrijdavid/roleplay-conversation-sharegpt
+     #     News:
+     #         RealTimeData/bbc_news_alltime
+     #     Coding: (rombodawg/code_bagel)
+     #         layoric/tiny-codes-alpaca
+     #         glaiveai/glaive-code-assistant-v3
+     #         ajibawa-2023/Code-290k-ShareGPT
+     #         chargoddard/commitpack-ft-instruct-rated
+     #         iamtarun/code_instructions_120k_alpaca
+     #         ise-uiuc/Magicoder-Evol-Instruct-110K
+     #         cognitivecomputations/dolphin-coder
+     #         nickrosh/Evol-Instruct-Code-80k-v1
+     #         coseal/CodeUltraFeedback_binarized
+     #         CyberNative/Code_Vulnerability_Security_DPO
+     #     Math: (rombodawg/code_bagel)
+     #         TIGER-Lab/MathInstruct
+     #     Function calling: (rombodawg/code_bagel)
+     #         glaiveai/glaive-function-calling-v2
+     #     General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
+     #         teknium/OpenHermes-2.5
+     *[
+         {'kind': 'instruct', 'path': 'rombodawg/Everything_Instruct_Multilingual', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             {'role': 'system', 'content': r['instruction']},
+             {'role': 'user', 'content': r['input']},
+             {'role': 'assistant', 'content': r['output']},
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 1.41 GB, 939,343
+     # allenai/tulu-3-sft-mixture
+     #     CoCoNot (ODC-BY-1.0), 10,983 prompts (Brahman et al., 2024)
+     #     FLAN v2 via ai2-adapt-dev/flan_v2_converted, 89,982 prompts (Longpre et al., 2023)
+     #     No Robots (CC-BY-NC-4.0), 9,500 prompts (Rajani et al. 2023)
+     #     OpenAssistant Guanaco (Apache 2.0), 7,132 prompts (Kopf et al., 2024)
+     #     Tulu 3 Persona MATH (ODC-BY-1.0), 149,960 prompts
+     #     Tulu 3 Persona GSM (ODC-BY-1.0), 49,980 prompts
+     #     Tulu 3 Persona Python (ODC-BY-1.0), 34,999 prompts
+     #     Tulu 3 Persona Algebra (ODC-BY-1.0), 20,000 prompts
+     #     Tulu 3 Persona IF (ODC-BY-1.0), 29,980 prompts
+     #     NuminaMath-TIR (Apache 2.0), 64,312 prompts (Beeching et al. 2024)
+     #     Tulu 3 WildGuardMix (Apache 2.0), 50,000 prompts (Han et al., 2024)
+     #     Tulu 3 WildJailbreak (ODC-BY-1.0), 50,000 prompts (Wildteaming, 2024)
+     #     Tulu 3 Hardcoded (CC-BY-4.0), 240 prompts
+     #     Aya (Apache 2.0), 100,000 prompts (Singh et al., 2024)
+     #     WildChat GPT-4 (ODC-BY-1.0), 100,000 prompts (Zhao et al., 2024)
+     #     TableGPT (MIT), 5,000 prompts (Zha et al., 2023)
+     #     SciRIFF (ODC-BY-1.0), 10,000 prompts (Wadden et al., 2024)
+     #     Evol CodeAlpaca (Apache 2.0), 107,276 prompts (Luo et al., 2023)
+     *[
+         {'kind': 'instruct', 'path': 'allenai/tulu-3-sft-mixture', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # tool/function calling
+     #
+     # 65.7 MB, 11,578
+     {'kind': 'instruct', 'path': 'NousResearch/hermes-function-calling-v1', 'field': 'conversations', 'transform': lambda msgs: [
+         {'role': roles_map[m['from']], 'content': m['value']}
+         for m in msgs
+     ]},
+
+     #
+     # agent
+     #
+     # 1.51 GB, 485,874
+     *[
+         {'kind': 'instruct', 'path': 'arcee-ai/agent-data', 'split': f'train[{i}%:{i + 10}%]', 'field': 'conversations', 'transform': lambda msgs: [
+             {'role': roles_map[m['from']], 'content': m['value']}
+             for m in msgs
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 2.21 GB, 1,046,410
+     *[
+         {'kind': 'instruct', 'path': 'microsoft/orca-agentinstruct-1M-v1', 'split': split, 'field': 'messages', 'transform': lambda msgs: json.loads(msgs)}
+         for split in [
+             'creative_content', 'text_modification', 'struct2text_flow', 'rc', 'rag',
+             'text_extraction', 'mcq', 'follow_up', 'analytical_reasoning', 'fermi', 'fs_cot_flow',
+             'code_', 'brain_teaser', 'text_classification', 'open_domain_qa',
+         ]
+     ],
+
+     #
+     # general instructs
+     #
+     # 1.52 GB, 214k (3.98 GB, 814,334)
+     {'kind': 'instruct', 'path': 'cognitivecomputations/dolphin-r1', 'data_files': 'dolphin-r1-nonreasoning.jsonl', 'split': 'train', 'field': 'messages'},
+     # 4.15 GB, 2,197,730
+     {'kind': 'instruct', 'path': 'HuggingFaceTB/smoltalk', 'name': 'all', 'field': 'messages'},
+ ]
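Note the microsoft/orca-agentinstruct-1M-v1 entries: their messages field arrives as a JSON-encoded string, which is why the transform is simply json.loads. A tiny illustration (the sample string is invented):

```python
import json

# orca-agentinstruct stores each conversation as one JSON string per row.
raw = '[{"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}]'
messages = json.loads(raw)
print(messages[0]['role'])  # user
```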
scripts/backup/pretrain_reasoning_datasets.py ADDED
@@ -0,0 +1,98 @@
+ roles_map = {
+     'system': 'system',
+     'user': 'user',
+     'human': 'user',
+     'assistant': 'assistant',
+     'gpt': 'assistant',
+     'AI': 'assistant',
+ }
+
+
+ pretrain_reasoning_datasets = [
+     #
+     # basic reasoning
+     #
+     # 10.8 MB, 15,770
+     {'kind': 'instruct', 'path': 'AtlasUnified/Atlas-Reasoning', 'data_files': 'reasoning.csv', 'transform': lambda r: [
+         {'role': 'user', 'content': r['Prompt']},
+         {'role': 'assistant', 'content': r['Step-by-step reasoning'] + '\n' + r['Solution']},
+     ]},
+     # 1.23 GB, 859,594
+     *[
+         {'kind': 'instruct', 'path': 'AI-MO/NuminaMath-CoT', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
+         for i in range(0, 100, 10)
+     ],
+     # 148 MB, 72,540
+     *[
+         {'kind': 'instruct', 'path': 'AI-MO/NuminaMath-TIR', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # math reasoning
+     #
+     # 1.79 MB, 3,963
+     {'kind': 'instruct', 'path': 'AlgorithmicResearchGroup/math_reasoning_autoformalization_track', 'transform': lambda r: [
+         {'role': 'user', 'content': r['informal_statement']},
+         {'role': 'assistant', 'content': r['informal_proof'] + '\n' + r['formal_proof']},
+     ]},
+     # 307 MB, 19,944
+     {'kind': 'instruct', 'path': 'KingNish/reasoning-base-20k', 'transform': lambda r: [
+         {'role': 'user', 'content': r['user']},
+         {'role': 'assistant', 'content': r['reasoning'] + '\n' + r['assistant']},
+     ]},
+     # 9.45 MB, 10,000
+     {'kind': 'instruct', 'path': 'Aarushhh/math-reasoning-10k', 'transform': lambda r: [
+         {'role': 'user', 'content': r['problem']},
+         {'role': 'assistant', 'content': r['plan'] + '\n' + r['solution']},
+     ]},
+
+     #
+     # cot reasoning
+     #
+     # 11.7 GB, 1,850,809
+     *[
+         {'kind': 'instruct', 'path': 'ServiceNow-AI/R1-Distill-SFT', 'data_dir': 'v0', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             {'role': 'user', 'content': r['problem']},
+             {'role': 'assistant', 'content': r['reannotated_assistant_content']},
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     *[
+         {'kind': 'instruct', 'path': 'ServiceNow-AI/R1-Distill-SFT', 'data_dir': 'v1', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: r['reannotated_messages']}
+         for i in range(0, 100, 10)
+     ],
+     # 3.85 GB, 300k (3.98 GB, 814,334)
+     *[
+         {'kind': 'instruct', 'path': 'cognitivecomputations/dolphin-r1', 'data_files': 'dolphin-r1-reasoning-deepseek.jsonl', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             *r['messages'],
+             # {'role': 'assistant', 'content': (('<think>\n' + r['reasoning'] + '\n</think>\n') if r.get('reasoning') else '') + r['answer']},
+             {'role': 'assistant', 'content': (r.get('reasoning') or '') + (r.get('answer') or '')},
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 3.49 GB, 300k (3.98 GB, 814,334)
+     *[
+         {'kind': 'instruct', 'path': 'cognitivecomputations/dolphin-r1', 'data_files': 'dolphin-r1-reasoning-flash.jsonl', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
+             *r['messages'],
+             # {'role': 'assistant', 'content': (('<think>\n' + r['reasoning'] + '\n</think>\n') if r.get('reasoning') else '') + r['answer']},
+             {'role': 'assistant', 'content': (r.get('reasoning') or '') + (r.get('answer') or '')},
+         ]}
+         for i in range(0, 100, 10)
+     ],
+     # 1.08 GB, 113,957
+     {'kind': 'instruct', 'path': 'open-thoughts/OpenThoughts-114k', 'split': 'train', 'field': 'conversations', 'transform': lambda msgs: [
+         {'role': roles_map[m['from']], 'content': m['value']}
+         for m in msgs
+     ]},
+     # 384 MB, 77,685
+     {'kind': 'instruct', 'path': 'O1-OPEN/OpenO1-SFT', 'split': 'train', 'transform': lambda r: [
+         {'role': 'user', 'content': r['instruction']},
+         {'role': 'assistant', 'content': r['output']},
+     ]},
+     # 6.88 MB, 1,000
+     {'kind': 'instruct', 'path': 'simplescaling/s1K', 'split': 'train', 'transform': lambda r: [
+         {'role': 'user', 'content': r['question']},
+         {'role': 'assistant', 'content': '<think>\n' + '\n'.join(r['thinking_trajectories']) + '\n</think>\n' + r['solution']},
+     ]},
+ ]
scripts/backup/pretrain_reflection_datasets.py ADDED
@@ -0,0 +1,39 @@
+ roles_map = {
+     'system': 'system',
+     'user': 'user',
+     'human': 'user',
+     'assistant': 'assistant',
+     'gpt': 'assistant',
+     'AI': 'assistant',
+ }
+
+
+ pretrain_reflection_datasets = [
+     #
+     # reflection
+     #
+     # 4.17 MB, 1,000
+     {'kind': 'instruct', 'path': 'dvilasuero/reflection-v1-gpt-4o-judge', 'transform': lambda r: [
+         {'role': 'system', 'content': r['system']},
+         {'role': 'user', 'content': r['prompt']},
+         {'role': 'assistant', 'content': r['response']},
+     ]},
+     # 12.4 MB, 3,000
+     {'kind': 'instruct', 'path': 'dvilasuero/reflection-v1-openai-o-mini-judge', 'transform': lambda r: [
+         {'role': 'system', 'content': r['system']},
+         {'role': 'user', 'content': r['prompt']},
+         {'role': 'assistant', 'content': r['response']},
+     ]},
+     # 70.8 MB, 36,549
+     {'kind': 'instruct', 'path': 'dvilasuero/reflection-v1-final-dedup', 'transform': lambda r: [
+         {'role': 'system', 'content': r['system']},
+         {'role': 'user', 'content': r['prompt']},
+         {'role': 'assistant', 'content': r['response']},
+     ]},
+     # 30.6 MB, 25,391
+     {'kind': 'instruct', 'path': 'flozi00/reflection-qwen2.5-72b-260924', 'transform': lambda r: [
+         r['system'][0],
+         {'role': 'user', 'content': r['input']},
+         {'role': 'assistant', 'content': r['reflection'] + '\n' + r['output']},
+     ]},
+ ]
scripts/backup/unsloth_utils.py ADDED
@@ -0,0 +1,125 @@
+ from typing import Optional, Iterator, Callable, Any
+
+ import torch
+ from datasets import load_dataset, concatenate_datasets
+ from transformers import AutoTokenizer
+
+
+ def load_text_dataset(tokenizer: AutoTokenizer,
+                       kind: str,
+                       path: str,
+                       name: Optional[str]=None,
+                       data_dir: Optional[str]=None,
+                       data_files: Optional[str]=None,
+                       keep_in_memory: bool=False,
+                       revision: Optional[str]=None,
+                       split: str='train',
+                       num_proc: Optional[int]=None,
+                       format: Optional[Callable|str]=None) -> Any:
+     assert isinstance(format, str) or callable(format), f'{path=} {format=}'
+     assert kind == 'base'
+
+     dataset = load_dataset(path=path,
+                            name=name,
+                            data_dir=data_dir,
+                            data_files=data_files,
+                            keep_in_memory=keep_in_memory,
+                            revision=revision,
+                            split=split,
+                            trust_remote_code=True,
+                            num_proc=num_proc)
+
+     EOS_TOKEN = tokenizer.eos_token
+
+     def format_dataset(batch):
+         nonlocal EOS_TOKEN
+         nonlocal format
+         texts: list = []
+         rows = [dict(zip(batch.keys(), values)) for values in zip(*batch.values())]
+
+         if callable(format):
+             for row in rows:
+                 # print(f'{row=}')
+                 text = format(row)
+
+                 if not text:
+                     text = '[NONE]'
+
+                 text += EOS_TOKEN
+                 texts.append(text)
+         else:
+             for row in rows:
+                 # print(f'{row=}')
+                 text = format.format(**row)
+
+                 if not text:
+                     text = '[NONE]'
+
+                 text += EOS_TOKEN
+                 texts.append(text)
+
+         return {'text': texts}
+
+     dataset = dataset.map(format_dataset, batched=True)
+     return dataset
+
+
+ def load_chat_dataset(tokenizer: AutoTokenizer,
+                       kind: str,
+                       path: str,
+                       name: Optional[str]=None,
+                       data_dir: Optional[str]=None,
+                       data_files: Optional[str]=None,
+                       keep_in_memory: bool=False,
+                       revision: Optional[str]=None,
+                       split: str='train',
+                       num_proc: Optional[int]=None,
+                       field: Optional[str]=None,
+                       transform: Optional[Callable]=None) -> Any:
+     assert kind == 'instruct'
+
+     dataset = load_dataset(path=path,
+                            name=name,
+                            data_dir=data_dir,
+                            data_files=data_files,
+                            keep_in_memory=keep_in_memory,
+                            revision=revision,
+                            split=split,
+                            trust_remote_code=True,
+                            num_proc=num_proc)
+
+     EOS_TOKEN = tokenizer.eos_token
+
+     def format_dataset(batch):
+         nonlocal EOS_TOKEN
+         nonlocal tokenizer
+         nonlocal field
+         nonlocal transform
+         texts: list = []
+         rows = [dict(zip(batch.keys(), values)) for values in zip(*batch.values())]
+
+         if callable(transform):
+             for row in rows:
+                 if field:
+                     messages = transform(row[field])
+                 else:
+                     messages = transform(row)
+
+                 text = tokenizer.apply_chat_template(messages, tokenize=False)
+                 text += EOS_TOKEN
+                 texts.append(text)
+         else:
+             for row in rows:
+                 if field:
+                     messages = row[field]
+                 else:
+                     raise ValueError(field)
+
+                 text = tokenizer.apply_chat_template(messages, tokenize=False)
+                 text += EOS_TOKEN
+                 texts.append(text)
+
+         return {'text': texts}
+
+     dataset = dataset.map(format_dataset, batched=True)
+     return dataset
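These two loaders consume the config dicts defined throughout this commit: kind routes to the right loader, format renders base rows to plain text, and field/transform normalize chat rows before apply_chat_template; every rendered sample is terminated with the tokenizer's EOS token. A minimal usage sketch, assuming this repo's tokenizer one directory up (as in cpt_base_model.py above):

```python
# Usage sketch; mirrors how cpt_base_model.py drives these loaders.
from transformers import AutoTokenizer
from unsloth_utils import load_text_dataset

tokenizer = AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)

config = {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'validation',
          'format': lambda n: n['text']}
dataset = load_text_dataset(tokenizer, **config)
print(dataset[0]['text'][-50:])  # ends with the EOS token
```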
scripts/core_base_datasets.py ADDED
@@ -0,0 +1,89 @@
+ core_base_datasets = [
+     #
+     # multilingual
+     #
+     # 3.17 GB, 2,226,907
+     *[
+         {'kind': 'base', 'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+         for i in range(0, 100, 5)
+     ],
+     # 1.64 GB, 1,001,000
+     *[
+         {'kind': 'base', 'path': 'distily/c4_multilingual_1M', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+         for i in range(0, 100, 5)
+     ],
+     # 742 MB, 321,697
+     *[
+         {'kind': 'base', 'path': 'data-silence/sumnews', 'split': split, 'format': lambda n: n[field]}
+         for split in ['train', 'test']
+         for field in ['title', 'resume', 'news']
+     ],
+     # 193 MB, 1,141,967
+     *[
+         {'kind': 'base', 'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train', 'format': lambda n: n['text']}
+         for name in [
+             'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
+             'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
+             'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
+             'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
+             'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
+             'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
+             'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
+             'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
+             'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
+             'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
+             'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
+             'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
+             'zh-Hans', 'zh-Hant', 'zu',
+         ]
+     ],
+
+     #
+     # misc
+     #
+     # 472 KB, 5,034
+     {'kind': 'base', 'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},
+
+     #
+     # stem
+     #
+     # 12.2 MB, 500,000
+     {'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{instruction} = {output}'},
+     {'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'test', 'format': '{instruction} = {output}'},
+     # 125 MB, 1,000,000
+     {'kind': 'base', 'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{expression} = {result}'},
+
+     # 1.44 GB, 63,357
+     *[
+         {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['abstract']}
+         for i in range(0, 100, 10)
+     ],
+     *[
+         {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['markdown']}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # code
+     #
+     # 36.8 MB, 79,013
+     # Rosetta Code currently has 1,203 tasks, 389 draft tasks, and is aware of 883 languages
+     {'kind': 'base', 'path': 'christopher/rosetta-code', 'format': lambda n: n['code']},
+     # 1.62 GB, 1,632,309
+     # Python, TypeScript, JavaScript, Ruby, Julia, Rust, C++, Bash, Java, C#, and Go; SQL, Cypher
+     *[
+         {'kind': 'base', 'path': 'nampdn-ai/tiny-codes', 'split': f'train[{i}%:{i + 10}%]', 'format': '{prompt} {response}'}
+         for i in range(0, 100, 10)
+     ],
+
+     #
+     # general knowledge
+     #
+     # 3.18 GB, 1,010,500 - uncompressed 6GB
+     *[
+         {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+         for i in range(0, 100, 5)
+     ],
+     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
+     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
+ ]
scripts/core_instruct_datasets.py ADDED
@@ -0,0 +1,43 @@
+ roles_map = {
+     'system': 'system',
+     'user': 'user',
+     'human': 'user',
+     'assistant': 'assistant',
+     'gpt': 'assistant',
+     'AI': 'assistant',
+ }
+
+ R1_SYSTEM_PROMPT = '''\
+ You are an AI assistant.
+
+ Your primary directive is to provide well-reasoned, structured, and extensively detailed responses.
+
+ Formatting Requirements:
+ - Always structure your replies using: <think>{reasoning}</think>{answer}
+ - The <think></think> block should contain at least six reasoning steps when applicable.
+ - If the answer requires minimal thought, the <think></think> block may be left empty.
+ - The user does not see the <think></think> section. Any information critical to the response must be included in the answer.
+ - If you notice that you have engaged in circular reasoning or repetition, immediately terminate {reasoning} with a </think> and proceed to the {answer}.
+
+ Response Guidelines:
+ - Detailed and Structured: Use rich Markdown formatting for clarity and readability.
+ - Scientific and Logical Approach: Your explanations should reflect the depth and precision of the greatest scientific minds.
+ - Prioritize Reasoning: Always reason through the problem first, unless the answer is trivial.
+ - Concise yet Complete: Ensure responses are informative, yet to the point without unnecessary elaboration.
+ - Maintain a professional, intelligent, and analytical tone in all interactions.'''
+
+ core_instruct_datasets = [
+     # 65.7 MB, 11,578
+     # 1.89k
+     {'kind': 'instruct', 'path': 'NousResearch/hermes-function-calling-v1', 'data_files': 'func-calling-singleturn.json', 'split': 'train', 'field': 'conversations', 'transform': lambda msgs: [
+         {'role': roles_map[m['from']], 'content': m['value']}
+         for m in msgs
+     ]},
+
+     # 21.1 MB, 1,000
+     {'kind': 'instruct', 'path': 'simplescaling/s1K-1.1', 'split': 'train', 'transform': lambda r: [
+         {'role': 'system', 'content': R1_SYSTEM_PROMPT},
+         {'role': 'user', 'content': r.get('question') or ''},
+         {'role': 'assistant', 'content': '<think>\n' + (r.get('deepseek_thinking_trajectory') or '') + '\n</think>\n' + (r.get('solution') or '')},
+     ]}
+ ]
scripts/prepare_core_datasets.py ADDED
@@ -0,0 +1,52 @@
+ from functools import partial
+
+ from litgpt.tokenizer import Tokenizer
+ from litdata import optimize, TokensLoader, StreamingDataset
+ from transformers import AutoTokenizer
+
+ from utils import tokenize_fn
+ from core_base_datasets import core_base_datasets
+ from core_instruct_datasets import core_instruct_datasets
+
+
+ #
+ # optimize datasets
+ #
+ for i, (block_size, subchunk_size) in enumerate([(8192, 2000)]):
+     chunk_size = block_size * subchunk_size
+     output_dir = f'../core-data-{i}-{block_size}-{subchunk_size}'
+
+     outputs = optimize(
+         fn=partial(
+             tokenize_fn,
+             hf_tokenizer=AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True),
+             tokenizer=Tokenizer('..'),
+         ),
+         inputs=core_base_datasets + core_instruct_datasets,
+         output_dir=output_dir,
+         chunk_size=chunk_size,  # Number of tokens to store per chunk. This is roughly 64 MB of tokens per chunk.
+         num_workers=32,
+         reorder_files=False,
+         ## This is important to inform LitData that we are encoding a contiguous 1D array (tokens).
+         ## LitData skips storing metadata for each sample, e.g. all the tokens are concatenated to form one large tensor.
+         # item_loader=TokensLoader(block_size=block_size),
+     )
+
+ #
+ # total number of chunks in datasets
+ #
+ for i, (block_size, subchunk_size) in enumerate([(8192, 2000)]):
+     chunk_size = block_size * subchunk_size
+     input_dir = f'../core-data-{i}-{block_size}-{subchunk_size}'
+
+     dataset = StreamingDataset(
+         input_dir=input_dir,
+         item_loader=TokensLoader(block_size=block_size),
+     )
+
+     print(f'{i=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')
+
+     # total_tokens = sum(len(data) for data in dataset)
+     # print(f'Total number of tokens in the optimized dataset {input_dir!r} is {total_tokens}')
+     total_tokens = len(dataset) * block_size
+     print(f'Total number of tokens in the optimized dataset {input_dir!r} is {total_tokens}')
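For the core run the chunks hold 8192 × 2000 = 16,384,000 tokens, about 62.5 MiB at 4 bytes per token (the "roughly 64 MB" in the comment). The max_tokens: 7318364160 in the training config below factors exactly as 893,355 × 8192, presumably len(dataset) × block_size from the printout above:

```python
# Pure arithmetic cross-check of the chunk and token budgets above.
block_size, subchunk_size = 8192, 2000
chunk_size = block_size * subchunk_size
print(chunk_size, chunk_size * 4 / 2**20)      # 16384000 tokens, ~62.5 MiB

assert 893_355 * block_size == 7_318_364_160   # max_tokens in pretrain-core-model.yaml
```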
scripts/pretrain-core-model.yaml ADDED
@@ -0,0 +1,142 @@
+ # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
+ # ``model_config``. (type: Optional[str], default: null)
+ model_name: 'tangled-alpha-0.1-core'
+
+ # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
+ # ``model_name``. (type: Optional[Config], default: null)
+ model_config:
+   name: 'tangled-alpha-0.1-core'
+   block_size: 131072
+   vocab_size: 32064
+   padded_vocab_size: 32064
+   n_layer: 32
+   n_head: 4
+   n_embd: 512
+   n_query_groups: 4
+   rotary_percentage: 1.0
+   parallel_residual: False
+   bias: False
+   norm_class_name: "RMSNorm"
+   mlp_class_name: "LLaMAMLP"
+   intermediate_size: 1792
+   norm_eps: 1e-5
+   rope_base: 500000
+   rope_adjustments:
+     factor: 32.0
+     low_freq_factor: 1.0
+     high_freq_factor: 4.0
+     original_max_seq_len: 8192
+   head_size: 128  # n_embd / n_head
+
+ # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
+ # /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
+ out_dir: "../out/pretrain-core/"
+
+ # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+ # precision: bf16-mixed
+ precision: bf16-true
+
+ # Optional path to a checkpoint directory to initialize the model from.
+ # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
+ initial_checkpoint_dir:
+
+ # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
+ # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
+ # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
+ # (type: Union[bool, Literal["auto"], Path], default: False)
+ resume: "auto"
+
+ # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
+ data:
+   class_path: LitData
+
+   init_args:
+     data_path: "../core-data-0-8192-2000/"
+     num_workers: 32
+
+ # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+ train:
+   # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+   save_interval: 100
+
+   # Number of iterations between logging calls (type: int, default: 1)
+   log_interval: 1
+
+   # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
+   global_batch_size: 512
+   # global_batch_size: 256
+
+   # Number of samples per data-parallel rank (type: int, default: 4)
+   micro_batch_size: 2
+   # micro_batch_size: 1
+
+   # Number of iterations with learning rate warmup active (type: int, default: 2000)
+   lr_warmup_steps: 200
+
+   # Number of epochs to train on (type: Optional[int], default: null)
+   epochs:
+
+   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
+   max_tokens: 7318364160
+
+   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
+   max_steps:
+
+   # Limits the length of samples. Off by default (type: Optional[int], default: null)
+   max_seq_length:
+
+   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
+   tie_embeddings: true
+
+   # (type: Optional[float], default: 1.0)
+   max_norm: 1.0
+
+   # (type: float, default: 4e-05)
+   min_lr: 1e-05
+
+ # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+ eval:
+   # Number of optimizer steps between evaluation calls (type: int, default: 1000)
+   interval: 50
+
+   # Number of tokens to generate (type: Optional[int], default: null)
+   max_new_tokens:
+
+   # Number of iterations (type: int, default: 100)
+   max_iters: 100
+
+   # Whether to evaluate on the validation set at the beginning of the training
+   initial_validation: false
+
+   # Whether to evaluate on the validation set at the end of the training
+   final_validation: true
+
+ # Optimizer-related arguments
+ optimizer:
+   class_path: torch.optim.AdamW
+   init_args:
+     # (type: float, default: 0.001)
+     lr: 1e-4
+     # (type: float, default: 0.01)
+     weight_decay: 0.01
+     # (type: tuple, default: (0.9,0.999))
+     betas:
+       - 0.9
+       - 0.99
+
+ # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
+ devices: auto
+
+ # How many nodes to use. (type: int, default: 1)
+ num_nodes: 1
+
+ # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
+ # modules require this. (type: Optional[Path], default: null)
+ tokenizer_dir: "../"
+
+ # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
+ logger_name: "wandb"
+
+ # The random seed to use for reproducibility. (type: int, default: 42)
+ seed: 23
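
A quick back-of-the-envelope check of the schedule this config implies (assuming every sample is one 8192-token block, as produced by the TokensLoader pipeline above):

block_size = 8192
global_batch_size = 512
max_tokens = 7_318_364_160                        # exactly 893,355 blocks of 8,192 tokens,
                                                  # i.e. presumably one pass over the optimized dataset

tokens_per_step = global_batch_size * block_size  # 4,194,304 tokens per optimizer step
print(max_tokens // tokens_per_step)              # -> 1744 full optimizer steps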
scripts/requirements.in ADDED
@@ -0,0 +1,28 @@
+ # pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+ torch>=2.5.0,<2.6.0
+ numpy<2.0
+
+ tqdm
+ datasets
+ jinja2
+ transformers
+ wandb
+ # litgpt[all]
+ litgpt[all] @ git+https://github.com/Lightning-AI/litgpt.git
+ mergekit @ git+https://github.com/arcee-ai/mergekit.git
+ # litgpt @ git+https://github.com/Lightning-AI/litgpt.git
+ # litdata
+ # litdata @ git+https://github.com/Lightning-AI/litdata.git
+ # lpmm @ git+https://github.com/thu-ml/low-bit-optimizers.git
+ # muon @ git+https://github.com/KellerJordan/Muon
+ # pytorch-optimizer
+ lm_eval[ifeval,math]
+ bitsandbytes
+ # grokadamw
+ # sophia-opt
+ # bitsandbytes
+ # pyzstd
+ # zstd
+ unsloth
+
+ Pillow
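
The .in extension follows the pip-tools convention: this file lists direct, loosely pinned dependencies, from which a fully pinned lock file would typically be generated (for example with pip-compile, or uv's compatible compile command).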
scripts/utils.py ADDED
@@ -0,0 +1,124 @@
+ import gc
+ from typing import Optional, Iterator, Callable
+
+ import torch
+ from datasets import load_dataset
+ from litgpt.tokenizer import Tokenizer
+ from transformers import AutoTokenizer
+
+
+ def batch_text_iterator(kind: str,
+                         path: str,
+                         name: Optional[str]=None,
+                         data_dir: Optional[str]=None,
+                         data_files: Optional[str]=None,
+                         keep_in_memory: bool=False,
+                         revision: Optional[str]=None,
+                         split: str='train',
+                         num_proc: Optional[int]=None,
+                         format: Optional[Callable|str]=None) -> Iterator[str]:
+     assert isinstance(format, str) or callable(format), f'{path=} {format=}'
+     assert kind == 'base'
+
+     dataset = load_dataset(path=path,
+                            name=name,
+                            data_dir=data_dir,
+                            data_files=data_files,
+                            keep_in_memory=keep_in_memory,
+                            revision=revision,
+                            split=split,
+                            trust_remote_code=True,
+                            num_proc=num_proc)
+
+     if callable(format):
+         for row in dataset:
+             text = format(row)
+             yield text
+     else:
+         for row in dataset:
+             text = format.format(**row)
+             yield text
+
+     del dataset
+     gc.collect()
+
+
+ def batch_chat_iterator(kind: str,
+                         path: str,
+                         name: Optional[str]=None,
+                         data_dir: Optional[str]=None,
+                         data_files: Optional[str]=None,
+                         keep_in_memory: bool=False,
+                         revision: Optional[str]=None,
+                         split: str='train',
+                         num_proc: Optional[int]=None,
+                         field: Optional[str]=None,
+                         transform: Optional[Callable]=None) -> Iterator[list[dict[str, str]]]:
+     assert kind == 'instruct'
+
+     dataset = load_dataset(path=path,
+                            name=name,
+                            data_dir=data_dir,
+                            data_files=data_files,
+                            keep_in_memory=keep_in_memory,
+                            revision=revision,
+                            split=split,
+                            trust_remote_code=True,
+                            num_proc=num_proc)
+
+     if callable(transform):
+         for row in dataset:
+             if field:
+                 messages = transform(row[field])
+             else:
+                 messages = transform(row)
+
+             yield messages
+     else:
+         for row in dataset:
+             if field:
+                 messages = row[field]
+             else:
+                 raise ValueError(field)
+
+             yield messages
+
+     del dataset
+     gc.collect()
+
+
+ def tokenize_text_fn(dataset_config: dict, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer) -> Iterator[torch.Tensor]:
+     for text in batch_text_iterator(**dataset_config):
+         text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=True)
+         yield text_ids
+
+
+ def tokenize_chat_fn(dataset_config: dict, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer) -> Iterator[torch.Tensor]:
+     for messages in batch_chat_iterator(**dataset_config):
+         text: str = hf_tokenizer.apply_chat_template(messages, tokenize=False)
+         text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=False)
+         yield text_ids
+
+
+ def tokenize_fn(dataset_config: dict, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer) -> Iterator[torch.Tensor]:
+     if dataset_config['kind'] == 'base':
+         for text in batch_text_iterator(**dataset_config):
+             try:
+                 text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=True)
+             except Exception as e:
+                 print(f'Skip base row: {e=} {type(text)=} {text=}')
+                 continue
+
+             yield text_ids
+     elif dataset_config['kind'] == 'instruct':
+         for messages in batch_chat_iterator(**dataset_config):
+             try:
+                 text: str = hf_tokenizer.apply_chat_template(messages, tokenize=False)
+                 text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=False)
+             except Exception as e:
+                 print(f'Skip instruct row: {e=} {type(messages)=} {messages=}')
+                 continue
+
+             yield text_ids
+     else:
+         raise ValueError(dataset_config['kind'])
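
A minimal sketch of driving tokenize_fn by hand, outside litdata's optimize() (the dataset entry is hypothetical and only mirrors the shape of the config dicts above; the '..' tokenizer paths match prepare_core_datasets.py):

from transformers import AutoTokenizer
from litgpt.tokenizer import Tokenizer
from utils import tokenize_fn

# hypothetical 'base' entry in the same shape as core_base_datasets items
config = {'kind': 'base', 'path': 'wikimedia/wikipedia', 'name': '20231101.en',
          'split': 'train[:1%]', 'format': lambda r: r['text']}

hf_tokenizer = AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)
tokenizer = Tokenizer('..')

for text_ids in tokenize_fn(config, hf_tokenizer=hf_tokenizer, tokenizer=tokenizer):
    print(text_ids.shape)  # 1D tensor of token ids for one document, eos appended
    break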
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:83b2d408ebeae398f24964d4e7ce0c847cd7ff554519941355641c7d0f68b09b
+ size 1845893
tokenizer_config.json ADDED
@@ -0,0 +1,194 @@
+ {
+   "add_bos_token": false,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": false
+     },
+     "32000": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32001": {
+       "content": "<|assistant|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32002": {
+       "content": "<|placeholder1|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32003": {
+       "content": "<|placeholder2|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32004": {
+       "content": "<|placeholder3|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32005": {
+       "content": "<|placeholder4|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32006": {
+       "content": "<|system|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32007": {
+       "content": "<|end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32008": {
+       "content": "<|placeholder5|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32009": {
+       "content": "<|placeholder6|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32010": {
+       "content": "<|user|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32011": {
+       "content": "<tools>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32012": {
+       "content": "</tools>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32013": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32014": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32015": {
+       "content": "<tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32016": {
+       "content": "</tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32017": {
+       "content": "<think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     },
+     "32018": {
+       "content": "</think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>' + message['content'] + '<|end|>'}}{% elif message['role'] == 'user' %}{{'<|user|>' + message['content'] + '<|end|>'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>' + message['content'] + '<|end|>'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "legacy": false,
+   "model_max_length": 131072,
+   "pad_token": "<|endoftext|>",
+   "padding_side": "left",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
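
For reference, a minimal sketch of what the chat_template above renders for a short conversation (assuming the tokenizer files in this repo are loaded from the current directory):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('.', trust_remote_code=True)
messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'Hello!'},
]
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
# -> '<|system|>You are a helpful assistant.<|end|><|user|>Hello!<|end|><|assistant|>'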