import json

# normalize the various speaker tags found across datasets
# ('human', 'gpt', 'AI', ...) to the standard chat roles
roles_map = {
    'system': 'system',
    'user': 'user',
    'human': 'user',
    'assistant': 'assistant',
    'gpt': 'assistant',
    'AI': 'assistant',
}

cpt_instruct_datasets = [
    #
    # general instructs
    #
    # 1.48 GB, 1,420,909 rows
    # mlabonne/open-perfectblend
    #   meta-math/MetaMathQA                                  395,000
    #   openbmb/UltraInteract_sft                             288,579
    #   HuggingFaceH4/ultrachat_200k                          207,865
    #   microsoft/orca-math-word-problems-200k                200,035
    #   HuggingFaceH4/ultrafeedback_binarized                 187,405
    #   theblackcat102/evol-codealpaca-v1                     111,272
    #   Post-training-Data-Flywheel/AutoIF-instruct-61k        61,492
    #   mlabonne/lmsys-arena-human-preference-55k-sharegpt     57,362
    # ShareGPT-style conversations: map each 'from'/'value' pair to a chat message
    {'kind': 'instruct', 'path': 'mlabonne/open-perfectblend', 'split': 'train', 'field': 'conversations', 'transform': lambda msgs: [
        {'role': roles_map[m['from']], 'content': m['value']}
        for m in msgs
    ]},

    # 1.41 GB, 939,343 rows
    # allenai/tulu-3-sft-mixture
    #   CoCoNot (ODC-BY-1.0), 10,983 prompts (Brahman et al., 2024)
    #   FLAN v2 via ai2-adapt-dev/flan_v2_converted, 89,982 prompts (Longpre et al., 2023)
    #   No Robots (CC-BY-NC-4.0), 9,500 prompts (Rajani et al., 2023)
    #   OpenAssistant Guanaco (Apache 2.0), 7,132 prompts (Kopf et al., 2024)
    #   Tulu 3 Persona MATH (ODC-BY-1.0), 149,960 prompts
    #   Tulu 3 Persona GSM (ODC-BY-1.0), 49,980 prompts
    #   Tulu 3 Persona Python (ODC-BY-1.0), 34,999 prompts
    #   Tulu 3 Persona Algebra (ODC-BY-1.0), 20,000 prompts
    #   Tulu 3 Persona IF (ODC-BY-1.0), 29,980 prompts
    #   NuminaMath-TIR (Apache 2.0), 64,312 prompts (Beeching et al., 2024)
    #   Tulu 3 WildGuardMix (Apache 2.0), 50,000 prompts (Han et al., 2024)
    #   Tulu 3 WildJailbreak (ODC-BY-1.0), 50,000 prompts (Wildteaming, 2024)
    #   Tulu 3 Hardcoded (CC-BY-4.0), 240 prompts
    #   Aya (Apache 2.0), 100,000 prompts (Singh et al., 2024)
    #   WildChat GPT-4 (ODC-BY-1.0), 100,000 prompts (Zhao et al., 2024)
    #   TableGPT (MIT), 5,000 prompts (Zha et al., 2023)
    #   SciRIFF (ODC-BY-1.0), 10,000 prompts (Wadden et al., 2024)
    #   Evol CodeAlpaca (Apache 2.0), 107,276 prompts (Luo et al., 2023)
    # already in chat-message format under 'messages'; no transform needed
    {'kind': 'instruct', 'path': 'allenai/tulu-3-sft-mixture', 'split': 'train', 'field': 'messages'},

    #
    # multilingual instructs
    #
    # 2.48 GB, 5,808,694 rows
    # rombodawg/Everything_Instruct_Multilingual
    #   Science:
    #     antiven0m/physical-reasoning-dpoScience
    #     LawalAfeez/science-dataset
    #   Social media:
    #     Kyle1668/AG-Tweets
    #     euclaise/reddit-instruct-curated
    #   General Knowledge:
    #     NousResearch/CharacterCodex_Characters
    #     jstet/quotes-500k_Famous_Quotes
    #     FronkonGames/steam-games-dataset_Video_Games
    #     totuta_youtube_subs_howto100M_HowTo
    #   Multi-lingual:
    #     Amani27/massive_translation_dataset
    #     udmurtNLP/udmurt-russian-english-labse
    #     grosenthal/latin_english
    #     msarmi9/korean-english-multitarget-ted-talks-task
    #     HaiderSultanArc/MT-Urdu-English_Translate
    #     Garsa3112/ChineseEnglishTranslationDataset
    #   Cooking:
    #     andrewsiah/se_cooking_preference_sft
    #     Hieu-Phamkaggle/food_recipes
    #   Writing:
    #     shahules786/PoetryFoundationData
    #     euclaise/writingprompts
    #     qwedsacf/ivypanda-essaysEssay
    #   Medicine:
    #     keivalya/MedQuad-MedicalQnADataset
    #     nuvocare/MSD
    #   History:
    #     ambrosfitz10k/history_data_v4
    #   Law:
    #     dzunggg/legal-qa-v1
    #   Role-Play:
    #     roleplay4/fun_CoupleRP
    #     Undi95andrijdavid/roleplay-conversation-sharegpt
    #   News:
    #     RealTimeData/bbc_news_alltime
    #   Coding: (rombodawg/code_bagel)
    #     layoric/tiny-codes-alpaca
    #     glaiveai/glaive-code-assistant-v3
    #     ajibawa-2023/Code-290k-ShareGPT
    #     chargoddard/commitpack-ft-instruct-rated
    #     iamtarun/code_instructions_120k_alpaca
    #     ise-uiuc/Magicoder-Evol-Instruct-110K
    #     cognitivecomputations/dolphin-coder
    #     nickrosh/Evol-Instruct-Code-80k-v1
    #     coseal/CodeUltraFeedback_binarized
    #     CyberNative/Code_Vulnerability_Security_DPO
    #   Math: (rombodawg/code_bagel)
    #     TIGER-Lab/MathInstruct
    #   Function calling: (rombodawg/code_bagel)
    #     glaiveai/glaive-function-calling-v2
    #   General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
    #     teknium/OpenHermes-2.5
    # alpaca-style rows: treat 'instruction' as the system message,
    # 'input' as the user turn, and 'output' as the assistant turn
    {'kind': 'instruct', 'path': 'rombodawg/Everything_Instruct_Multilingual', 'split': 'train', 'transform': lambda r: [
        {'role': 'system', 'content': r['instruction']},
        {'role': 'user', 'content': r['input']},
        {'role': 'assistant', 'content': r['output']},
    ]},
]
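
#
# Example consumer (a minimal sketch, not part of the original config):
# `iter_messages` below is a hypothetical helper showing how one entry of
# `cpt_instruct_datasets` could be resolved into lists of chat messages.
# It assumes the HuggingFace `datasets` library; the helper name and the
# exact loading flow are illustrative, not the project's actual pipeline.
#
from datasets import load_dataset


def iter_messages(dataset_config):
    """Yield lists of {'role', 'content'} messages for one config entry."""
    rows = load_dataset(dataset_config['path'], split=dataset_config['split'])

    for row in rows:
        # when 'field' is set, the conversation lives in that column;
        # otherwise the transform consumes the whole row
        value = row[dataset_config['field']] if 'field' in dataset_config else row

        # sharegpt/alpaca-style entries carry a 'transform' that maps raw
        # records to chat messages; chat-format entries pass through as-is
        transform = dataset_config.get('transform')
        yield transform(value) if transform else value


# e.g. peek at the first conversation of the tulu-3 entry:
#
#     for messages in iter_messages(cpt_instruct_datasets[1]):
#         print(json.dumps(messages, indent=2))
#         break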