import json
# Maps the speaker labels used by source datasets (e.g. the 'from' tag of
# ShareGPT-style messages: 'human', 'gpt') onto the canonical chat roles
# 'system', 'user' and 'assistant'.
# Keys are case-sensitive: 'AI' is mapped, lowercase 'ai' is not, and an
# unknown label raises KeyError on lookup.
roles_map = {
    'system': 'system',
    'user': 'user',
    'human': 'user',
    'assistant': 'assistant',
    'gpt': 'assistant',
    'AI': 'assistant',
}
# Instruct-style dataset specifications.
#
# Each entry is a dict with:
#   'kind'      - always 'instruct' in this list
#   'path'      - dataset identifier (Hugging Face hub style 'owner/name')
#   'split'     - split name to read
#   'field'     - (optional) name of the column holding the conversation
#   'transform' - (optional) callable that returns a list of
#                 {'role': ..., 'content': ...} message dicts
#
# NOTE(review): the consumer of this list is not visible here. Judging by the
# lambdas below, 'transform' presumably receives the value of 'field' when
# 'field' is set and the whole row otherwise - confirm against the loader.
cpt_instruct_datasets = [
    #
    # general instructs
    #
    # 1.48 GB, 1,420,909
    # mlabonne/open-perfectblend
    #   meta-math/MetaMathQA 395,000
    #   openbmb/UltraInteract_sft 288,579
    #   HuggingFaceH4/ultrachat_200k 207,865
    #   microsoft/orca-math-word-problems-200k 200,035
    #   HuggingFaceH4/ultrafeedback_binarized 187,405
    #   theblackcat102/evol-codealpaca-v1 111,272
    #   Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
    #   mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
    {
        'kind': 'instruct',
        'path': 'mlabonne/open-perfectblend',
        'split': 'train',
        'field': 'conversations',
        # Convert {'from': ..., 'value': ...} messages to role/content dicts;
        # raises KeyError if a 'from' label is missing from roles_map.
        'transform': lambda msgs: [
            {'role': roles_map[m['from']], 'content': m['value']}
            for m in msgs
        ],
    },
    # 1.41 GB, 939,343
    # allenai/tulu-3-sft-mixture
    #   CoCoNot (ODC-BY-1.0), 10,983 prompts (Brahman et al., 2024)
    #   FLAN v2 via ai2-adapt-dev/flan_v2_converted, 89,982 prompts (Longpre et al., 2023)
    #   No Robots (CC-BY-NC-4.0), 9,500 prompts (Rajani et al. 2023)
    #   OpenAssistant Guanaco (Apache 2.0), 7,132 prompts (Kopf et al., 2024)
    #   Tulu 3 Persona MATH (ODC-BY-1.0), 149,960 prompts
    #   Tulu 3 Persona GSM (ODC-BY-1.0), 49,980 prompts
    #   Tulu 3 Persona Python (ODC-BY-1.0), 34,999 prompts
    #   Tulu 3 Persona Algebra (ODC-BY-1.0), 20,000 prompts
    #   Tulu 3 Persona IF (ODC-BY-1.0), 29,980 prompts
    #   NuminaMath-TIR (Apache 2.0), 64,312 prompts (Beeching et al. 2024)
    #   Tulu 3 WildGuardMix (Apache 2.0), 50,000 prompts (Han et al., 2024)
    #   Tulu 3 WildJailbreak (ODC-BY-1.0), 50,000 prompts (Wildteaming, 2024)
    #   Tulu 3 Hardcoded (CC-BY-4.0), 240 prompts
    #   Aya (Apache 2.0), 100,000 prompts (Singh et al., 2024)
    #   WildChat GPT-4 (ODC-BY-1.0), 100,000 prompts (Zhao et al., 2024)
    #   TableGPT (MIT), 5,000 prompts (Zha et al., 2023)
    #   SciRIFF (ODC-BY-1.0), 10,000 prompts (Wadden et al., 2024)
    #   Evol CodeAlpaca (Apache 2.0), 107,276 prompts (Luo et al., 2023)
    {
        'kind': 'instruct',
        'path': 'allenai/tulu-3-sft-mixture',
        'split': 'train',
        'field': 'messages',
    },
    #
    # multilingual instructs
    #
    # 2.48 GB, 5,808,694
    # rombodawg/Everything_Instruct_Multilingual
    #   Science:
    #     antiven0m/physical-reasoning-dpoScience
    #     LawalAfeez/science-dataset
    #   Social media:
    #     Kyle1668/AG-Tweets
    #     euclaise/reddit-instruct-curated
    #   General Knowledge:
    #     NousResearch/CharacterCodex_Characters
    #     jstet/quotes-500k_Famous_Quotes
    #     FronkonGames/steam-games-dataset_Video_Games
    #     totuta_youtube_subs_howto100M_HowTo
    #   Multi-lingual:
    #     Amani27/massive_translation_dataset
    #     udmurtNLP/udmurt-russian-english-labse
    #     grosenthal/latin_english
    #     msarmi9/korean-english-multitarget-ted-talks-task
    #     HaiderSultanArc/MT-Urdu-English_Translate
    #     Garsa3112/ChineseEnglishTranslationDataset
    #   Cooking:
    #     andrewsiah/se_cooking_preference_sft
    #     Hieu-Phamkaggle/food_recipes
    #   Writing:
    #     shahules786/PoetryFoundationData
    #     euclaise/writingprompts
    #     qwedsacf/ivypanda-essaysEssay
    #   Medicine:
    #     keivalya/MedQuad-MedicalQnADataset
    #     nuvocare/MSD
    #   History:
    #     ambrosfitz10k/history_data_v4
    #   Law:
    #     dzunggg/legal-qa-v1
    #   Role-Play:
    #     roleplay4/fun_CoupleRP
    #     Undi95andrijdavid/roleplay-conversation-sharegpt
    #   News:
    #     RealTimeData/bbc_news_alltime
    #   Coding: (rombodawg/code_bagel)
    #     layoric/tiny-codes-alpaca
    #     glaiveai/glaive-code-assistant-v3
    #     ajibawa-2023/Code-290k-ShareGPT
    #     chargoddard/commitpack-ft-instruct-rated
    #     iamtarun/code_instructions_120k_alpaca
    #     ise-uiuc/Magicoder-Evol-Instruct-110K
    #     cognitivecomputations/dolphin-coder
    #     nickrosh/Evol-Instruct-Code-80k-v1
    #     coseal/CodeUltraFeedback_binarized
    #     CyberNative/Code_Vulnerability_Security_DPO
    #   Math: (rombodawg/code_bagel)
    #     TIGER-Lab/MathInstruct
    #   Function calling: (rombodawg/code_bagel)
    #     glaiveai/glaive-function-calling-v2
    #   General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
    #     teknium/OpenHermes-2.5
    {
        'kind': 'instruct',
        'path': 'rombodawg/Everything_Instruct_Multilingual',
        'split': 'train',
        # No 'field' here: the callable reads the 'instruction', 'input' and
        # 'output' keys of its argument and emits a fixed
        # system / user / assistant triple.
        'transform': lambda r: [
            {'role': 'system', 'content': r['instruction']},
            {'role': 'user', 'content': r['input']},
            {'role': 'assistant', 'content': r['output']},
        ],
    },
]