{ "cells": [
{ "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch\n", "torch.cuda.is_available()" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "from datasets import load_dataset, Dataset, concatenate_datasets\n", "from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments, GPT2LMHeadModel" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", "    train: Dataset({\n", "        features: ['source', 'problem', 'solution', 'messages'],\n", "        num_rows: 859494\n", "    })\n", "    test: Dataset({\n", "        features: ['source', 'problem', 'solution', 'messages'],\n", "        num_rows: 100\n", "    })\n", "})" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Downloaded data and the files derived from it are kept under ./data,\n", "# relative to the current working directory.\n", "cache_dir = os.path.join(os.getcwd(), \"data\")\n", "\n", "# Load the train/test splits, using cache_dir as the download cache.\n", "dataset = load_dataset(\"AI-MO/NuminaMath-CoT\", cache_dir=cache_dir)\n", "dataset" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Problem: Find all solutions to the equation $\\displaystyle\\sqrt[3]{3 - \\frac{x}{3}} = -2$. 
\n", "Solution: Start by isolating the cube root:\n", "$$ \\sqrt[3]{3 - \\frac{x}{3}} = -2 $$\n", "\n", "Cube both sides to eliminate the cube root:\n", "$$ 3 - \\frac{x}{3} = (-2)^3 $$\n", "$$ 3 - \\frac{x}{3} = -8 $$\n", "\n", "Solve for $x$:\n", "$$ 3 + 8 = \\frac{x}{3} $$\n", "$$ 11 = \\frac{x}{3} $$\n", "$$ x = 33 $$\n", "\n", "Thus, the solution to the equation is:\n", "$$ \\boxed{x = 33} $$\n" ] } ], "source": [ "constanta = 4\n", "\n", "problem = dataset['train']['problem'][constanta]\n", "solution = dataset['train']['solution'][constanta]\n", "\n", "print(f\"Problem: {problem} \\nSolution: {solution}\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def format_text(item):\n", " return f\"Problem: {item['problem']}\\nSolution: {item['solution']}\\n\\n\"\n", "\n", "def prepare_dataset(dataset_dict, file_name, cache_dir):\n", "\n", " file = os.path.join(cache_dir, f\"{file_name}.txt\")\n", "\n", " if os.path.exists(file):\n", " return file\n", " \n", " with open(file, 'w', encoding='utf-8') as f:\n", " for item in dataset_dict:\n", " f.write(format_text(item))\n", " \n", " return file\n", "\n", "train_file = prepare_dataset(\n", " dataset_dict = dataset['train'],\n", " file_name = \"train\",\n", " cache_dir = cache_dir\n", ")\n", "\n", "test_file = prepare_dataset(\n", " dataset_dict = dataset['test'],\n", " file_name = \"test\",\n", " cache_dir = cache_dir\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n", "tokenizer.pad_token = tokenizer.eos_token" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "26997273" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with open(train_file, 'r') as f:\n", " num_lines = sum(1 for line in f)\n", "num_lines" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, 
"outputs": [], "source": [ "def tokenize_function(examples, tokenizer):\n", " item = tokenizer(\n", " examples['text'],\n", " truncation = True,\n", " max_length = 64,\n", " padding = True,\n", " return_tensors = 'pt'\n", " ).to(\"cuda\")\n", " return item\n", "\n", "\n", "def process_chunk(text_chunk, tokenizer):\n", " dataset = Dataset.from_dict({\n", " 'text': text_chunk\n", " }).map(\n", " lambda x: tokenize_function(x, tokenizer),\n", " batched = True\n", " )\n", " return dataset\n", "\n", "\n", "def create_dataset(file, tokenizer, dataset_name, cache_dir, chunk_size = 750_000, max_chunks = 20):\n", " dataset_path = os.path.join(cache_dir, dataset_name)\n", "\n", " if os.path.exists(dataset_path):\n", " return Dataset.load_from_disk(dataset_path)\n", " \n", " datasets = []\n", " chunks_processed = 0\n", " \n", " with open(file, 'r', encoding = 'utf-8') as f:\n", " while chunks_processed < max_chunks:\n", " text_chunk = []\n", " while len(text_chunk) < chunk_size:\n", " line = f.readline()\n", " if not line:\n", " break\n", " text_chunk.append(line)\n", " \n", " if text_chunk:\n", " datasets.append(process_chunk(text_chunk, tokenizer))\n", " chunks_processed += 1\n", " \n", "\n", " the_dataset = concatenate_datasets(datasets)\n", " the_dataset.save_to_disk(dataset_path)\n", " \n", " return the_dataset\n", "\n", "\n", "train_dataset = create_dataset(\n", " file = train_file, \n", " tokenizer = tokenizer,\n", " dataset_name = \"train_dataset\",\n", " cache_dir = cache_dir\n", " )\n", "\n", "test_dataset = create_dataset(\n", " file = test_file, \n", " tokenizer = tokenizer,\n", " dataset_name = \"test_dataset\",\n", " cache_dir = cache_dir,\n", " max_chunks = 1\n", " )" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "data_collator = DataCollatorForLanguageModeling(\n", " tokenizer = tokenizer,\n", " mlm = False\n", " )" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "{'text': 'Problem: Suppose that $g(x) = 5x - 3$. What is $g^{-1}(g^{-1}(14))$?\\n', 'input_ids': [40781, 25, 39200, 326, 720, 70, 7, 87, 8, 796, 642, 87, 532, 513, 35307, 1867, 318, 720, 70, 36796, 12, 16, 92, 7, 70, 36796, 12, 16, 92, 7, 1415, 4008, 3, 30, 198, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}\n" ] } ], "source": [ "print(train_dataset[13])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Problem: Suppose that $g(x) = 5x - 3$. What is $g^{-1}(g^{-1}(14))$?\n", "<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>\n" ] } ], "source": [ "print(tokenizer.decode(train_dataset[13]['input_ids']))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "def compute_metrics(eval_pred):\n", " predictions, labels = eval_pred\n", " \n", " loss = torch.nn.CrossEntropyLoss()(\n", " torch.tensor(predictions).cuda(), \n", " torch.tensor(labels).cuda()\n", " ).item()\n", " \n", " return loss" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "15000000" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(train_dataset)" ] }, 
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [
"# Keep only the first 1/15000 of the examples (presumably for a quick trial run).\n",
"# NOTE: this rebinds train_dataset, so re-running the cell shrinks it again.\n",
"subset_size = len(train_dataset)//15_000\n",
"train_dataset = train_dataset.select(range(subset_size))" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [
"# Output folders for the model and its logs.\n",
"for directory in (\"models\", \"models/math-gpt2-v0\", \"models/math-gpt2-v0/logs\"):\n",
"    os.makedirs(directory, exist_ok = True)" ] },
{ "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [
"import gc\n",
"\n",
"# Release unreferenced Python objects, then PyTorch's cached GPU memory.\n",
"gc.collect()\n",
"torch.cuda.empty_cache()\n",
"\n",
"# Idea: keep the best checkpoint and the ones on either side of it.\n",
"# Question: am I right that with the current settings only one model is saved, while all logs are kept?" ] },
{ "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [
"model = GPT2LMHeadModel.from_pretrained('gpt2')\n",
"model.resize_token_embeddings(len(tokenizer))\n",
"model = model.to(\"cuda\")\n",
"\n",
"training_args = TrainingArguments(\n",
"\n",
"    output_dir = \"models/math-gpt2-v0\", # directory where the model is saved\n",
"    logging_dir = \"models/math-gpt2-v0/logs\", # where the training logs are written, in TensorBoard format\n",
"    run_name = \"math-gpt2-v0\", # experiment name shown in logging systems (e.g. TensorBoard)\n",
"    overwrite_output_dir = False,\n",
"    save_total_limit = 1, # checkpoint limit; with load_best_model_at_end the best checkpoint is kept as well\n",
"    save_strategy = \"steps\",\n",
"\n",
"    do_eval = True, # run validation\n",
"    eval_strategy = \"steps\", # when validation runs: per epoch or per step\n",
"    eval_steps = 10000, # number of steps between validations\n",
"\n",
"    load_best_model_at_end = True,\n",
"    metric_for_best_model = \"cross_entropy\", # metric used to pick the best model\n",
"    greater_is_better = False, # whether the metric should be maximized (False: minimized)\n",
"\n",
"    num_train_epochs = 4,\n",
"    per_device_train_batch_size = 32, # number of training examples per GPU device\n", "
per_device_eval_batch_size = 64, \n", " gradient_checkpointing = True, # экономит память GPU за счет пересчета некоторых промежуточных активаций вместо их хранения\n", " fp16 = True, # использование 16-битных чисел с плавающей точкой вместо стандартных 32-битных (fp32)\n", "\n", " gradient_accumulation_steps = 2, # накапливает градиенты N батчей перед обновлением весов\n", " eval_accumulation_steps = 2, # накапливает результаты N батчей при валидации\n", "\n", " learning_rate = 5e-5, # ОТВЕТ ЗНАТОКА: это базовый lr\n", " warmup_steps = 500, # Количество шагов для прогрева (warmup) learning rate. Прогрев в этом случае - обновление learning rate.\n", " weight_decay = 0.01, # ниже тоже базовые, знаю\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.999,\n", " max_grad_norm = 1.0,\n", " \n", " save_steps = 10000, # частота сохранения модели\n", " logging_steps = 10000, # частота логирования метрик\n", " push_to_hub = False, # загрузка модели на HuggingFace\n", ")\n", "\n", "trainer = Trainer(\n", " model = model,\n", " args = training_args,\n", " data_collator = data_collator,\n", " train_dataset = train_dataset,\n", " eval_dataset = test_dataset,\n", " compute_metrics = compute_metrics\n", ")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c4125cf1faae456796edb8f452529acb", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/64 [00:00