In [1]:
import torch
# Check that PyTorch can see a CUDA device; a later cell moves the model to "cuda".
torch.cuda.is_available()

True

In [2]:
import os
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments, GPT2LMHeadModel

In [3]:
# Keep the dataset cache in ./data under the current working directory.
cache_dir = os.path.join(os.getcwd(), "data")

# Load all splits of NuminaMath-CoT, using cache_dir as the download cache.
dataset = load_dataset(
    "AI-MO/NuminaMath-CoT",
    cache_dir=cache_dir,
)
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'problem', 'solution', 'messages'],
        num_rows: 859494
    })
    test: Dataset({
        features: ['source', 'problem', 'solution', 'messages'],
        num_rows: 100
    })
})

In [4]:
# Index of the training example to preview.
constanta = 4

# Fetch the single row first and then read its fields. Indexing the column first
# (dataset['train']['problem'][i]) goes through the whole 859k-row column just to
# display one value.
sample = dataset['train'][constanta]
problem = sample['problem']
solution = sample['solution']

print(f"Problem: {problem} \nSolution: {solution}")

Problem: Find all solutions to the equation $\displaystyle\sqrt[3]{3 - \frac{x}{3}} = -2$. 
Solution: Start by isolating the cube root:
$$ \sqrt[3]{3 - \frac{x}{3}} = -2 $$

Cube both sides to eliminate the cube root:
$$ 3 - \frac{x}{3} = (-2)^3 $$
$$ 3 - \frac{x}{3} = -8 $$

Solve for $x$:
$$ 3 + 8 = \frac{x}{3} $$
$$ 11 = \frac{x}{3} $$
$$ x = 33 $$

Thus, the solution to the equation is:
$$ \boxed{x = 33} $$


In [5]:
def format_text(item):
    """Render one record as a plain-text "Problem: ... / Solution: ..." block.

    `item` must support lookup of the 'problem' and 'solution' keys. The
    returned string ends with an empty line.
    """
    problem_text = item['problem']
    solution_text = item['solution']
    return "Problem: {}\nSolution: {}\n\n".format(problem_text, solution_text)

def prepare_dataset(dataset_dict, file_name, cache_dir):
    """Dump a dataset split to `<cache_dir>/<file_name>.txt` and return that path.

    Each item of `dataset_dict` is rendered with `format_text` and appended to
    the file. If the target file already exists it is reused as-is and nothing
    is written.

    Args:
        dataset_dict: Iterable of records accepted by `format_text`.
        file_name: Base name of the output file (without the ".txt" suffix).
        cache_dir: Directory that holds the output file; created when missing.

    Returns:
        Path of the text file.
    """
    file = os.path.join(cache_dir, f"{file_name}.txt")

    if os.path.exists(file):
        return file

    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # Write to a temporary sibling and rename it into place only once the dump
    # is complete. Otherwise an interrupted run would leave a truncated file
    # that the exists() check above would treat as a finished cache forever.
    partial_file = f"{file}.partial"
    try:
        with open(partial_file, 'w', encoding='utf-8') as f:
            f.writelines(format_text(item) for item in dataset_dict)
        os.replace(partial_file, file)
    finally:
        # Only still present when the dump failed before the rename.
        if os.path.exists(partial_file):
            os.remove(partial_file)

    return file

train_file = prepare_dataset(
    dataset_dict = dataset['train'],
    file_name = "train",
    cache_dir = cache_dir
)

test_file = prepare_dataset(
    dataset_dict = dataset['test'],
    file_name = "test",
    cache_dir = cache_dir
)

In [6]:
# Load the pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Use the end-of-sequence token for padding so the padded tokenization used later has a pad token.
tokenizer.pad_token = tokenizer.eos_token

In [7]:
# Count the lines of the training text file. Read it as UTF-8, the encoding it
# was written with, rather than the platform's default encoding.
with open(train_file, 'r', encoding='utf-8') as f:
    num_lines = sum(1 for _ in f)
num_lines

26997273

In [8]:
def tokenize_function(examples, tokenizer):
    """Tokenize a batch of text rows; meant to be used with `Dataset.map(batched=True)`.

    Args:
        examples: Batch mapping whose 'text' entry is a list of strings.
        tokenizer: Callable tokenizer; it receives the texts plus the
            truncation/padding options below.

    Returns:
        The tokenizer's output for the batch. Sequences are truncated to 64
        tokens and padded to the longest sequence in the batch.
    """
    # No `return_tensors='pt'` / `.to("cuda")` here: the result is only stored
    # in the dataset, so building tensors and copying them to the GPU is wasted
    # work and makes preprocessing fail on machines without CUDA.
    return tokenizer(
        examples['text'],
        truncation = True,
        max_length = 64,
        padding = True,
    )


def process_chunk(text_chunk, tokenizer):
    """Turn a list of text lines into a tokenized `Dataset`.

    The lines become the 'text' column of an in-memory dataset, which is then
    tokenized batch by batch with `tokenize_function`.
    """
    def _tokenize_batch(batch):
        return tokenize_function(batch, tokenizer)

    raw_dataset = Dataset.from_dict({'text': text_chunk})
    tokenized_dataset = raw_dataset.map(_tokenize_batch, batched = True)
    return tokenized_dataset


def create_dataset(file, tokenizer, dataset_name, cache_dir, chunk_size = 750_000, max_chunks = 20):
    """Build a tokenized dataset from a text file, or load it if already saved.

    The file is read line by line in chunks of `chunk_size` lines; each chunk
    is tokenized with `process_chunk` and the chunks are concatenated and saved
    under `<cache_dir>/<dataset_name>`. At most `max_chunks` chunks are read,
    so at most `chunk_size * max_chunks` lines end up in the dataset.

    NOTE(review): every *line* becomes one example. Records written by
    `format_text` span several lines (problem, solution, blank separator), so
    they are split into separate examples here - confirm this is intended.

    Args:
        file: Path of the UTF-8 text file to read.
        tokenizer: Tokenizer forwarded to `process_chunk`.
        dataset_name: Directory name of the saved dataset inside `cache_dir`.
        cache_dir: Directory that holds the saved dataset.
        chunk_size: Number of lines per chunk; must be positive.
        max_chunks: Maximum number of chunks to read.

    Returns:
        The tokenized dataset.

    Raises:
        ValueError: If `chunk_size` is not positive or no line could be read.
    """
    dataset_path = os.path.join(cache_dir, dataset_name)

    if os.path.exists(dataset_path):
        return Dataset.load_from_disk(dataset_path)

    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    datasets = []

    with open(file, 'r', encoding = 'utf-8') as f:
        for _ in range(max_chunks):
            text_chunk = []
            for line in f:
                text_chunk.append(line)
                if len(text_chunk) >= chunk_size:
                    break

            # End of file. The previous version kept looping here forever,
            # because the chunk counter only advanced on non-empty chunks.
            if not text_chunk:
                break

            datasets.append(process_chunk(text_chunk, tokenizer))

    if not datasets:
        raise ValueError(f"No lines could be read from {file!r}; nothing to tokenize")

    the_dataset = concatenate_datasets(datasets)
    the_dataset.save_to_disk(dataset_path)

    return the_dataset


# Tokenize the training text with create_dataset's default chunk limits.
train_dataset = create_dataset(
    file=train_file,
    tokenizer=tokenizer,
    dataset_name="train_dataset",
    cache_dir=cache_dir,
)

# For the test text only a single chunk is read.
test_dataset = create_dataset(
    file=test_file,
    tokenizer=tokenizer,
    dataset_name="test_dataset",
    cache_dir=cache_dir,
    max_chunks=1,
)

In [9]:
# Batch collator for language modelling; mlm=False disables masked-LM masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [10]:
# Inspect one tokenized row: its text, input_ids and attention_mask.
print(train_dataset[13])

{'text': 'Problem: Suppose that $g(x) = 5x - 3$. What is $g^{-1}(g^{-1}(14))$?\n', 'input_ids': [40781, 25, 39200, 326, 720, 70, 7, 87, 8, 796, 642, 87, 532, 513, 35307, 1867, 318, 720, 70, 36796, 12, 16, 92, 7, 70, 36796, 12, 16, 92, 7, 1415, 4008, 3, 30, 198, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [11]:
# Decode the same row's token ids back to text (padding tokens included) as a sanity check.
print(tokenizer.decode(train_dataset[13]['input_ids']))

Problem: Suppose that $g(x) = 5x - 3$. What is $g^{-1}(g^{-1}(14))$?
<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [12]:
def compute_metrics(eval_pred):
    """Compute the next-token cross-entropy for a `Trainer` evaluation pass.

    Args:
        eval_pred: Pair `(predictions, labels)`. Assumes predictions are
            logits shaped (batch, seq_len, vocab) and labels are token ids
            shaped (batch, seq_len), with -100 marking positions to skip -
            TODO confirm against the collator in use.

    Returns:
        Dict `{"cross_entropy": float}`. `Trainer` needs a dict of named
        metrics; a bare float cannot be logged or used for model selection.
    """
    predictions, labels = eval_pred

    # When the model returns several outputs, the logits are the first one.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Stay on the CPU: evaluation outputs arrive as host arrays and the full
    # logits may not fit in GPU memory next to the model.
    logits = torch.as_tensor(predictions, dtype=torch.float32)
    targets = torch.as_tensor(labels, dtype=torch.long)

    # Causal LM: the logits at position t score the token at position t + 1,
    # so drop the last logit and the first label, then flatten to the
    # (N, vocab) / (N,) layout cross_entropy expects.
    vocab_size = logits.shape[-1]
    shifted_logits = logits[..., :-1, :].reshape(-1, vocab_size)
    shifted_targets = targets[..., 1:].reshape(-1)

    loss = torch.nn.functional.cross_entropy(
        shifted_logits,
        shifted_targets,
        ignore_index=-100,
    ).item()

    return {"cross_entropy": loss}

In [13]:
# Number of tokenized training rows.
len(train_dataset)

15000000

In [14]:
# Keep only a small prefix of the tokenized training set for a quick trial run.
# The size is a fixed constant (15_000_000 // 15_000 == 1_000, the original ratio),
# so re-running this cell is idempotent instead of shrinking the dataset again.
TRAIN_SUBSET_SIZE = 1_000
train_dataset = train_dataset.select(range(min(TRAIN_SUBSET_SIZE, len(train_dataset))))

In [21]:
# os.makedirs also creates missing parent directories, so asking for the deepest
# path yields models/, models/math-gpt2-v0/ and models/math-gpt2-v0/logs/.
os.makedirs("models/math-gpt2-v0/logs", exist_ok = True)

In [23]:
import gc

# Collect unreachable Python objects first, then release PyTorch's cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

# Idea: keep the best checkpoint plus the ones on either side of it.
# Question: do I understand correctly that with the current settings only one model will be saved, while all the logs are kept?

In [24]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))
# The generation key/value cache is useless during training and is incompatible
# with gradient checkpointing (enabled below), so switch it off explicitly.
model.config.use_cache = False
model = model.to("cuda")

training_args = TrainingArguments(

    output_dir = "models/math-gpt2-v0",             # directory where checkpoints are saved
    logging_dir = "models/math-gpt2-v0/logs",       # where training logs are written in TensorBoard format
    run_name = "math-gpt2-v0",                      # experiment name shown in logging systems (e.g. TensorBoard)
    overwrite_output_dir = False,
    save_total_limit = 1,                           # keep at most one checkpoint; with load_best_model_at_end the best
                                                    # one is always retained, and the latest may be kept next to it
    save_strategy = "steps",

    do_eval = True,                     # run validation
    eval_strategy = "steps",            # when validation runs: per epoch or every N steps
    eval_steps = 10000,                 # number of steps between validations

    load_best_model_at_end = True,
    # Rank checkpoints by the evaluation loss the Trainer computes itself (the
    # language-modelling cross-entropy). A custom compute_metrics would need all
    # evaluation logits (rows x sequence length x vocabulary) held in memory.
    metric_for_best_model = "eval_loss",
    greater_is_better = False,          # the metric is minimized

    num_train_epochs = 4,
    per_device_train_batch_size = 32,   # training examples per GPU per step
    per_device_eval_batch_size = 64,
    gradient_checkpointing = True,      # saves GPU memory by recomputing some activations instead of storing them
    fp16 = True,                        # 16-bit floating point instead of the standard 32-bit (fp32)

    gradient_accumulation_steps = 2,    # accumulate gradients over N batches before each weight update
    eval_accumulation_steps = 2,        # accumulate outputs of N batches during validation

    learning_rate = 5e-5,               # base learning rate
    warmup_steps = 500,                 # number of steps over which the learning rate is warmed up
    weight_decay = 0.01,                # the values below are the usual defaults as well
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    max_grad_norm = 1.0,

    # NOTE: the trial run on the reduced training set finishes after 64 optimizer
    # steps, so with these 10_000-step intervals no evaluation, checkpoint or log
    # entry is produced (and warmup never completes). Lower them for short runs.
    save_steps = 10000,                 # checkpoint frequency
    logging_steps = 10000,              # metric logging frequency
    push_to_hub = False,                # do not upload the model to the Hugging Face Hub
)

trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
)

In [25]:
# Run fine-tuning with the trainer configured above; the returned TrainOutput reports the final step count and training loss.
trainer.train()

  0%|          | 0/64 [00:00<?, ?it/s]

{'train_runtime': 27.7282, 'train_samples_per_second': 144.257, 'train_steps_per_second': 2.308, 'train_loss': 6.928213119506836, 'epoch': 4.0}


TrainOutput(global_step=64, training_loss=6.928213119506836, metrics={'train_runtime': 27.7282, 'train_samples_per_second': 144.257, 'train_steps_per_second': 2.308, 'total_flos': 130646016000000.0, 'train_loss': 6.928213119506836, 'epoch': 4.0})