import os
import numpy as np
import torch
from model import Transformer, TransformerConfig
from data_collector import DataCollector
from torch.utils.data import DataLoader
from datasets import load_from_disk, load_dataset, DatasetDict, concatenate_datasets
from transformers import get_scheduler
from tokenizer import CustomTokenizer
from tqdm import tqdm
from accelerate import Accelerator
import wandb
from huggingface_hub import HfApi, create_repo
import shutil
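
# Disabling tokenizers parallelism silences the "fork after parallelism" warning the Hugging Face
# tokenizers library emits when DataLoader worker processes are forked.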
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# MODEL CONFIG
src_vocab_size: int = 32000
tgt_vocab_size: int = 32000
max_seq_length: int = 512
d_model: int = 512
num_heads: int = 8
num_encoder_layers: int = 6
num_decoder_layers: int = 6
dropout_p: float = 0.1
dff: int = 2048

config = TransformerConfig(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    max_seq_length=max_seq_length,
    d_model=d_model,
    num_heads=num_heads,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    dropout_p=dropout_p,
    dff=dff
)
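# Note: these settings (d_model=512, 8 heads, 6 encoder / 6 decoder layers, dff=2048, dropout 0.1)
# match the base model configuration from "Attention Is All You Need".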

# TOKENIZER CONFIG
src_tokenizer_path = "trained_tokenizers/vocab_en.json"
tgt_tokenizer_path = "trained_tokenizers/vocab_fr.json"
src_tokenizer = CustomTokenizer(path_to_vocab=src_tokenizer_path, max_length=config.max_seq_length)
tgt_tokenizer = CustomTokenizer(path_to_vocab=tgt_tokenizer_path, max_length=config.max_seq_length)

# DATALOADER CONFIG
path_to_data = "data/tokenized_dataset"
batch_size = 64
gradient_accumulation_steps = 2
# num_workers = 4

# TRAINING CONFIG
learning_rate = 1e-4
training_steps = 170000
warmup_steps = 2000
scheduler_type = "cosine"
evaluation_steps = 5000
bias_norm_weight_decay = False
weight_decay = 0.001
betas = (0.9, 0.98)
adam_eps = 1e-6
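# Note: betas=(0.9, 0.98) matches the Adam settings of the original Transformer training recipe;
# eps=1e-6 is larger than the paper's 1e-9.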

# LOGGING CONFIG
working_directory = "work_dir"
experiment_name = "Seq2Seq_Neural_Machine_Translation"
logging_interval = 1

# RESUME FROM CHECKPOINT
resume_from_checkpoint = "checkpoint_170000"

# Prepare Accelerator
path_to_experiment = os.path.join(working_directory, experiment_name)
accelerator = Accelerator(project_dir=path_to_experiment,
                          log_with="wandb")
accelerator.init_trackers(experiment_name)

# Configure model device
config.device = accelerator.device

# Prepare Dataloaders
dataset = load_dataset("ngia/tokenized-translation-en-fr")
accelerator.print("Dataset:", dataset)

min_batch_size = batch_size // gradient_accumulation_steps
train_dataset = DataCollector(dataset=dataset["train"], english_tokenizer=src_tokenizer, french_tokenizer=tgt_tokenizer, max_length=config.max_seq_length)
test_dataset = DataCollector(dataset=dataset["test"], english_tokenizer=src_tokenizer, french_tokenizer=tgt_tokenizer, max_length=config.max_seq_length)
train_loader = DataLoader(dataset=train_dataset, batch_size=min_batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=min_batch_size, shuffle=False)
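# Each loader yields micro-batches of batch_size // gradient_accumulation_steps samples, so one
# optimizer step (which accumulates gradient_accumulation_steps micro-batches below) still
# corresponds to batch_size samples per process.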

# Prepare model
model = Transformer(config=config)
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
accelerator.print("Number of trainable parameters:", params)

# Prepare Optimizer
# Note: bias_norm_weight_decay is defined above but not applied here; all parameters currently
# share the same weight decay.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=learning_rate,
                              betas=betas,
                              eps=adam_eps,
                              weight_decay=weight_decay)

# Define scheduler
scheduler = get_scheduler(
    name=scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=training_steps
)

# Define Loss Function
loss_fn = torch.nn.CrossEntropyLoss()
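# Note: CrossEntropyLoss uses ignore_index=-100 by default, so label positions set to -100
# (the same positions masked out of the accuracy computation below) do not contribute to the loss.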

### Define a Sample Sentence for Testing ###
src_ids = torch.tensor(src_tokenizer.encode("I want to learn how to train a machine translation model")).unsqueeze(0)

model, optimizer, trainloader, testloader, scheduler = accelerator.prepare(
    model, optimizer, train_loader, test_loader, scheduler
)
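# accelerator.prepare() moves the model and optimizer to the target device and wraps the
# dataloaders and scheduler for whatever single-GPU, multi-GPU, or mixed-precision setup
# Accelerate was launched with.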
accelerator.register_for_checkpointing(scheduler)

if resume_from_checkpoint is not None:
    path_to_checkpoint = os.path.join(path_to_experiment, resume_from_checkpoint)
    with accelerator.main_process_first():
        accelerator.load_state(path_to_checkpoint)
    completed_steps = int(resume_from_checkpoint.split("_")[-1])
    accelerator.print(f"Resuming from Iteration: {completed_steps}")
else:
    completed_steps = 0
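# Resuming assumes checkpoint folders follow the "checkpoint_<step>" naming convention used by
# accelerator.save_state() below, so the completed step count can be parsed from the folder name.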

def push_model_HF(repo_id, path_to_experiment, step):
    """Push the experiment folder (checkpoints and tokenizers) to the Hugging Face Hub."""
    api = HfApi()
    create_repo(repo_id, exist_ok=True)
    api.upload_folder(
        folder_path=path_to_experiment,
        repo_id=repo_id,
        repo_type="model"  # or "dataset" if it's a dataset
    )
    print(f"Checkpoint {step} pushed to {repo_id}")

# Copy tokenizers into the experiment folder so they are pushed alongside the checkpoints
shutil.copy2("trained_tokenizers/vocab_en.json", f"{path_to_experiment}/vocab_en.json")
shutil.copy2("trained_tokenizers/vocab_fr.json", f"{path_to_experiment}/vocab_fr.json")

# Push the current state to the Hugging Face Hub
push_model_HF(repo_id="ngia/ml-translation-en-fr", path_to_experiment=path_to_experiment, step=completed_steps)
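# Note: create_repo() / upload_folder() require the environment to be authenticated with the
# Hugging Face Hub (e.g. via `huggingface-cli login` or an HF_TOKEN environment variable).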

train = True
progress_bar = tqdm(range(completed_steps, training_steps), disable=not accelerator.is_local_main_process)
save_dir = ""

while train:
    accumulate_steps = 0
    accumulate_loss = 0
    accuracy = 0
    for batch in trainloader:
        src_input_ids = batch["src_input_ids"].to(accelerator.device)
        src_pad_mask = batch["src_pad_mask"].to(accelerator.device)
        tgt_input_ids = batch["tgt_input_ids"].to(accelerator.device)
        tgt_pad_mask = batch["tgt_pad_mask"].to(accelerator.device)
        tgt_labels = batch["tgt_labels"].to(accelerator.device)

        model_output = model(
            src_input_ids,
            tgt_input_ids,
            src_pad_mask,
            tgt_pad_mask
        )
        model_output = model_output.flatten(0, 1)
        tgt_labels = tgt_labels.flatten()
        loss = loss_fn(model_output, tgt_labels)

        ### Scale Loss and Accumulate ###
        loss = loss / gradient_accumulation_steps
        accumulate_loss += loss

        ### Compute Gradients ###
        accelerator.backward(loss)
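        # Dividing each micro-batch loss by gradient_accumulation_steps means the gradients that
        # backward() sums across micro-batches average out to one full-batch update below.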

        ### Compute Accuracy (ignoring -100 padding labels) ###
        model_output = model_output.argmax(axis=-1)
        mask = (tgt_labels != -100)
        output = model_output[mask]
        tgt_outputs = tgt_labels[mask]
        acc = (output == tgt_outputs).sum() / len(output)
        accuracy += acc / gradient_accumulation_steps

        ### Iterate Accumulation ###
        accumulate_steps += 1

        if accumulate_steps % gradient_accumulation_steps == 0:
            ### Clip and Update Model ###
            accelerator.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

            ### Log Results ###
            if completed_steps % logging_interval == 0:
                accumulate_loss = accumulate_loss.detach()
                accuracy = accuracy.detach()
                if accelerator.num_processes > 1:
                    accumulate_loss = torch.mean(accelerator.gather_for_metrics(accumulate_loss))
                    accuracy = torch.mean(accelerator.gather_for_metrics(accuracy))
                log = {"train_loss": accumulate_loss,
                       "training_acc": accuracy,
                       "learning_rate": scheduler.get_last_lr()[0]}
                accelerator.log(log, step=completed_steps)
                logging_string = f"[{completed_steps}/{training_steps}] Training Loss: {accumulate_loss} | Training Acc: {accuracy}"
                if accelerator.is_main_process:
                    progress_bar.write(logging_string)

            if completed_steps % evaluation_steps == 0:
                model.eval()
                accelerator.print("Evaluating!")
                test_losses = []
                test_accs = []
                for batch in tqdm(testloader, disable=not accelerator.is_main_process):
                    src_input_ids = batch["src_input_ids"].to(accelerator.device)
                    src_pad_mask = batch["src_pad_mask"].to(accelerator.device)
                    tgt_input_ids = batch["tgt_input_ids"].to(accelerator.device)
                    tgt_pad_mask = batch["tgt_pad_mask"].to(accelerator.device)
                    tgt_labels = batch["tgt_labels"].to(accelerator.device)

                    with torch.inference_mode():
                        model_output = model(src_input_ids,
                                             tgt_input_ids,
                                             src_pad_mask,
                                             tgt_pad_mask)

                    ### Flatten for Loss ###
                    model_output = model_output.flatten(0, 1)
                    tgt_labels = tgt_labels.flatten()

                    ### Compute Loss ###
                    loss = loss_fn(model_output, tgt_labels)

                    ### Compute Accuracy (make sure to ignore -100 targets) ###
                    model_output = model_output.argmax(axis=-1)
                    mask = (tgt_labels != -100)
                    model_output = model_output[mask]
                    tgt_labels = tgt_labels[mask]
                    accuracy = (model_output == tgt_labels).sum() / len(model_output)

                    ### Store Results ###
                    loss = loss.detach()
                    accuracy = accuracy.detach()
                    if accelerator.num_processes > 1:
                        loss = torch.mean(accelerator.gather_for_metrics(loss))
                        accuracy = torch.mean(accelerator.gather_for_metrics(accuracy))

                    ### Store Metrics ###
                    test_losses.append(loss.item())
                    test_accs.append(accuracy.item())

                test_loss = np.mean(test_losses)
                test_acc = np.mean(test_accs)
                log = {"test_loss": test_loss,
                       "test_acc": test_acc}
                logging_string = f"Testing Loss: {test_loss} | Testing Acc: {test_acc}"
                if accelerator.is_main_process:
                    progress_bar.write(logging_string)

                ### Log and Save Model ###
                accelerator.log(log, step=completed_steps)
                accelerator.save_state(os.path.join(path_to_experiment, f"checkpoint_{completed_steps}"))
                push_model_HF(repo_id="ngia/ml-translation-en-fr", path_to_experiment=path_to_experiment, step=completed_steps)

                ### Testing Sentence ###
                if accelerator.is_main_process:
                    src_ids = src_ids.to(accelerator.device)
                    unwrapped = accelerator.unwrap_model(model)
                    translated = unwrapped.inference(src_ids,
                                                     tgt_start_id=tgt_tokenizer.bos_token_id,
                                                     tgt_end_id=tgt_tokenizer.eos_token_id,
                                                     max_seq_length=config.max_seq_length)
                    translated = tgt_tokenizer.decode(translated, skip_special_tokens=False)
                    progress_bar.write(f"Translation: {translated}")

                model.train()

            if completed_steps >= training_steps:
                train = False
                accelerator.save_state(os.path.join(path_to_experiment, "final_checkpoint"))
                push_model_HF(repo_id="ngia/ml-translation-en-fr", path_to_experiment=path_to_experiment, step=completed_steps)
                break

            ### Iterate Completed Steps ###
            completed_steps += 1
            progress_bar.update(1)

            ### Reset Accumulated Variables ###
            accumulate_loss = 0
            accuracy = 0

accelerator.end_training()