3outeille (HF staff) committed on
Commit 45adef9 · verified · 1 Parent(s): 5b3fd24

Upload llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512
llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/bench.slurm ADDED
@@ -0,0 +1,111 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=bench_cluster
+ #SBATCH --time=02:00:00
+ #SBATCH --partition=hopper-prod
+ #SBATCH --nodes=1
+ #SBATCH --gres=gpu:8
+ #SBATCH --qos=normal
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=96
+ #SBATCH --exclusive
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/log.out
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/log.out
+
+ # Function to update status based on squeue output
+ update_status() {
+     job_id=$1
+     status_file=$2
+     # For unknown reasons, it doesn't update the status for pending jobs; it only works for running ones.
+     while true; do
+         job_status=$(squeue --job $job_id --noheader --format=%T)
+         echo "Job status: $job_status"
+         if [ -z "$job_status" ]; then
+             # Job has finished or is not found
+             break
+         elif [ "$job_status" = "RUNNING" ]; then
+             printf "running" > $status_file
+             break
+         fi
+         sleep 10
+     done
+ }
+
+ # Misc initializations.
+ echo "========================"
+ echo "START TIME: $(date)"
+ source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
+ conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
+ echo python3 version = $(python3 --version)
+ echo "========================"
+
+ # Slurm stuff
+ export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+ export MASTER_PORT=$((1024 + RANDOM % 64511))
+
+ export TMPDIR=/scratch
+ export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
+ export CUBLAS_WORKSPACE_CONFIG=":4096:8"
+ export CUDA_DEVICE_MAX_CONNECTIONS="1"
+
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
+
+
+ NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
+ CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/config.yaml"
+
+ LAUNCHER="torchrun \
+     --nproc_per_node 8 \
+     --nnodes 1 \
+     --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
+     --rdzv_backend c10d \
+     --max_restarts 0 \
+     --tee 3 \
+     --node_rank ${SLURM_PROCID}"
+
+ # Checkout the bench_cluster branch
+ cd $NANOTRON_REPO
+ git checkout bench_cluster
+ cd ..
+ # Get the current job ID
+ job_id=${SLURM_JOB_ID}
+
+ # Update status to "pending" or "running" in the background
+ update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/status.txt &
+
+ # Run the main command
+ srun -u $LAUNCHER $CMD
+ exit_status=$?
+
+ # Update status based on the exit status of `srun`
+ if [ $exit_status -eq 0 ]; then
+     printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/status.txt
+ else
+     if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/log.out; then
+         printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/status.txt
+     elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/log.out; then
+         printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/status.txt
+     elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/log.out; then
+         printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/status.txt
+     else
+         printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/status.txt
+     fi
+ fi
+
+ # Run the report script if the job completed successfully
+ if [ $exit_status -eq 0 ]; then
+     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512 --is_logs
+     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512 --is_profiler
+ fi
+
+
+ # Push to hub the folder using huggingface_cli
+ huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512 llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512 --commit-message "Upload llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512"
+
+ # Verify the upload
+ if [ $? -eq 0 ]; then
+     echo "Uploading to Huggingface Hub successful"
+ else
+     echo "Failed to upload to Huggingface Hub"
+ fi
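
The script above encodes a simple one-word status protocol: `update_status` polls `squeue` in the background and writes `running` to `status.txt`, and once `srun` returns, the exit code plus a grep over `log.out` decides between `completed`, `oom`, `timeout`, and `fail` (an illegal-memory-access CUDA error is lumped in with `oom`). A minimal Python sketch mirroring that classification logic; `classify_run` is a hypothetical helper name, not part of the repo:

```python
# Sketch of the status classification implemented by the bash block above.
# `classify_run` is hypothetical; it mirrors the script's grep/exit-code logic.
def classify_run(exit_status: int, log_text: str) -> str:
    if exit_status == 0:
        return "completed"
    if "OutOfMemoryError" in log_text or "CUDA error: an illegal memory access" in log_text:
        return "oom"  # illegal memory access is treated as OOM, as in the script
    if "Timeout at NCCL" in log_text:
        return "timeout"
    return "fail"

# Example: the log below contains torch.cuda.OutOfMemoryError, so a
# non-zero exit maps to "oom".
assert classify_run(1, "torch.cuda.OutOfMemoryError: CUDA out of memory.") == "oom"
```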
llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/config.yaml ADDED
@@ -0,0 +1,90 @@
+ general:
+   project: bench_cluster
+   seed: 42
+ model:
+   ddp_bucket_cap_mb: 25
+   dtype: bfloat16
+   init_method:
+     std: 0.025
+   make_vocab_size_divisible_by: 1
+   model_config:
+     bos_token_id: 1
+     eos_token_id: 2
+     hidden_act: silu
+     hidden_size: 2048
+     initializer_range: 0.02
+     intermediate_size: 4096
+     is_llama_config: true
+     max_position_embeddings: 4096
+     num_attention_heads: 32
+     num_hidden_layers: 24
+     num_key_value_heads: 32
+     pad_token_id: null
+     pretraining_tp: 1
+     rms_norm_eps: 1.0e-05
+     rope_scaling: null
+     rope_theta: 10000.0
+     tie_word_embeddings: true
+     use_cache: true
+     vocab_size: 50257
+ optimizer:
+   accumulate_grad_in_fp32: true
+   clip_grad: 1.0
+   learning_rate_scheduler:
+     learning_rate: 0.0001
+     lr_decay_style: linear
+     lr_warmup_style: linear
+     lr_warmup_steps: 1
+     min_decay_lr: 1.0e-05
+   optimizer_factory:
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     adam_eps: 1.0e-08
+     name: adamW
+     torch_adam_is_fused: true
+   weight_decay: 0.01
+   zero_stage: 1
+ parallelism:
+   dp: 2
+   expert_parallel_size: 1
+   pp: 1
+   pp_engine: 1f1b
+   tp: 4
+   tp_linear_async_communication: false
+   tp_mode: REDUCE_SCATTER
+ profiler:
+   profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512
+ tokenizer:
+   tokenizer_max_length: null
+   tokenizer_name_or_path: openai-community/gpt2
+   tokenizer_revision: null
+ data_stages:
+ - name: Training Stage
+   start_training_step: 1
+   data:
+     dataset:
+       dataset_overwrite_cache: false
+       dataset_processing_num_proc_per_process: 64
+       hf_dataset_config_name: null
+       hf_dataset_or_datasets: roneneldan/TinyStories
+       hf_dataset_splits: train
+       text_column_name: text
+     num_loading_workers: 0
+     seed: 42
+ lighteval: null
+ tokens:
+   train_steps: 20
+   val_check_interval: -1
+   batch_accumulation_per_replica: 1
+   limit_test_batches: 0
+   limit_val_batches: 0
+   micro_batch_size: 512
+   sequence_length: 4096
+ logging:
+   iteration_step_info_interval: 1
+   log_level: info
+   log_level_replica: info
+ checkpoints:
+   checkpoint_interval: 100000
+   checkpoints_path: /dev/null
+   resume_checkpoint_path: null
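
The effective batch size follows directly from this config: global_batch_size = dp × micro_batch_size × batch_accumulation_per_replica = 2 × 512 × 1 = 1024 sequences per optimizer step, i.e. roughly 4.2M tokens at sequence_length 4096, which matches the `global_batch_size: 1024` reported in the log below. A quick check of the arithmetic:

```python
# Derived from config.yaml above; matches "global_batch_size: 1024" in the log below.
dp, micro_batch_size, grad_accum = 2, 512, 1
sequence_length = 4096

global_batch_size = dp * micro_batch_size * grad_accum  # sequences per optimizer step
tokens_per_step = global_batch_size * sequence_length   # tokens per optimizer step

print(global_batch_size)  # 1024
print(tokens_per_step)    # 4194304 (~4.2M tokens/step)
```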
llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/log.out ADDED
@@ -0,0 +1,642 @@
1
+ ========================
2
+ START TIME: Wed Jul 3 23:02:27 UTC 2024
3
+ python3 version = Python 3.10.14
4
+ ========================
5
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
6
+ Token is valid (permission: write).
7
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
8
+ Login successful
9
+ Already on 'bench_cluster'
10
+ M examples/config_tiny_llama.py
11
+ M examples/config_tiny_llama.yaml
12
+ M examples/train_tiny_llama.sh
13
+ M src/nanotron/models/llama.py
14
+ M src/nanotron/trainer.py
15
+ Your branch is up to date with 'origin/bench_cluster'.
16
+ Job status: RUNNING
17
+ W0703 23:02:30.165000 140703814809408 torch/distributed/run.py:757]
18
+ W0703 23:02:30.165000 140703814809408 torch/distributed/run.py:757] *****************************************
19
+ W0703 23:02:30.165000 140703814809408 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0703 23:02:30.165000 140703814809408 torch/distributed/run.py:757] *****************************************
21
+ [default0]:07/03/2024 23:02:46 [WARNING|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Vocab Size Padding] Padded vocab (size: 50257) with 3 dummy tokens (new size: 50260)
22
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Config:
23
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Config(general=GeneralArgs(project='bench_cluster',
24
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: run='%date_%jobid',
25
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: seed=42,
26
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: step=None,
27
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: consumed_train_samples=None,
28
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: benchmark_csv_path=None,
29
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: ignore_sanity_checks=True),
30
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: parallelism=ParallelismArgs(dp=2,
31
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pp=1,
32
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tp=4,
33
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7f7964bfc8b0>,
34
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
35
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tp_linear_async_communication=False,
36
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: expert_parallel_size=1),
37
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
38
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: eos_token_id=2,
39
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hidden_act='silu',
40
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hidden_size=2048,
41
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: initializer_range=0.02,
42
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: intermediate_size=4096,
43
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: is_llama_config=True,
44
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: max_position_embeddings=4096,
45
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_attention_heads=32,
46
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_hidden_layers=24,
47
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_key_value_heads=32,
48
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pad_token_id=None,
49
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pretraining_tp=1,
50
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rms_norm_eps=1e-05,
51
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rope_scaling=None,
52
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rope_theta=10000.0,
53
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tie_word_embeddings=True,
54
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: use_cache=True,
55
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: vocab_size=50260),
56
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: init_method=RandomInit(std=0.025),
57
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: dtype=torch.bfloat16,
58
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: make_vocab_size_divisible_by=1,
59
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: ddp_bucket_cap_mb=25),
60
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
61
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tokenizer_revision=None,
62
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tokenizer_max_length=None),
63
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
64
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: checkpoint_interval=100000,
65
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: save_initial_state=False,
66
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: resume_checkpoint_path=None,
67
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: checkpoints_path_is_shared_file_system=False),
68
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: logging=LoggingArgs(log_level='info',
69
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: log_level_replica='info',
70
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: iteration_step_info_interval=1),
71
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tokens=TokensArgs(sequence_length=4096,
72
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: train_steps=20,
73
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: micro_batch_size=512,
74
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: batch_accumulation_per_replica=1,
75
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: val_check_interval=-1,
76
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: limit_val_batches=0,
77
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: limit_test_batches=0),
78
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
79
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: adam_beta1=0.9,
80
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: adam_beta2=0.95,
81
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: torch_adam_is_fused=True,
82
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: name='adamW'),
83
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: zero_stage=1,
84
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: weight_decay=0.01,
85
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: clip_grad=1.0,
86
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: accumulate_grad_in_fp32=True,
87
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
88
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_warmup_steps=1,
89
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_warmup_style='linear',
90
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_decay_style='linear',
91
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_decay_steps=19,
92
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_decay_starting_step=None,
93
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: min_decay_lr=1e-05)),
94
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: data_stages=[DatasetStageArgs(name='Training Stage',
95
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: start_training_step=1,
96
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
97
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hf_dataset_splits='train',
98
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hf_dataset_config_name=None,
99
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: dataset_processing_num_proc_per_process=64,
100
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: dataset_overwrite_cache=False,
101
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: text_column_name='text'),
102
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: seed=42,
103
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_loading_workers=0))],
104
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512')),
105
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lighteval=None)
106
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Model Config:
107
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: LlamaConfig(bos_token_id=1,
108
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: eos_token_id=2,
109
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hidden_act='silu',
110
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hidden_size=2048,
111
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: initializer_range=0.02,
112
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: intermediate_size=4096,
113
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: is_llama_config=True,
114
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: max_position_embeddings=4096,
115
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_attention_heads=32,
116
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_hidden_layers=24,
117
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_key_value_heads=32,
118
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pad_token_id=None,
119
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pretraining_tp=1,
120
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rms_norm_eps=1e-05,
121
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rope_scaling=None,
122
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rope_theta=10000.0,
123
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tie_word_embeddings=True,
124
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: use_cache=True,
125
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: vocab_size=50260)
126
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Building model..
127
+ [default0]:07/03/2024 23:02:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Setting PP block ranks...
128
+ [default2]:07/03/2024 23:02:59 [INFO|DP=0|PP=0|TP=2|ip-26-0-164-187]: Local number of parameters: 277M (529.27MiB)
129
+ [default2]:07/03/2024 23:02:59 [INFO|DP=0|PP=0|TP=2|ip-26-0-164-187]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
130
+ [default2]:07/03/2024 23:02:59 [INFO|DP=0|PP=0|TP=2|ip-26-0-164-187]: No checkpoint path provided.
131
+ [default4]:07/03/2024 23:02:59 [INFO|DP=1|PP=0|TP=0|ip-26-0-164-187]: No checkpoint path provided.
132
+ [default1]:07/03/2024 23:02:59 [INFO|DP=0|PP=0|TP=1|ip-26-0-164-187]: Local number of parameters: 277M (529.27MiB)
133
+ [default1]:07/03/2024 23:02:59 [INFO|DP=0|PP=0|TP=1|ip-26-0-164-187]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
134
+ [default1]:07/03/2024 23:02:59 [INFO|DP=0|PP=0|TP=1|ip-26-0-164-187]: No checkpoint path provided.
135
+ [default7]:07/03/2024 23:02:59 [INFO|DP=1|PP=0|TP=3|ip-26-0-164-187]: No checkpoint path provided.
136
+ [default3]:07/03/2024 23:02:59 [INFO|DP=0|PP=0|TP=3|ip-26-0-164-187]: Local number of parameters: 277M (529.27MiB)
137
+ [default3]:07/03/2024 23:02:59 [INFO|DP=0|PP=0|TP=3|ip-26-0-164-187]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
138
+ [default3]:07/03/2024 23:02:59 [INFO|DP=0|PP=0|TP=3|ip-26-0-164-187]: No checkpoint path provided.
139
+ [default6]:07/03/2024 23:02:59 [INFO|DP=1|PP=0|TP=2|ip-26-0-164-187]: No checkpoint path provided.
140
+ [default5]:07/03/2024 23:02:59 [INFO|DP=1|PP=0|TP=1|ip-26-0-164-187]: No checkpoint path provided.
141
+ [default0]:07/03/2024 23:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Total number of parameters: 1.11G (2117.09MiB)
142
+ [default0]:07/03/2024 23:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Local number of parameters: 277M (529.27MiB)
143
+ [default0]:07/03/2024 23:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
144
+ [default0]:07/03/2024 23:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: No checkpoint path provided.
145
+ [default0]:07/03/2024 23:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Parametrizing model parameters using StandardParametrizator
146
+ [default0]:07/03/2024 23:03:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Optimizer Building] Using LearningRateForSP as learning rate
147
+ [default0]:07/03/2024 23:03:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [ZeRO sharding] Size of optimizer params per rank:
148
+ [default0]:07/03/2024 23:03:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [ZeRO sharding] DP Rank 0 has 139M out of 277M (50.00%) params' optimizer states
149
+ [default0]:07/03/2024 23:03:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [ZeRO sharding] DP Rank 1 has 139M out of 277M (50.00%) params' optimizer states
150
+ [default0]:07/03/2024 23:03:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
151
+ [default0]:07/03/2024 23:03:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Using `datasets` library
152
+ [default0]:07/03/2024 23:03:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
153
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
154
+ [default0]:07/03/2024 23:03:03 [WARNING|DP=0|PP=0|TP=0|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
155
+ [default0]:07/03/2024 23:03:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Training Plan] There are 1 training stages
156
+ [default0]:07/03/2024 23:03:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Stage Training Stage] start from step 1
157
+ [default0]:07/03/2024 23:03:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]:
158
+ [default0]:07/03/2024 23:03:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Start training] datetime: 2024-07-03 23:03:04.007731 | mbs: 512 | grad_accum: 1 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
159
+ [default0]:07/03/2024 23:03:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
160
+ [default0]:07/03/2024 23:03:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Memory usage: 2142.76MiB. Peak allocated 2142.76MiB. Peak reserved: 2198.00MiB
161
+ [default2]:07/03/2024 23:03:04 [WARNING|DP=0|PP=0|TP=2|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
162
+ [default7]:07/03/2024 23:03:04 [WARNING|DP=1|PP=0|TP=3|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
163
+ [default4]:07/03/2024 23:03:04 [WARNING|DP=1|PP=0|TP=0|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
164
+ [default5]:07/03/2024 23:03:04 [WARNING|DP=1|PP=0|TP=1|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
165
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
166
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
167
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
168
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
169
+ [default3]:07/03/2024 23:03:04 [WARNING|DP=0|PP=0|TP=3|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
170
+ [default6]:07/03/2024 23:03:04 [WARNING|DP=1|PP=0|TP=2|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
171
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
172
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
173
+ [default1]:07/03/2024 23:03:04 [WARNING|DP=0|PP=0|TP=1|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
174
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
175
+ [default1]:[rank1]: Traceback (most recent call last):
176
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
177
+ [default1]:[rank1]: trainer.train(dataloader)
178
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
179
+ [default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
180
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
181
+ [default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter(
182
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
183
+ [default1]:[rank1]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
184
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
185
+ [default1]:[rank1]: output = model(**micro_batch)
186
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
187
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
188
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
189
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
190
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
191
+ [default1]:[rank1]: sharded_logits = self.model(
192
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
193
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
194
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
195
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
196
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
197
+ [default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
198
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
199
+ [default1]:[rank1]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
200
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
201
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
202
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
203
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
204
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
205
+ [default1]:[rank1]: output = self.pp_block(**new_kwargs)
206
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
207
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
208
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
209
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
210
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
211
+ [default1]:[rank1]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
212
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
213
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
214
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
215
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
216
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
217
+ [default1]:[rank1]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
218
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
219
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
220
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
221
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
222
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
223
+ [default1]:[rank1]: return row_linear(
224
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear
225
+ [default1]:[rank1]: out = differentiable_reduce_scatter_sum(out, group=group)
226
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum
227
+ [default1]:[rank1]: return DifferentiableReduceScatterSum.apply(tensor, group)
228
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
229
+ [default1]:[rank1]: return super().apply(*args, **kwargs) # type: ignore[misc]
230
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward
231
+ [default1]:[rank1]: sharded_tensor = torch.empty(
232
+ [default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.68 GiB is free. Including non-PyTorch memory, this process has 77.64 GiB memory in use. Of the allocated memory 64.24 GiB is allocated by PyTorch, and 1.94 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
233
+ [default0]:[rank0]: Traceback (most recent call last):
234
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
235
+ [default0]:[rank0]: trainer.train(dataloader)
236
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
237
+ [default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
238
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
239
+ [default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
240
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
241
+ [default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
242
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
243
+ [default0]:[rank0]: output = model(**micro_batch)
244
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
245
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
246
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
247
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
248
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
249
+ [default0]:[rank0]: sharded_logits = self.model(
250
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
251
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
252
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
253
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
254
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
255
+ [default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
256
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
257
+ [default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
258
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
259
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
260
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
261
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
262
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
263
+ [default0]:[rank0]: output = self.pp_block(**new_kwargs)
264
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
265
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
266
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
267
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
268
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
269
+ [default0]:[rank0]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
270
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
271
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
272
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
273
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
274
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
275
+ [default0]:[rank0]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
276
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
277
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
278
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
279
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
280
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
281
+ [default0]:[rank0]: return row_linear(
282
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear
283
+ [default0]:[rank0]: out = differentiable_reduce_scatter_sum(out, group=group)
284
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum
285
+ [default0]:[rank0]: return DifferentiableReduceScatterSum.apply(tensor, group)
286
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
287
+ [default0]:[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc]
288
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward
289
+ [default0]:[rank0]: sharded_tensor = torch.empty(
290
+ [default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU
291
+ [default2]:[rank2]: Traceback (most recent call last):
292
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
293
+ [default2]:[rank2]: trainer.train(dataloader)
294
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
295
+ [default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
296
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
297
+ [default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter(
298
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
299
+ [default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
300
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
301
+ [default2]:[rank2]: output = model(**micro_batch)
302
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
303
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
304
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
305
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
306
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
307
+ [default2]:[rank2]: sharded_logits = self.model(
308
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
309
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
310
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
311
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
312
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
313
+ [default2]:[rank2]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
314
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
315
+ [default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
316
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
317
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
318
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
319
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
320
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
321
+ [default2]:[rank2]: output = self.pp_block(**new_kwargs)
322
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
323
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
324
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
325
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
326
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
327
+ [default2]:[rank2]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
328
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
329
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
330
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
331
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
332
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
333
+ [default2]:[rank2]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
334
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
335
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
336
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
337
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
338
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
339
+ [default2]:[rank2]: return row_linear(
340
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear
341
+ [default2]:[rank2]: out = differentiable_reduce_scatter_sum(out, group=group)
342
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum
343
+ [default2]:[rank2]: return DifferentiableReduceScatterSum.apply(tensor, group)
344
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
345
+ [default2]:[rank2]: return super().apply(*args, **kwargs) # type: ignore[misc]
346
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward
347
+ [default2]:[rank2]: sharded_tensor = torch.empty(
348
+ [default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.68 GiB is free. Including non-PyTorch memory, this process has 77.64 GiB memory in use. Of the allocated memory 64.24 GiB is allocated by PyTorch, and 1.94 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
349
+ [default3]:[rank3]: Traceback (most recent call last):
350
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
351
+ [default3]:[rank3]: trainer.train(dataloader)
352
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
353
+ [default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
354
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
355
+ [default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter(
356
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
357
+ [default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
358
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
359
+ [default3]:[rank3]: output = model(**micro_batch)
360
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
361
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
362
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
363
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
364
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
365
+ [default3]:[rank3]: sharded_logits = self.model(
366
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
367
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
368
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
369
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
370
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
371
+ [default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
372
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
373
+ [default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
374
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
375
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
376
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
377
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
378
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
379
+ [default3]:[rank3]: output = self.pp_block(**new_kwargs)
380
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
381
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
382
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
383
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
384
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
385
+ [default3]:[rank3]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
386
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
387
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
388
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
389
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
390
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
391
+ [default3]:[rank3]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
392
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
393
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
394
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
395
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
396
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
397
+ [default3]:[rank3]: return row_linear(
398
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear
399
+ [default3]:[rank3]: out = differentiable_reduce_scatter_sum(out, group=group)
400
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum
401
+ [default3]:[rank3]: return DifferentiableReduceScatterSum.apply(tensor, group)
402
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
403
+ [default3]:[rank3]: return super().apply(*args, **kwargs) # type: ignore[misc]
404
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward
405
+ [default3]:[rank3]: sharded_tensor = torch.empty(
406
+ [default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.91 GiB is free. Including non-PyTorch memory, this process has 77.41 GiB memory in use. Of the allocated memory 64.24 GiB is allocated by PyTorch, and 1.94 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ [default5]:[rank5]: Traceback (most recent call last):
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default5]:[rank5]: trainer.train(dataloader)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default5]:[rank5]: output = model(**micro_batch)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default5]:[rank5]: sharded_logits = self.model(
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default5]:[rank5]: output = self.pp_block(**new_kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
+ [default5]:[rank5]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
+ [default5]:[rank5]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
+ [default5]:[rank5]: return row_linear(
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear
+ [default5]:[rank5]: out = differentiable_reduce_scatter_sum(out, group=group)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum
+ [default5]:[rank5]: return DifferentiableReduceScatterSum.apply(tensor, group)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
+ [default5]:[rank5]: return super().apply(*args, **kwargs) # type: ignore[misc]
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward
+ [default5]:[rank5]: sharded_tensor = torch.empty(
+ [default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.68 GiB is free. Including non-PyTorch memory, this process has 77.64 GiB memory in use. Of the allocated memory 64.24 GiB is allocated by PyTorch, and 1.94 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ [default4]:[rank4]: Traceback (most recent call last):
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default4]:[rank4]: trainer.train(dataloader)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter(
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default4]:[rank4]: output = model(**micro_batch)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default4]:[rank4]: sharded_logits = self.model(
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default4]:[rank4]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default4]:[rank4]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default4]:[rank4]: output = self.pp_block(**new_kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
+ [default4]:[rank4]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
+ [default4]:[rank4]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
+ [default4]:[rank4]: return row_linear(
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear
+ [default4]:[rank4]: out = differentiable_reduce_scatter_sum(out, group=group)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum
+ [default4]:[rank4]: return DifferentiableReduceScatterSum.apply(tensor, group)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
+ [default4]:[rank4]: return super().apply(*args, **kwargs) # type: ignore[misc]
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward
+ [default4]:[rank4]: sharded_tensor = torch.empty(
+ [default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.72 GiB is free. Including non-PyTorch memory, this process has 77.59 GiB memory in use. Of the allocated memory 64.24 GiB is allocated by PyTorch, and 1.94 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ [default6]:[rank6]: Traceback (most recent call last):
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default6]:[rank6]: trainer.train(dataloader)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter(
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default6]:[rank6]: output = model(**micro_batch)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default6]:[rank6]: sharded_logits = self.model(
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default6]:[rank6]: output = self.pp_block(**new_kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
+ [default6]:[rank6]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
+ [default6]:[rank6]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
+ [default6]:[rank6]: return row_linear(
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear
+ [default6]:[rank6]: out = differentiable_reduce_scatter_sum(out, group=group)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum
+ [default6]:[rank6]: return DifferentiableReduceScatterSum.apply(tensor, group)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
+ [default6]:[rank6]: return super().apply(*args, **kwargs) # type: ignore[misc]
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward
+ [default6]:[rank6]: sharded_tensor = torch.empty(
+ [default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.68 GiB is free. Including non-PyTorch memory, this process has 77.64 GiB memory in use. Of the allocated memory 64.24 GiB is allocated by PyTorch, and 1.94 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
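Editor's note: every failing rank prints the same allocator hint. That flag only helps when a large amount of memory is reserved but unallocated (fragmentation); here that figure is 1.94 GiB against a 2.00 GiB request, with 64.24 GiB of live tensors, so shrinking the micro batch (this run uses mbz-512) is the more plausible fix. For completeness, a minimal sketch of applying the suggested setting, which must happen before the process first touches CUDA:

import os

# Suggested by the OOM messages above; only effective if set before the
# first CUDA allocation (e.g. at the very top of run_train.py, or
# exported by the launch script before torchrun starts).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch  # imported only after the allocator config is in place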
+ W0703 23:03:15.486000 140703814809408 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 26448 closing signal SIGTERM
+ W0703 23:03:15.486000 140703814809408 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 26449 closing signal SIGTERM
+ W0703 23:03:15.486000 140703814809408 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 26451 closing signal SIGTERM
+ E0703 23:03:16.403000 140703814809408 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 26444) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
+ Traceback (most recent call last):
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
+ sys.exit(main())
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
+ return f(*args, **kwargs)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
+ run(args)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
+ elastic_launch(
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
+ return launch_agent(self._config, self._entrypoint, list(args))
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
+ raise ChildFailedError(
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+ ============================================================
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
+ ------------------------------------------------------------
+ Failures:
+ [1]:
+ time : 2024-07-03_23:03:15
+ host : ip-26-0-164-187.ec2.internal
+ rank : 1 (local_rank: 1)
+ exitcode : 1 (pid: 26445)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [2]:
+ time : 2024-07-03_23:03:15
+ host : ip-26-0-164-187.ec2.internal
+ rank : 2 (local_rank: 2)
+ exitcode : 1 (pid: 26446)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [3]:
+ time : 2024-07-03_23:03:15
+ host : ip-26-0-164-187.ec2.internal
+ rank : 3 (local_rank: 3)
+ exitcode : 1 (pid: 26447)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [4]:
+ time : 2024-07-03_23:03:15
+ host : ip-26-0-164-187.ec2.internal
+ rank : 6 (local_rank: 6)
+ exitcode : 1 (pid: 26450)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ------------------------------------------------------------
+ Root Cause (first observed failure):
+ [0]:
+ time : 2024-07-03_23:03:15
+ host : ip-26-0-164-187.ec2.internal
+ rank : 0 (local_rank: 0)
+ exitcode : 1 (pid: 26444)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ============================================================
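Editor's note: every entry in the summary above reports error_file: <N/A>, so the per-rank exception survives only in the raw log. As the linked elastic errors documentation describes, decorating the training entrypoint lets torchrun capture the child's traceback into an error file. A sketch of how an entrypoint like run_train.py could opt in (the main body is elided here):

from torch.distributed.elastic.multiprocessing.errors import record

@record  # on failure, writes this process's traceback to the torchrun error file
def main():
    ...  # build the trainer and call trainer.train(dataloader)

if __name__ == "__main__":
    main()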
+ srun: error: ip-26-0-164-187: task 0: Exited with exit code 1
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
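Editor's note: the closing hint refers to huggingface_hub's optional Rust-based transfer backend. A sketch of opting in when uploading the benchmark results, assuming hf_transfer is pip-installed; the environment variable must be set before huggingface_hub is imported, and the folder path and repo id below are placeholders:

import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # opt in to the hf_transfer backend

from huggingface_hub import HfApi

api = HfApi()
# Placeholder path and repo id for illustration:
# api.upload_folder(folder_path="results/llama-1B", repo_id="user/bench_cluster")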
llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-512/status.txt ADDED
@@ -0,0 +1 @@
 
 
+ oom