wissamantoun committed on
Commit
1ca7324
·
verified ·
1 Parent(s): 83ab974

Upload checkpoints/roleback-iter-checkpoints.sh with huggingface_hub

Browse files
checkpoints/roleback-iter-checkpoints.sh ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#
# Roll the per-rank `checkpoint` state files back to an earlier step.
# To be run in the main checkpoints directory.
#
# Usage:
#   ./roleback-iter-checkpoints.sh                 # uses the default step
#   TARGET_STEP=250000 ./roleback-iter-checkpoints.sh

# Stop on unhandled command failures, unset variables and failed pipeline
# stages instead of silently continuing with half-applied edits.
set -euo pipefail

# Step number for the new model checkpoint path. Can be overridden from the
# environment; defaults to the value this script was originally written for.
TARGET_STEP="${TARGET_STEP:-273500}"
if [[ ! "$TARGET_STEP" =~ ^[0-9]+$ ]]; then
  printf 'TARGET_STEP must be a non-negative integer, got: %s\n' \
    "$TARGET_STEP" >&2
  exit 1
fi
readonly TARGET_STEP
5
+
6
#######################################
# Rewrite one `checkpoint` state file so that it points at TARGET_STEP.
#
# The file is processed line by line:
#   - the last `drop_count` lines starting with `all_model_checkpoint_paths`
#     are removed;
#   - the last `drop_count` lines starting with
#     `all_model_checkpoint_timestamps` are removed;
#   - every line starting with `model_checkpoint_path` is replaced by
#       model_checkpoint_path: "iter_ckpt_rank_<rank>-<TARGET_STEP>"
#   - all other lines are copied unchanged.
# <rank> is the 4th `_`-separated field of the name of the directory that
# directly contains the file (e.g. `03` for `iter_ckpt_rank_03/checkpoint`).
#
# NOTE(review): the file looks like a TensorFlow CheckpointState text file;
# confirm that dropping the same number of path and timestamp entries is
# what the training code expects.
#
# Globals:
#   TARGET_STEP (read) - step number written into model_checkpoint_path
# Arguments:
#   $1 - path to the checkpoint state file; it is modified in place
#   $2 - optional number of trailing path/timestamp entries to drop
#        (default: 1)
# Returns:
#   0 on success; 1 if the file is unreadable, an argument is invalid, the
#   rank cannot be derived from the path, or the file cannot be rewritten.
#######################################
update_checkpoint_file() {
  local checkpoint_file="$1"
  local drop_count="${2:-1}"
  local target_step="${TARGET_STEP:?TARGET_STEP must be set}"

  if [[ ! -f "$checkpoint_file" || ! -r "$checkpoint_file" ]]; then
    printf 'update_checkpoint_file: cannot read %s\n' "$checkpoint_file" >&2
    return 1
  fi
  if [[ ! "$drop_count" =~ ^[0-9]+$ ]]; then
    printf 'update_checkpoint_file: invalid drop count: %s\n' \
      "$drop_count" >&2
    return 1
  fi

  # Derive the rank from the containing directory's own name only, so that
  # underscores in parent directories cannot shift the field position.
  local rank_dir skipped rank rest
  rank_dir=$(basename -- "$(dirname -- "$checkpoint_file")") || return 1
  IFS=_ read -r skipped skipped skipped rank rest <<< "$rank_dir"
  if [[ -z "$rank" ]]; then
    printf 'update_checkpoint_file: cannot derive rank from %s\n' \
      "$checkpoint_file" >&2
    return 1
  fi
  local new_path_line
  new_path_line="model_checkpoint_path: \"iter_ckpt_rank_${rank}-${target_step}\""

  # Pass 1: count the entries so we know which ones are the trailing ones.
  # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline.
  local line
  local path_total=0
  local timestamp_total=0
  while IFS= read -r line || [[ -n "$line" ]]; do
    case "$line" in
      all_model_checkpoint_paths*)
        path_total=$((path_total + 1))
        ;;
      all_model_checkpoint_timestamps*)
        timestamp_total=$((timestamp_total + 1))
        ;;
    esac
  done < "$checkpoint_file"

  # 10# forces base 10 so a value such as "08" is not parsed as octal.
  local path_keep=$((path_total - 10#$drop_count))
  local timestamp_keep=$((timestamp_total - 10#$drop_count))

  local tmp_file
  tmp_file=$(mktemp) || return 1

  # Pass 2: write the filtered content to the temporary file.
  local path_seen=0
  local timestamp_seen=0
  while IFS= read -r line || [[ -n "$line" ]]; do
    case "$line" in
      all_model_checkpoint_paths*)
        path_seen=$((path_seen + 1))
        if ((path_seen > path_keep)); then
          continue
        fi
        ;;
      all_model_checkpoint_timestamps*)
        timestamp_seen=$((timestamp_seen + 1))
        if ((timestamp_seen > timestamp_keep)); then
          continue
        fi
        ;;
      model_checkpoint_path*)
        line=$new_path_line
        ;;
    esac
    printf '%s\n' "$line"
  done < "$checkpoint_file" > "$tmp_file"

  # Overwrite the content in place rather than moving the temporary file
  # over the original: mktemp creates files with mode 0600, and a move would
  # replace the original's permissions and ownership with those.
  if ! cat -- "$tmp_file" > "$checkpoint_file"; then
    rm -f -- "$tmp_file"
    return 1
  fi
  rm -f -- "$tmp_file"
}
59
+
60
# Process every per-rank checkpoint state file below the current directory.
updated_count=0
for checkpoint_file in iter_ckpt_rank_*/checkpoint; do
  # When the glob matches nothing, bash leaves the pattern as a literal
  # string; skip it instead of operating on a path that does not exist.
  [[ -f "$checkpoint_file" ]] || continue

  # Back up the original checkpoint file, and never edit a file in place
  # when its backup could not be written.
  # NOTE: an existing .bak from an earlier run is overwritten.
  if ! cp -- "$checkpoint_file" "${checkpoint_file}.bak"; then
    echo "Failed to back up ${checkpoint_file}; aborting." >&2
    exit 1
  fi

  # Update the checkpoint file
  if ! update_checkpoint_file "$checkpoint_file"; then
    echo "Failed to update ${checkpoint_file}; see ${checkpoint_file}.bak" >&2
    exit 1
  fi
  updated_count=$((updated_count + 1))
done

# Do not claim success when nothing was processed (e.g. wrong directory).
if ((updated_count == 0)); then
  echo "No iter_ckpt_rank_*/checkpoint files found in ${PWD}." >&2
  exit 1
fi

echo "Checkpoint files have been updated."