Upload checkpoints/roleback-iter-checkpoints.sh with huggingface_hub
Browse files
checkpoints/roleback-iter-checkpoints.sh
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
# To be run in the main checkpoints directory (the directory that contains
# the iter_ckpt_rank_*/ subdirectories).
#
# TARGET_STEP is the step number written into the rewritten
# model_checkpoint_path entry of every checkpoint file.
# It defaults to 273500 and can be overridden from the environment without
# editing this script, e.g.:
#   TARGET_STEP=250000 bash <this-script>
TARGET_STEP="${TARGET_STEP:-273500}"
|
5 |
+
|
6 |
+
# Function to update a single checkpoint file
|
7 |
+
#######################################
# Rewrite one checkpoint index file in place so that it points at TARGET_STEP.
#
# The rewrite does three things:
#   * drops the LAST "all_model_checkpoint_paths" line,
#   * drops the LAST "all_model_checkpoint_timestamps" line,
#   * replaces every "model_checkpoint_path" line with
#       model_checkpoint_path: "iter_ckpt_rank_<rank>-<TARGET_STEP>"
# All other lines are copied through unchanged.
#
# <rank> is the 4th "_"-separated field of the name of the directory that
# holds the file (e.g. "00" for iter_ckpt_rank_00/checkpoint).
#
# Globals:   TARGET_STEP (read)
# Arguments: $1 - path to the checkpoint file to rewrite
# Returns:   0 on success; 1 if the file is missing or a temporary file
#            could not be created/written/copied back.
#######################################
update_checkpoint_file() {
  local checkpoint_file="$1"
  local tmp_file rank line
  # Counters are local so repeated calls (and the caller) never see stale
  # values from a previous file.
  local path_count=0 timestamp_count=0
  local new_path_count=0 new_timestamp_count=0
  local -a lines=()

  if [[ ! -f "$checkpoint_file" ]]; then
    printf 'update_checkpoint_file: no such file: %s\n' "$checkpoint_file" >&2
    return 1
  fi

  # Derive the rank from the containing directory's own name only, so that
  # underscores in parent directories cannot shift the field position.
  rank=$(basename -- "$(dirname -- "$checkpoint_file")" | cut -d'_' -f4)

  # Pass 1: load the file and count the path / timestamp entries.
  # "|| [[ -n $line ]]" keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ $line == all_model_checkpoint_paths* ]]; then
      path_count=$((path_count + 1))
    elif [[ $line == all_model_checkpoint_timestamps* ]]; then
      timestamp_count=$((timestamp_count + 1))
    fi
    lines+=("$line")
  done < "$checkpoint_file"

  tmp_file=$(mktemp) || return 1

  # Pass 2: emit the rewritten content into the temporary file.
  {
    for line in "${lines[@]}"; do
      if [[ $line == all_model_checkpoint_paths* ]]; then
        new_path_count=$((new_path_count + 1))
        # Skip the last path entry
        if (( new_path_count > path_count - 1 )); then
          continue
        fi
      elif [[ $line == all_model_checkpoint_timestamps* ]]; then
        new_timestamp_count=$((new_timestamp_count + 1))
        # Skip the last timestamp entry
        if (( new_timestamp_count > timestamp_count - 1 )); then
          continue
        fi
      elif [[ $line == model_checkpoint_path* ]]; then
        # Rename the model checkpoint path to the target step
        line="model_checkpoint_path: \"iter_ckpt_rank_${rank}-$TARGET_STEP\""
      fi
      # printf instead of echo: a line such as "-n" must be written verbatim.
      printf '%s\n' "$line"
    done
  } > "$tmp_file" || {
    rm -f -- "$tmp_file"
    return 1
  }

  # Copy the new content over the original (this keeps the original file's
  # permissions, unlike moving the mode-0600 mktemp file into place).
  if ! cp -- "$tmp_file" "$checkpoint_file"; then
    rm -f -- "$tmp_file"
    return 1
  fi

  # Clean up the temporary file
  rm -f -- "$tmp_file"
}
|
59 |
+
|
60 |
+
# Find all checkpoint files with the given glob pattern
|
61 |
+
# Walk every per-rank checkpoint file below the current directory, back it
# up next to the original as "<file>.bak", then rewrite it in place.
updated_count=0
for checkpoint_file in iter_ckpt_rank_*/checkpoint; do
  # When the glob matches nothing bash leaves the literal pattern in place;
  # skip anything that is not an existing regular file.
  [[ -f "$checkpoint_file" ]] || continue

  # Backup the original checkpoint file. Never rewrite a file whose backup
  # could not be created.
  if ! cp -- "$checkpoint_file" "${checkpoint_file}.bak"; then
    printf 'Failed to back up %s; aborting.\n' "$checkpoint_file" >&2
    exit 1
  fi

  # Update the checkpoint file
  if ! update_checkpoint_file "$checkpoint_file"; then
    printf 'Failed to update %s; backup is at %s.bak\n' \
      "$checkpoint_file" "$checkpoint_file" >&2
    exit 1
  fi
  updated_count=$((updated_count + 1))
done

# Do not claim success when there was nothing to update.
if (( updated_count == 0 )); then
  printf 'No iter_ckpt_rank_*/checkpoint files found in %s\n' "$PWD" >&2
  exit 1
fi

echo "Checkpoint files have been updated."
|