# USAGE:
# This script allows you to prepare your dataset using multiple nodes by splitting the list of input files into chunks
# and distributing the chunks over the processes.
# This bash script takes a single text file as its first argument.
# The text file contains one valid filepath per line, each leading to a jsonl-file.
# Furthermore, environment variables for the rank and the world size need to be set.
# These default to the SLURM and OMPI variables, in this order of priority, but they can also be set manually
# using the variables $RANK and $WORLD_SIZE, which override the cluster-specific variables.
# You can also add all arguments of the preprocess_data.py script to this script and it will simply pass them through.
# Parse command-line arguments | |
text_file="$1" | |
rank="${RANK:-${SLURM_PROCID:-$OMPI_COMM_WORLD_RANK}}" | |
world_size="${WORLD_SIZE:-${SLURM_NTASKS:-$OMPI_COMM_WORLD_SIZE}}" | |
num_lines=$(wc -l < "$text_file") | |
chunk_size=$((num_lines / world_size)) | |
start_line=$((rank * chunk_size + 1)) | |
end_line=$((start_line + chunk_size - 1)) | |
# Make sure the last chunk includes all remaining lines | |
if [[ $rank == $((world_size - 1)) ]]; then | |
end_line=$num_lines | |
fi | |
# Select the chunk of the text file that corresponds to the rank | |
chunk_file="chunk_${rank}.txt" | |
sed -n "${start_line},${end_line}p" "$text_file" > "$chunk_file" | |
# Parse additional flags to be passed to the Python script | |
shift 1 # Shift past the first three arguments | |
py_args="" | |
prefix_arg="" | |
while [[ $# -gt 0 ]]; do | |
case "$1" in | |
--output-prefix=*) prefix_arg="$1"; shift;; | |
--output-prefix) prefix_arg="$1 $2"; shift 2;; | |
--*) py_args="$py_args $1 $2"; shift 2;; | |
*) echo "Unknown argument: $1"; exit 1;; | |
esac | |
done | |
# Add the rank to the --output-prefix argument if it is set | |
if [[ -n "$prefix_arg" ]]; then | |
py_args="$py_args $prefix_arg$rank" | |
else | |
# Inject a default --output-prefix argument containing the rank | |
py_args="$py_args --output-prefix rank${rank}" | |
fi | |
echo "processing $chunk_file with rank $rank at world size $world_size" | |
echo "using the following args: $py_args" | |
# Call the Python script with the list of file paths in the chunk | |
python tools/datasets/preprocess_data.py --input $(tr '\n' ',' < "$chunk_file" | sed 's/,$/\n/') $py_args | |
# Clean up | |
rm "$chunk_file" | |