Spaces:

daresearch
/

train_70b_4bit

Runtime error

App Files Files Community

daresearch committed 4 days ago

Commit

2532c03

•

1 Parent(s): af50ea0

Update Dockerfile

Browse files

Files changed (1) hide show

Dockerfile +12 -6

Dockerfile CHANGED Viewed

@@ -1,7 +1,9 @@
 FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y \
     tzdata \
     software-properties-common \
@@ -9,14 +11,15 @@ RUN apt-get update && apt-get install -y \
     ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime && \
     echo "Etc/UTC" > /etc/timezone
 RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update && \
     apt-get install -y python3.11 python3.11-dev
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
 RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
-# Install Python packages, ensuring PyTorch matches CUDA 11.8
 RUN python -m pip install --no-cache-dir \
     torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 && \
     python -m pip install --no-cache-dir \
@@ -30,7 +33,7 @@ RUN python -m pip install --no-cache-dir \
     safetensors \
     bitsandbytes
-# Helps reduce CUDA memory fragmentation, though on some platforms it may show "not supported"
 ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 # Set your cache directories
@@ -41,6 +44,7 @@ ENV TORCH_HOME=/workspace/.cache
 ENV XDG_CACHE_HOME=/workspace/.cache
 ENV TRITON_CACHE_DIR=/workspace/.cache/triton
 RUN mkdir -p /workspace/outputs \
     && mkdir -p /workspace/.huggingface \
     && mkdir -p /workspace/.cache/triton \
@@ -48,13 +52,15 @@ RUN mkdir -p /workspace/outputs \
 WORKDIR /workspace
 COPY finetune_script.py /workspace/
 COPY train.csv /workspace/train.csv
 COPY valid.csv /workspace/valid.csv
-# Preconfigure Accelerate
 RUN mkdir -p /root/.cache/huggingface/accelerate && \
-    echo '{"compute_environment": "LOCAL_MACHINE", "distributed_type": "MULTI_GPU", "num_processes": 4, "mixed_precision": "fp16", "machine_rank": 0, "main_training_function": "main", "use_cpu": false, "num_machines": 1}' > /root/.cache/huggingface/accelerate/default_config.yaml
-# Use Accelerate to launch on 4 GPUs with fp16
 CMD ["accelerate", "launch", "--num_processes=4", "--mixed_precision=fp16", "finetune_script.py"]

+# Use CUDA 11.8 with cuDNN 8
 FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
 # Build-time only: keeps apt/tzdata from prompting during RUN steps without
 # leaving DEBIAN_FRONTEND set in the environment of the running container.
 ARG DEBIAN_FRONTEND=noninteractive
+# Basic utilities
 RUN apt-get update && apt-get install -y \
     tzdata \
     software-properties-common \
     ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime && \
     echo "Etc/UTC" > /etc/timezone
+# Install Python 3.11 (interpreter and headers) from the deadsnakes PPA.
 # -y answers the PPA confirmation prompt explicitly; the apt package lists are
 # removed in the same layer that downloads them so they do not persist in the image.
 RUN add-apt-repository -y ppa:deadsnakes/ppa && apt-get update && \
     apt-get install -y python3.11 python3.11-dev && \
     rm -rf /var/lib/apt/lists/*
+# Install pip for Python 3.11.
 # The bootstrap script is downloaded to a file rather than piped into the
 # interpreter: under the default /bin/sh (no pipefail) a failed download in a
 # pipeline would be masked by the exit status of python3.11. -f makes curl fail
 # on HTTP errors so the && chain stops the build instead.
 RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py && \
     python3.11 /tmp/get-pip.py && \
     rm -f /tmp/get-pip.py
 # Register python3.11 as the `python` command used by the later RUN steps.
 RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
+# Install Python packages (torch, Transformers, Accelerate, etc.)
 RUN python -m pip install --no-cache-dir \
     torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 && \
     python -m pip install --no-cache-dir \
     safetensors \
     bitsandbytes
+# Helps reduce CUDA memory fragmentation
 ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 # Set your cache directories
 ENV XDG_CACHE_HOME=/workspace/.cache
 ENV TRITON_CACHE_DIR=/workspace/.cache/triton
+# Create directories and set permissions
 RUN mkdir -p /workspace/outputs \
     && mkdir -p /workspace/.huggingface \
     && mkdir -p /workspace/.cache/triton \
 WORKDIR /workspace
+# Copy training data and script into /workspace.
 # Data is copied first so that editing finetune_script.py does not invalidate
 # the data layer (assumes the CSVs change less often than the script — confirm).
 COPY train.csv valid.csv /workspace/
 COPY finetune_script.py /workspace/
+# Preconfigure Accelerate for multi-GPU usage (4 GPUs, fp16).
 # Accelerate looks for default_config.yaml under $HF_HOME/accelerate, falling
 # back to $XDG_CACHE_HOME/huggingface/accelerate and then
 # ~/.cache/huggingface/accelerate. This Dockerfile sets XDG_CACHE_HOME, so the
 # directory is resolved with the same precedence instead of hard-coding
 # /root/.cache, where the file would not be found at launch.
 # The payload is JSON, which is also valid YAML.
 RUN cfg_dir="${HF_HOME:-${XDG_CACHE_HOME:-$HOME/.cache}/huggingface}/accelerate" && \
+    mkdir -p "$cfg_dir" && \
+    echo '{"compute_environment": "LOCAL_MACHINE", "distributed_type": "MULTI_GPU", "num_processes": 4, "mixed_precision": "fp16", "machine_rank": 0, "main_training_function": "main", "use_cpu": false, "num_machines": 1}' \
+    > "$cfg_dir/default_config.yaml"
+# Launch your training script on 4 GPUs with fp16
 # Exec (JSON-array) form: no shell is involved. finetune_script.py is a relative
 # path, so it resolves against WORKDIR /workspace, where the script is copied.
 CMD ["accelerate", "launch", "--num_processes=4", "--mixed_precision=fp16", "finetune_script.py"]