daresearch committed on
Commit
2532c03
1 Parent(s): af50ea0

Update Dockerfile

Browse files
Files changed (1) hide show
  1. Dockerfile +12 -6
Dockerfile CHANGED
@@ -1,7 +1,9 @@
 
1
  FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
2
 
3
  ENV DEBIAN_FRONTEND=noninteractive
4
 
 
5
  RUN apt-get update && apt-get install -y \
6
  tzdata \
7
  software-properties-common \
@@ -9,14 +11,15 @@ RUN apt-get update && apt-get install -y \
9
  ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime && \
10
  echo "Etc/UTC" > /etc/timezone
11
 
 
12
  RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update && \
13
  apt-get install -y python3.11 python3.11-dev
14
 
 
15
  RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
16
-
17
  RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
18
 
19
- # Install Python packages, ensuring PyTorch matches CUDA 11.8
20
  RUN python -m pip install --no-cache-dir \
21
  torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 && \
22
  python -m pip install --no-cache-dir \
@@ -30,7 +33,7 @@ RUN python -m pip install --no-cache-dir \
30
  safetensors \
31
  bitsandbytes
32
 
33
- # Helps reduce CUDA memory fragmentation, though on some platforms it may show "not supported"
34
  ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
35
 
36
  # Set your cache directories
@@ -41,6 +44,7 @@ ENV TORCH_HOME=/workspace/.cache
41
  ENV XDG_CACHE_HOME=/workspace/.cache
42
  ENV TRITON_CACHE_DIR=/workspace/.cache/triton
43
 
 
44
  RUN mkdir -p /workspace/outputs \
45
  && mkdir -p /workspace/.huggingface \
46
  && mkdir -p /workspace/.cache/triton \
@@ -48,13 +52,15 @@ RUN mkdir -p /workspace/outputs \
48
 
49
  WORKDIR /workspace
50
 
 
51
  COPY finetune_script.py /workspace/
52
  COPY train.csv /workspace/train.csv
53
  COPY valid.csv /workspace/valid.csv
54
 
55
- # Preconfigure Accelerate
56
  RUN mkdir -p /root/.cache/huggingface/accelerate && \
57
- echo '{"compute_environment": "LOCAL_MACHINE", "distributed_type": "MULTI_GPU", "num_processes": 4, "mixed_precision": "fp16", "machine_rank": 0, "main_training_function": "main", "use_cpu": false, "num_machines": 1}' > /root/.cache/huggingface/accelerate/default_config.yaml
 
58
 
59
- # Use Accelerate to launch on 4 GPUs with fp16
60
  CMD ["accelerate", "launch", "--num_processes=4", "--mixed_precision=fp16", "finetune_script.py"]
 
1
+ # Use CUDA 11.8 with cuDNN 8
2
  FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
3
 
4
  ENV DEBIAN_FRONTEND=noninteractive
5
 
6
+ # Basic utilities
7
  RUN apt-get update && apt-get install -y \
8
  tzdata \
9
  software-properties-common \
 
11
  ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime && \
12
  echo "Etc/UTC" > /etc/timezone
13
 
14
+ # Install Python 3.11
15
  RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update && \
16
  apt-get install -y python3.11 python3.11-dev
17
 
18
+ # Install pip
19
  RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
 
20
  RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
21
 
22
+ # Install Python packages (torch, Transformers, Accelerate, etc.)
23
  RUN python -m pip install --no-cache-dir \
24
  torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 && \
25
  python -m pip install --no-cache-dir \
 
33
  safetensors \
34
  bitsandbytes
35
 
36
+ # Helps reduce CUDA memory fragmentation
37
  ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
38
 
39
  # Set your cache directories
 
44
  ENV XDG_CACHE_HOME=/workspace/.cache
45
  ENV TRITON_CACHE_DIR=/workspace/.cache/triton
46
 
47
+ # Create directories and set permissions
48
  RUN mkdir -p /workspace/outputs \
49
  && mkdir -p /workspace/.huggingface \
50
  && mkdir -p /workspace/.cache/triton \
 
52
 
53
  WORKDIR /workspace
54
 
55
+ # Copy training script and data
56
  COPY finetune_script.py /workspace/
57
  COPY train.csv /workspace/train.csv
58
  COPY valid.csv /workspace/valid.csv
59
 
60
+ # Preconfigure Accelerate for multi-GPU usage (4 GPUs, fp16)
61
  RUN mkdir -p /root/.cache/huggingface/accelerate && \
62
+ echo '{"compute_environment": "LOCAL_MACHINE", "distributed_type": "MULTI_GPU", "num_processes": 4, "mixed_precision": "fp16", "machine_rank": 0, "main_training_function": "main", "use_cpu": false, "num_machines": 1}' \
63
+ > /root/.cache/huggingface/accelerate/default_config.yaml
64
 
65
+ # Launch your training script on 4 GPUs with fp16
66
  CMD ["accelerate", "launch", "--num_processes=4", "--mixed_precision=fp16", "finetune_script.py"]