Spaces:
Runtime error
Runtime error
daresearch
committed on
Commit
•
2532c03
1
Parent(s):
af50ea0
Update Dockerfile
Browse files
- Dockerfile +12 -6
Dockerfile
CHANGED
@@ -1,7 +1,9 @@
|
|
|
|
1 |
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
|
2 |
|
3 |
ENV DEBIAN_FRONTEND=noninteractive
|
4 |
|
|
|
5 |
RUN apt-get update && apt-get install -y \
|
6 |
tzdata \
|
7 |
software-properties-common \
|
@@ -9,14 +11,15 @@ RUN apt-get update && apt-get install -y \
|
|
9 |
ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime && \
|
10 |
echo "Etc/UTC" > /etc/timezone
|
11 |
|
|
|
12 |
RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update && \
|
13 |
apt-get install -y python3.11 python3.11-dev
|
14 |
|
|
|
15 |
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
|
16 |
-
|
17 |
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
|
18 |
|
19 |
-
# Install Python packages,
|
20 |
RUN python -m pip install --no-cache-dir \
|
21 |
torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 && \
|
22 |
python -m pip install --no-cache-dir \
|
@@ -30,7 +33,7 @@ RUN python -m pip install --no-cache-dir \
|
|
30 |
safetensors \
|
31 |
bitsandbytes
|
32 |
|
33 |
-
# Helps reduce CUDA memory fragmentation
|
34 |
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
35 |
|
36 |
# Set your cache directories
|
@@ -41,6 +44,7 @@ ENV TORCH_HOME=/workspace/.cache
|
|
41 |
ENV XDG_CACHE_HOME=/workspace/.cache
|
42 |
ENV TRITON_CACHE_DIR=/workspace/.cache/triton
|
43 |
|
|
|
44 |
RUN mkdir -p /workspace/outputs \
|
45 |
&& mkdir -p /workspace/.huggingface \
|
46 |
&& mkdir -p /workspace/.cache/triton \
|
@@ -48,13 +52,15 @@ RUN mkdir -p /workspace/outputs \
|
|
48 |
|
49 |
WORKDIR /workspace
|
50 |
|
|
|
51 |
COPY finetune_script.py /workspace/
|
52 |
COPY train.csv /workspace/train.csv
|
53 |
COPY valid.csv /workspace/valid.csv
|
54 |
|
55 |
-
# Preconfigure Accelerate
|
56 |
RUN mkdir -p /root/.cache/huggingface/accelerate && \
|
57 |
-
echo '{"compute_environment": "LOCAL_MACHINE", "distributed_type": "MULTI_GPU", "num_processes": 4, "mixed_precision": "fp16", "machine_rank": 0, "main_training_function": "main", "use_cpu": false, "num_machines": 1}'
|
|
|
58 |
|
59 |
-
#
|
60 |
CMD ["accelerate", "launch", "--num_processes=4", "--mixed_precision=fp16", "finetune_script.py"]
|
|
|
1 |
+
# Use CUDA 11.8 with cuDNN 8
|
2 |
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
|
3 |
|
4 |
ENV DEBIAN_FRONTEND=noninteractive
|
5 |
|
6 |
+
# Basic utilities
|
7 |
RUN apt-get update && apt-get install -y \
|
8 |
tzdata \
|
9 |
software-properties-common \
|
|
|
11 |
ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime && \
|
12 |
echo "Etc/UTC" > /etc/timezone
|
13 |
|
14 |
+
# Install Python 3.11
|
15 |
RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update && \
|
16 |
apt-get install -y python3.11 python3.11-dev
|
17 |
|
18 |
+
# Install pip
|
19 |
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
|
|
|
20 |
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
|
21 |
|
22 |
+
# Install Python packages (torch, Transformers, Accelerate, etc.)
|
23 |
RUN python -m pip install --no-cache-dir \
|
24 |
torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 && \
|
25 |
python -m pip install --no-cache-dir \
|
|
|
33 |
safetensors \
|
34 |
bitsandbytes
|
35 |
|
36 |
+
# Helps reduce CUDA memory fragmentation
|
37 |
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
38 |
|
39 |
# Set your cache directories
|
|
|
44 |
ENV XDG_CACHE_HOME=/workspace/.cache
|
45 |
ENV TRITON_CACHE_DIR=/workspace/.cache/triton
|
46 |
|
47 |
+
# Create directories and set permissions
|
48 |
RUN mkdir -p /workspace/outputs \
|
49 |
&& mkdir -p /workspace/.huggingface \
|
50 |
&& mkdir -p /workspace/.cache/triton \
|
|
|
52 |
|
53 |
WORKDIR /workspace
|
54 |
|
55 |
+
# Copy training script and data
|
56 |
COPY finetune_script.py /workspace/
|
57 |
COPY train.csv /workspace/train.csv
|
58 |
COPY valid.csv /workspace/valid.csv
|
59 |
|
60 |
+
# Preconfigure Accelerate for multi-GPU usage (4 GPUs, fp16)
|
61 |
RUN mkdir -p /root/.cache/huggingface/accelerate && \
|
62 |
+
echo '{"compute_environment": "LOCAL_MACHINE", "distributed_type": "MULTI_GPU", "num_processes": 4, "mixed_precision": "fp16", "machine_rank": 0, "main_training_function": "main", "use_cpu": false, "num_machines": 1}' \
|
63 |
+
> /root/.cache/huggingface/accelerate/default_config.yaml
|
64 |
|
65 |
+
# Launch your training script on 4 GPUs with fp16
|
66 |
CMD ["accelerate", "launch", "--num_processes=4", "--mixed_precision=fp16", "finetune_script.py"]
|