Spaces: Runtime error
# Use CUDA 11.8 with cuDNN 8
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04

ENV DEBIAN_FRONTEND=noninteractive

# Install basic utilities
RUN apt-get update && apt-get install -y \
        tzdata \
        software-properties-common \
        curl && \
    ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime && \
    echo "Etc/UTC" > /etc/timezone

# Install Python 3.11
RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update && \
    apt-get install -y python3.11 python3.11-dev

# Install pip and register python3.11 as the default python
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
# Install Python packages (torch, xFormers, Transformers, etc.)
# Note: xFormers wheels for CUDA 11.8 are served from the cu118 index;
# the nightly torch_nightly.html URL is a find-links page, not a valid --extra-index-url
RUN python -m pip install --no-cache-dir \
        torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 && \
    python -m pip install --no-cache-dir \
        xformers==0.0.27 \
        --extra-index-url https://download.pytorch.org/whl/cu118 && \
    python -m pip install --no-cache-dir \
        transformers \
        accelerate \
        trl \
        unsloth \
        pandas \
        datasets \
        huggingface_hub \
        safetensors \
        bitsandbytes
# Helps reduce CUDA memory fragmentation
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Set cache directories
ENV HF_HOME=/workspace/.huggingface
ENV TRANSFORMERS_CACHE=/workspace/.huggingface
ENV DATASETS_CACHE=/workspace/.cache
ENV TORCH_HOME=/workspace/.cache
ENV XDG_CACHE_HOME=/workspace/.cache
ENV TRITON_CACHE_DIR=/workspace/.cache/triton

# Create directories and set permissions
RUN mkdir -p /workspace/outputs \
    && mkdir -p /workspace/.huggingface \
    && mkdir -p /workspace/.cache/triton \
    && chmod -R 777 /workspace

WORKDIR /workspace

# Copy training script and data
COPY finetune_script.py /workspace/
COPY train.csv /workspace/train.csv
COPY valid.csv /workspace/valid.csv
# Preconfigure Accelerate for multi-GPU usage (4 GPUs, fp16)
RUN mkdir -p /root/.cache/huggingface/accelerate && \
    printf '%s\n' \
        'compute_environment: LOCAL_MACHINE' \
        'distributed_type: MULTI_GPU' \
        'num_machines: 1' \
        'machine_rank: 0' \
        'num_processes: 4' \
        'mixed_precision: fp16' \
        'main_training_function: main' \
        'use_cpu: false' \
        > /root/.cache/huggingface/accelerate/default_config.yaml
# Launch the training script on 4 GPUs with fp16
CMD ["accelerate", "launch", "--num_processes=4", "--mixed_precision=fp16", "finetune_script.py"]
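
For debugging the image locally before pushing it to Spaces, it can be built and run directly. The commands below are only a sketch: the image tag (finetune-cu118) and the host outputs directory are placeholder names not defined anywhere above, and --gpus all assumes the NVIDIA Container Toolkit is installed on the host.

# Build from the directory containing the Dockerfile, finetune_script.py, train.csv, and valid.csv
docker build -t finetune-cu118 .

# Run on all visible GPUs; --shm-size raises the shared-memory limit that
# multi-GPU NCCL and DataLoader workers often exhaust at the default 64 MB,
# and the bind mount keeps /workspace/outputs after the container exits
docker run --rm --gpus all --shm-size=1g \
    -v "$(pwd)/outputs:/workspace/outputs" \
    finetune-cu118

Because the CMD above already calls accelerate launch with 4 processes and fp16, no extra arguments are needed; appending a different command after the image name would override it.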