# Use CUDA 11.8 with cuDNN 8
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04

ENV DEBIAN_FRONTEND=noninteractive

# Basic utilities
RUN apt-get update && apt-get install -y \
        tzdata \
        software-properties-common \
        curl && \
    ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime && \
    echo "Etc/UTC" > /etc/timezone

# Install Python 3.11
RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update && \
    apt-get install -y python3.11 python3.11-dev

# Install pip
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1

# Install Python packages (torch, Transformers, Accelerate, etc.)
RUN python -m pip install --no-cache-dir \
        torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 && \
    python -m pip install --no-cache-dir \
        transformers \
        accelerate \
        trl \
        unsloth \
        pandas \
        datasets \
        huggingface_hub \
        safetensors \
        bitsandbytes

# Helps reduce CUDA memory fragmentation
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Set your cache directories
ENV HF_HOME=/workspace/.huggingface
ENV TRANSFORMERS_CACHE=/workspace/.huggingface
ENV HF_DATASETS_CACHE=/workspace/.cache
ENV TORCH_HOME=/workspace/.cache
ENV XDG_CACHE_HOME=/workspace/.cache
ENV TRITON_CACHE_DIR=/workspace/.cache/triton

# Create directories and set permissions
RUN mkdir -p /workspace/outputs \
    && mkdir -p /workspace/.huggingface \
    && mkdir -p /workspace/.cache/triton \
    && chmod -R 777 /workspace

WORKDIR /workspace

# Copy training script and data
COPY finetune_script.py /workspace/
COPY train.csv /workspace/train.csv
COPY valid.csv /workspace/valid.csv

# Preconfigure Accelerate for multi-GPU usage (4 GPUs, fp16).
# With HF_HOME set above, Accelerate looks for its default config under
# $HF_HOME/accelerate, so the file is written there.
RUN mkdir -p /workspace/.huggingface/accelerate && \
    echo '{"compute_environment": "LOCAL_MACHINE", "distributed_type": "MULTI_GPU", "num_processes": 4, "mixed_precision": "fp16", "machine_rank": 0, "main_training_function": "main", "use_cpu": false, "num_machines": 1}' \
    > /workspace/.huggingface/accelerate/default_config.yaml

# Launch your training script on 4 GPUs with fp16
CMD ["accelerate", "launch", "--num_processes=4", "--mixed_precision=fp16", "finetune_script.py"]
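
# Example usage (a sketch; the image tag "llm-finetune" and the shared-memory size
# are placeholders, adjust to your setup):
#   docker build -t llm-finetune .
#   docker run --gpus all --shm-size=16g llm-finetune
# --gpus all requires the NVIDIA Container Toolkit on the host; a larger --shm-size
# helps avoid shared-memory errors from PyTorch DataLoader workers in multi-GPU runs.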