# train_70b_4bit / Dockerfile
# Maintainer: daresearch
# Last update: commit 2987771 (verified)
# Use CUDA 11.8 with cuDNN 8. The -devel variant ships nvcc, which is needed
# to build CUDA extensions (e.g. bitsandbytes / xformers kernels) at pip time.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04

# Build-time only: suppress interactive apt prompts (tzdata asks for a zone).
# ARG instead of ENV so the setting does not leak into the runtime container.
ARG DEBIAN_FRONTEND=noninteractive

# Install basic utilities.
# --no-install-recommends keeps the layer small; the apt list cache is removed
# in the same layer so it never persists in the image (hadolint DL3009/DL3015).
# ca-certificates is required for the HTTPS curl used to bootstrap pip below.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        software-properties-common \
        tzdata \
    && ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime \
    && echo "Etc/UTC" > /etc/timezone \
    && rm -rf /var/lib/apt/lists/*
# Install Python 3.11 from the deadsnakes PPA (Ubuntu 20.04 ships 3.8).
# python3.11-distutils is required by get-pip.py on deadsnakes builds.
RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update \
    && apt-get install -y --no-install-recommends \
        python3.11 \
        python3.11-dev \
        python3.11-distutils \
    && rm -rf /var/lib/apt/lists/*

# Use bash with pipefail for subsequent RUNs so a failed curl is not masked
# by the pipe (hadolint DL4006).
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Bootstrap pip for Python 3.11. -f makes curl fail (non-zero exit) on HTTP
# errors instead of piping an error page into the interpreter.
RUN curl -fsS https://bootstrap.pypa.io/get-pip.py | python3.11

# Make `python` resolve to python3.11 (leaves the system `python3` untouched,
# which Ubuntu's own tooling depends on).
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
# Install Python packages (torch, xFormers, Transformers, etc.)
# Step 1: torch/torchvision/torchaudio from the stable CUDA 11.8 wheel index,
#         matching the cu118 base image.
# Step 2: xformers pinned to 0.0.27, with the cu118 *nightly* torch index as an
#         extra source for its dependencies.
# Step 3: the unpinned training/model stack.
# NOTE(review): mixing the stable cu118 index (step 1) with the nightly index
# (step 2) can let pip replace torch with a nightly build if the xformers pin
# requires it — verify the resolved torch version in the built image.
# NOTE(review): transformers/accelerate/trl/unsloth/etc. are unpinned, so
# rebuilds are not reproducible; consider pinning versions (hadolint DL3013).
RUN python -m pip install --no-cache-dir \
torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 && \
python -m pip install --no-cache-dir \
xformers==0.0.27 \
--extra-index-url https://download.pytorch.org/whl/nightly/cu118/torch_nightly.html && \
python -m pip install --no-cache-dir \
transformers \
accelerate \
trl \
unsloth \
pandas \
datasets \
huggingface_hub \
safetensors \
bitsandbytes
# Helps reduce CUDA memory fragmentation on long training runs.
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Cache directories, grouped into a single ENV layer.
# - HF_HOME: root for Hugging Face caches and tokens.
# - TRANSFORMERS_CACHE: legacy override, deprecated in recent transformers
#   releases in favor of HF_HOME; kept for older library versions.
# - HF_DATASETS_CACHE: the variable the `datasets` library actually reads.
#   (DATASETS_CACHE is not consulted by `datasets`; kept in case any project
#   script reads it directly.)
ENV HF_HOME=/workspace/.huggingface \
    TRANSFORMERS_CACHE=/workspace/.huggingface \
    DATASETS_CACHE=/workspace/.cache \
    HF_DATASETS_CACHE=/workspace/.cache \
    TORCH_HOME=/workspace/.cache \
    XDG_CACHE_HOME=/workspace/.cache \
    TRITON_CACHE_DIR=/workspace/.cache/triton
# Create output and cache directories up front and open their permissions so
# the container can write to them when launched under an arbitrary UID
# (common on shared GPU platforms).
# NOTE(review): chmod -R 777 is broad — presumably intentional for arbitrary
# runtime UIDs; a dedicated user + --chown would be tighter. Confirm before
# narrowing.
RUN mkdir -p \
        /workspace/outputs \
        /workspace/.huggingface \
        /workspace/.cache/triton \
    && chmod -R 777 /workspace
WORKDIR /workspace

# Copy the training script and both CSV splits into the workspace.
# (Multi-source COPY: each file lands at /workspace/<basename>, identical to
# copying them one at a time.)
COPY finetune_script.py train.csv valid.csv /workspace/
# Preconfigure Accelerate for multi-GPU usage (4 GPUs, fp16).
# Written as real YAML to match the .yaml filename. (The original emitted a
# JSON blob, which accelerate also accepts only because JSON is a YAML
# subset.)
RUN mkdir -p /root/.cache/huggingface/accelerate && \
    printf '%s\n' \
        'compute_environment: LOCAL_MACHINE' \
        'distributed_type: MULTI_GPU' \
        'num_machines: 1' \
        'machine_rank: 0' \
        'num_processes: 4' \
        'mixed_precision: fp16' \
        'main_training_function: main' \
        'use_cpu: false' \
        > /root/.cache/huggingface/accelerate/default_config.yaml

# Launch the training script on 4 GPUs with fp16 (exec form: PID 1 receives
# SIGTERM from `docker stop`). The explicit flags restate the config defaults
# so the launch is correct even if the config file is absent or overridden.
CMD ["accelerate", "launch", "--num_processes=4", "--mixed_precision=fp16", "finetune_script.py"]