# bkaiHackathon2024 / Dockerfile.train_bi
# coang's picture
# initial commit
# f889ba2
# Base image: NVIDIA CUDA 12.1.1 "devel" variant on Ubuntu 22.04.
FROM nvidia/cuda:12.1.1-devel-ubuntu22.04

# OS-level packages (one per line, sorted for easy diffing). The apt package
# lists are removed in the same layer so they do not persist in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        git \
        libjpeg-dev \
        libpng-dev \
        wget \
    && rm -rf /var/lib/apt/lists/*
# Conda install prefix; referenced by the PATH entry below and by the
# Miniconda install step.
ENV CONDA_DIR=/opt/conda
# Kept as its own instruction: a variable set by ENV is only substituted in
# later instructions, so PATH must be defined after CONDA_DIR.
ENV PATH=$CONDA_DIR/bin:$PATH
# CUDA toolkit root. NOTE(review): presumably read by packages that compile
# CUDA code at install/run time -- confirm which consumer needs it.
ENV CUDA_HOME=/usr/local/cuda
# Miniconda installer release to download. The default "latest" preserves the
# previous behaviour; pass --build-arg MINICONDA_VERSION=<release> to pin a
# specific installer for reproducible builds.
ARG MINICONDA_VERSION=latest
# Download the installer, run it non-interactively (-b) into $CONDA_DIR (-p),
# then delete the installer and conda's caches in the same layer so neither
# is kept in the image.
RUN wget --quiet "https://repo.anaconda.com/miniconda/Miniconda3-${MINICONDA_VERSION}-Linux-x86_64.sh" -O /tmp/miniconda.sh \
    && /bin/bash /tmp/miniconda.sh -b -p "$CONDA_DIR" \
    && rm /tmp/miniconda.sh \
    && "$CONDA_DIR/bin/conda" clean -ya
# Create the "docker" conda environment (Python 3.11) that all later install
# and run steps target. The conda caches are cleaned in the same layer so the
# downloaded package archives are not stored in the image.
RUN conda create -n docker python=3.11 -y \
    && conda clean -ya
WORKDIR /app

# Only the requirements file is copied before the dependency install, so this
# layer is rebuilt when requirements_lang.txt changes rather than on every
# source edit.
COPY requirements_lang.txt /app/requirements_lang.txt

# Install PyTorch (CUDA 12.1 build) into the "docker" environment, then the
# pip dependencies. `conda clean -ya` runs in the same layer so the downloaded
# conda package archives do not remain in the image; pip uses --no-cache-dir
# for the same reason.
RUN conda install -n docker -c pytorch -c nvidia \
        pytorch torchvision torchaudio pytorch-cuda=12.1 -y \
    && conda clean -ya \
    && conda run -n docker pip install --no-cache-dir -r /app/requirements_lang.txt \
    && conda run -n docker pip install --no-cache-dir pandas deepspeed sentencepiece tqdm
# Copy the full build context after the dependency layers so that source
# changes do not invalidate them.
COPY . /app
# Editable install of the project into the "docker" environment.
# --no-cache-dir keeps pip's download cache out of the layer, consistent with
# the other pip invocations in this file.
RUN conda run -n docker pip install --no-cache-dir -e .
# Run the container command inside the "docker" conda environment.
# --no-capture-output: by default `conda run` captures the child's
# stdout/stderr instead of streaming them, which hides live training logs
# from `docker logs` while the job is running.
ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "docker"]
# Default command (appended to ENTRYPOINT): launch the FlagEmbedding
# bi-encoder fine-tuning module through torchrun with 4 processes per node.
# Relative paths (output_dir/, data/, FlagEmbedding/) resolve against the
# image's working directory.
# NOTE(review): "--normlized" is passed verbatim; presumably it matches the
# argument name defined by the fine-tuning script -- confirm before
# "correcting" the spelling.
CMD [ \
    "torchrun", "--nproc_per_node", "4", \
    "-m", "FlagEmbedding.baai_general_embedding.finetune.run", \
    "--output_dir", "output_dir/ckpt_bi_encoder", \
    "--model_name_or_path", "BAAI/bge-m3", \
    "--train_data", "data/bi/train_1024_chunks.jsonl", \
    "--learning_rate", "1e-5", \
    "--num_train_epochs", "20", \
    "--per_device_train_batch_size", "2", \
    "--dataloader_drop_last", "True", \
    "--deepspeed", "FlagEmbedding/BGE_M3/ds_config.json", \
    "--normlized", "True", \
    "--temperature", "0.02", \
    "--warmup_ratio", "0.1", \
    "--query_max_len", "128", \
    "--passage_max_len", "1024", \
    "--train_group_size", "4", \
    "--negatives_cross_device", \
    "--logging_steps", "10", \
    "--fp16", \
    "--save_steps", "1000" \
]