#!/bin/bash
#SBATCH --job-name=google_sp
#SBATCH --nodes=1
#SBATCH --cpus-per-task=100
#SBATCH --ntasks-per-node=1
#SBATCH -o %x-%j.log
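# Train a SentencePiece BPE tokenizer with Google's spm_train on a pre-shuffled
# corpus. The CPU request above (100 per task) matches --num_threads below.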
set -x -e
echo "START TIME: $(date)"
# spm_train from a local SentencePiece build (a DESTDIR-style install, hence
# the nested usr/local path); its shared libraries live alongside it.
BIN_PATH=/cognitive_comp/gaoxinyu/sentencepiece/sentencepiece/bin/usr/local/bin/spm_train
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/cognitive_comp/gaoxinyu/sentencepiece/sentencepiece/bin/usr/local/lib
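# Optional guard (a sketch, not part of the original script): fail fast if the
# local spm_train build is missing, rather than inside the SLURM allocation.
# [ -x "${BIN_PATH}" ] || { echo "spm_train not found at ${BIN_PATH}" >&2; exit 1; }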
# Full shuffled corpus (~59M lines, per the filename) and a 1M-line subset.
# INPUT_FILE_SMALL is not used in the training command below; presumably it is
# kept for quick dry runs (see the sanity check at the end of the script).
INPUT_FILE=/cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/tokenizer/sentencepiece/shuffle_corpus_59132213.txt
INPUT_FILE_SMALL=/cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/tokenizer/sentencepiece/shuffle_corpus_1000000.txt
VOCAB_SIZE=40000           # target vocabulary size
COV=0.9995                 # character coverage; 0.9995 is the value the
                           # SentencePiece docs suggest for CJK-rich corpora
MAX_LENGTH=6               # maximum length of a single sentencepiece
TYPE=bpe                   # model type: unigram, bpe, char, or word
SEED=42                    # random seed, for reproducible runs
INPUT_SENTENCE_SIZE=100000 # number of sentences sampled for training
OPTION="\
--input=${INPUT_FILE} \
--vocab_size=${VOCAB_SIZE} \
--character_coverage=${COV} \
--max_sentencepiece_length=${MAX_LENGTH} \
--model_type=${TYPE} \
--model_prefix=${TYPE}_v${VOCAB_SIZE}_s${SEED}_cov${COV}_max${MAX_LENGTH} \
--random_seed=${SEED} \
--max_sentence_length=100000 \
--shuffle_input_sentence=true \
--input_sentence_size=${INPUT_SENTENCE_SIZE} \
--minloglevel=1 \
--num_threads=100 \
--train_extremely_large_corpus=true \
"
# Launch training with the assembled flags.
eval "${BIN_PATH} ${OPTION}"
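echo "END TIME: $(date)"
# Optional sanity check (a sketch, assuming spm_encode was built alongside
# spm_train in the same DESTDIR tree, which this script does not guarantee):
# feed a sample line through the freshly trained model and eyeball the pieces.
# MODEL=${TYPE}_v${VOCAB_SIZE}_s${SEED}_cov${COV}_max${MAX_LENGTH}.model
# SPM_ENCODE=$(dirname "${BIN_PATH}")/spm_encode
# head -n 1 ${INPUT_FILE_SMALL} | ${SPM_ENCODE} --model=${MODEL} --output_format=piece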