#!/usr/bin/env bash
#SBATCH --job-name=google_sp
#SBATCH --nodes=1
#SBATCH --cpus-per-task=100
#SBATCH --ntasks-per-node=1
#SBATCH -o %x-%j.log

# SLURM job: train a SentencePiece BPE tokenizer on a pre-shuffled corpus
# using a locally built spm_train binary.
#
# NOTE(review): the original file had a stray " | |" suffix on every line
# (table-extraction residue) and no shebang; both are fixed here. sbatch
# only honors #SBATCH directives that follow the interpreter line.

set -euo pipefail
set -x

echo "START TIME: $(date)"

# Locally built spm_train binary and its shared libraries.
readonly BIN_PATH=/cognitive_comp/gaoxinyu/sentencepiece/sentencepiece/bin/usr/local/bin/spm_train
# ${LD_LIBRARY_PATH:-} guards against 'set -u' when the variable is unset.
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:/cognitive_comp/gaoxinyu/sentencepiece/sentencepiece/bin/usr/local/lib

# Full training corpus (pre-shuffled, ~59M lines by filename).
readonly INPUT_FILE=/cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/tokenizer/sentencepiece/shuffle_corpus_59132213.txt
# Smaller 1M-line corpus for quick experiments (currently unused; kept for
# easy swapping into --input).
readonly INPUT_FILE_SMALL=/cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/tokenizer/sentencepiece/shuffle_corpus_1000000.txt

# Training hyper-parameters; all are baked into the model prefix so runs
# with different settings never clobber each other's output files.
readonly VOCAB_SIZE=40000
readonly COV=0.9995            # character coverage
readonly MAX_LENGTH=6          # max sentencepiece length
readonly TYPE=bpe              # model type
readonly SEED=42               # random seed for reproducibility
readonly MAX_INPUT_LENGTH=100000   # sentences sampled for training

# Build the argument list as an array instead of an eval'd string:
# safe quoting, no word-splitting surprises, no eval.
OPTIONS=(
  --input="${INPUT_FILE}"
  --vocab_size="${VOCAB_SIZE}"
  --character_coverage="${COV}"
  --max_sentencepiece_length="${MAX_LENGTH}"
  --model_type="${TYPE}"
  --model_prefix="${TYPE}_v${VOCAB_SIZE}_s${SEED}_cov${COV}_max${MAX_LENGTH}"
  --random_seed="${SEED}"
  --max_sentence_length=100000
  --shuffle_input_sentence=true
  --input_sentence_size="${MAX_INPUT_LENGTH}"
  --minloglevel 1              # suppress INFO logging (keep WARNING and up)
  --num_threads=100            # matches --cpus-per-task above
  --train_extremely_large_corpus=true
)

"${BIN_PATH}" "${OPTIONS[@]}"