---
# Configuration restored to block-style YAML, one key per line. The previous
# single-line form contained several ": " sequences inside one plain scalar,
# which YAML parsers do not accept as a mapping.
#
# NOTE(review): the nesting below was reconstructed from key order only.
# `dataset` had no scalar value of its own, so the keys that followed it
# (path, data_dir, name, split, column) are assumed to be its children, and the
# last three keys are assumed to be top-level. Confirm against the code that
# loads this file.

cls: HF
base_tokenizer_path: microsoft/Phi-3-mini-128k-instruct

dataset:
  path: allenai/c4
  data_dir: fr
  name: c4_fr
  split: train
  # presumably the dataset field that holds the raw text; verify with consumer
  column: text

# NOTE(review): units and exact semantics of the values below are not visible
# from this file; confirm with the consumer before changing them.
target_num_hyper_token: 10
batch_size: 1000
total_training_size: 100000