name: "dyu_fr_transformer-sp" joeynmt_version: "2.3.0" model_dir: "saved_model/dyu_fr" use_cuda: True # False for CPU training fp16: True data: train: "data/dyu_fr" dev: "data/dyu_fr" # test: "data/dyu_fr" dataset_type: "huggingface" dataset_cfg: name: "dyu-fr" sample_dev_subset: 1460 src: lang: "dyu" max_length: 100 lowercase: False normalize: False level: "bpe" voc_limit: 4000 voc_min_freq: 1 voc_file: "data/dyu_fr/vocab.txt" tokenizer_type: "sentencepiece" tokenizer_cfg: model_file: "data/dyu_fr/sp.model" trg: lang: "fr" max_length: 100 lowercase: False normalize: False level: "bpe" voc_limit: 4000 voc_min_freq: 1 voc_file: "data/dyu_fr/vocab.txt" tokenizer_type: "sentencepiece" tokenizer_cfg: model_file: "data/dyu_fr/sp.model" special_symbols: unk_token: "" unk_id: 0 pad_token: "" pad_id: 1 bos_token: "" bos_id: 2 eos_token: "" eos_id: 3 testing: load_model: "models/best.ckpt" n_best: 1 beam_size: 10 beam_alpha: 1.2 batch_size: 256 batch_type: "token" max_output_length: 100 eval_metrics: ["bleu"] #return_prob: "hyp" #return_attention: False sacrebleu_cfg: tokenize: "13a" training: load_model: "joeynmt-models-v11.0/30600.ckpt" #reset_best_ckpt: False #reset_scheduler: False #reset_optimizer: False #reset_iter_state: False random_seed: 42 optimizer: "adamw" normalization: "tokens" adam_betas: [0.9, 0.98] scheduling: "warmupinversesquareroot" learning_rate_warmup: 8000 learning_rate: 0.0003 learning_rate_min: 0.00000001 weight_decay: 0.0001 label_smoothing: 0.1 loss: "crossentropy" batch_size: 8192 batch_type: "token" batch_multiplier: 4 early_stopping_metric: "bleu" epochs: 1800 updates: 90000 validation_freq: 50 logging_freq: 10 overwrite: True shuffle: True print_valid_sents: [0, 1, 2, 3] keep_best_ckpts: 3 model: initializer: "xavier_uniform" bias_initializer: "zeros" init_gain: 1.0 embed_initializer: "xavier_uniform" embed_init_gain: 1.0 tied_embeddings: True tied_softmax: True encoder: type: "transformer" num_layers: 6 num_heads: 4 embeddings: embedding_dim: 256 scale: True dropout: 0.1 # typically ff_size = 4 x hidden_size hidden_size: 256 ff_size: 1024 dropout: 0.2 layer_norm: "pre" decoder: type: "transformer" num_layers: 6 num_heads: 4 embeddings: embedding_dim: 256 scale: True dropout: 0.1 # typically ff_size = 4 x hidden_size hidden_size: 256 ff_size: 1024 dropout: 0.2 layer_norm: "pre"