diff --git a/lms/afr/word_5gram.arpa b/lms/afr/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..b66dbf31c5fb9cdc7697c6bdc12c2d603dad04b7 --- /dev/null +++ b/lms/afr/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/afr/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/afr/word_5gram.bin b/lms/afr/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..676e6de78a551bee49906911b1ceb5b21685b847 --- /dev/null +++ b/lms/afr/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/afr/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/amh/word_5gram.arpa b/lms/amh/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..f2885ad1786493d3a1e120af33993a4cd433c8a9 --- /dev/null +++ b/lms/amh/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/amh/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/amh/word_5gram.bin b/lms/amh/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..c2de099bfe7e251ee14a532142e5a6cd1064c2e9 --- /dev/null +++ b/lms/amh/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/amh/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ara/word_5gram.arpa b/lms/ara/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..f5019abe15007f4c9c9bfa2ebd49544224dd97d3 --- /dev/null +++ b/lms/ara/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ara/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ara/word_5gram.bin b/lms/ara/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..4afaea52840ee65e28fd863560296d0d35991d80 --- /dev/null +++ b/lms/ara/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ara/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/asm/word_5gram.arpa b/lms/asm/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..4822ab9114db22b3766f275cfd3d684aae64046b --- /dev/null +++ b/lms/asm/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/asm/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/asm/word_5gram.bin b/lms/asm/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..5a50767d276e61784646730d6bc19c3fd7eea489 --- /dev/null +++ b/lms/asm/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/asm/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ast/word_5gram.arpa b/lms/ast/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..6e7e5cfb3040df92d1a2b2ff338c662b2650b2e2 --- /dev/null +++ b/lms/ast/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ast/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ast/word_5gram.bin b/lms/ast/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..d0111d3b3bc3c6b81c4a2029fbddb83c7f373e81 --- /dev/null +++ b/lms/ast/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ast/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/azj-script_latin/word_5gram.arpa b/lms/azj-script_latin/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..08daab5898c6a87fb2647d044db2fc5ae6067458 --- /dev/null +++ b/lms/azj-script_latin/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/azj/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/azj-script_latin/word_5gram.bin b/lms/azj-script_latin/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..1ae7ba53a210af22287d1286d46bd425fffea567 --- /dev/null +++ b/lms/azj-script_latin/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/azj/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/bel/word_5gram.arpa b/lms/bel/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..329cb139339cf90940cbdb9ca56cd3aa009d7d9f --- /dev/null +++ b/lms/bel/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/bel/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/bel/word_5gram.bin b/lms/bel/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..1bd6e1f9cbc6875cdca9b3161d45d131d4f5f8e7 --- /dev/null +++ b/lms/bel/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/bel/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ben/word_5gram.arpa b/lms/ben/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..a9929f3499280185c3845d5ceda4e281a720670a --- /dev/null +++ b/lms/ben/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ben/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ben/word_5gram.bin b/lms/ben/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..5f5f336926d869a5f362392a378c10671d76f48b --- /dev/null +++ b/lms/ben/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ben/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/bos/word_5gram.arpa b/lms/bos/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..cc5b289d2786f5339940f950181be1eba5382d48 --- /dev/null +++ b/lms/bos/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/bos/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/bos/word_5gram.bin b/lms/bos/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..9b03ebc9390ce94638109d2255763b4076085216 --- /dev/null +++ b/lms/bos/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/bos/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/bul/word_5gram.arpa b/lms/bul/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..66da5023bf6134c25362150866ea0fd291a15c1c --- /dev/null +++ b/lms/bul/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/bul/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/bul/word_5gram.bin b/lms/bul/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..d3556df0d68b5e4977cb59d0b86d9f1167531784 --- /dev/null +++ b/lms/bul/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/bul/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/cat/word_5gram.arpa b/lms/cat/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..05be56efd68f4bc64a68a19e02314f6f8cf146cc --- /dev/null +++ b/lms/cat/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/cat/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/cat/word_5gram.bin b/lms/cat/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..bd9ee4ba0033ac3c0061055d95dab4d58e66f3c7 --- /dev/null +++ b/lms/cat/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/cat/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ceb/word_5gram.arpa b/lms/ceb/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..64fe39510459b43f6dfa018bb489a08ca8b28218 --- /dev/null +++ b/lms/ceb/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ceb/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ceb/word_5gram.bin b/lms/ceb/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..b7456a675172865bf88a6a39068d490b96d35613 --- /dev/null +++ b/lms/ceb/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ceb/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ces/word_5gram.arpa b/lms/ces/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..c5252303420c7f7c83c9451fa35708c5cfd2f52e --- /dev/null +++ b/lms/ces/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ces/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ces/word_5gram.bin b/lms/ces/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..9624cbab584db0c11404cf445980a520d0922d3b --- /dev/null +++ b/lms/ces/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ces/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ckb/word_5gram.arpa b/lms/ckb/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..7150d59cf548a6570c05658e3cbdcb8d3754613d --- /dev/null +++ b/lms/ckb/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ckb/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ckb/word_5gram.bin b/lms/ckb/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..0c0b0918a82e7805e1b460116ec9525007245d29 --- /dev/null +++ b/lms/ckb/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ckb/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/cmn-script_simplified/char_20gram.arpa b/lms/cmn-script_simplified/char_20gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..edf14a07f6a5199abfd9878a8581bdb353c65923 --- /dev/null +++ b/lms/cmn-script_simplified/char_20gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/cmn/lm_5gram_prune12345_limit250k_fixed.chars.arpa \ No newline at end of file diff --git a/lms/cmn-script_simplified/char_20gram.bin b/lms/cmn-script_simplified/char_20gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..8aa080650360f36422150baa7603774a355d10ad --- /dev/null +++ b/lms/cmn-script_simplified/char_20gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/cmn/lm_5gram_prune12345_limit250k_fixed.chars.bin \ No newline at end of file diff --git a/lms/cym/word_5gram.arpa b/lms/cym/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..d6c928140f562134b2157def31f97d928f44424e --- /dev/null +++ b/lms/cym/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/cym/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/cym/word_5gram.bin b/lms/cym/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..9468b6f94e5f7bec677778239bc627587561fc53 --- /dev/null +++ b/lms/cym/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/cym/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/dan/word_5gram.arpa b/lms/dan/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..8a801b284ba1b5774bc4a662ec787baab28234d9 --- /dev/null +++ b/lms/dan/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/dan/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/dan/word_5gram.bin b/lms/dan/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..aa1d8df913a50de878ca04b6d2cbc562341a5567 --- /dev/null +++ b/lms/dan/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/dan/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/deu/word_5gram.arpa b/lms/deu/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..aef58a7149b5c8ab40ec829ac2ed1dea4bc30af4 --- /dev/null +++ b/lms/deu/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/deu/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/deu/word_5gram.bin b/lms/deu/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..ba2b79f96e81c504a074e9f55c45c559d2a23b2f --- /dev/null +++ b/lms/deu/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/deu/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ell/word_5gram.arpa b/lms/ell/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..cf76a8672f4ef2bb98320788b724dc36910011e5 --- /dev/null +++ b/lms/ell/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ell/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ell/word_5gram.bin b/lms/ell/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..381e74af7ac7d2eb2e61c7e211c279d130daa767 --- /dev/null +++ b/lms/ell/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ell/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/eng/word_5gram.arpa b/lms/eng/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..4f3bf1f51090d20fa2aef28133d6290bb8d9dd78 --- /dev/null +++ b/lms/eng/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/eng/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/eng/word_5gram.bin b/lms/eng/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..660bd6c148b3c006cbf9389c35c8d830e660ffc2 --- /dev/null +++ b/lms/eng/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/eng/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/est/word_5gram.arpa b/lms/est/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..a66589f1f47ccf21dcab3d561705119c932f357c --- /dev/null +++ b/lms/est/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/est/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/est/word_5gram.bin b/lms/est/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..2e9093634b8215bba1ffea8839869b59018141e4 --- /dev/null +++ b/lms/est/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/est/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/fas/word_5gram.arpa b/lms/fas/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..1d8aa759f42688264c3ce4e5090a7cf45f05b186 --- /dev/null +++ b/lms/fas/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/fas/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/fas/word_5gram.bin b/lms/fas/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..30216bd3b41579c49a04cf7770e005687d90372d --- /dev/null +++ b/lms/fas/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/fas/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/fin/word_5gram.arpa b/lms/fin/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..525118f35c6c1ecd942ef7eac358fa98596c4fbe --- /dev/null +++ b/lms/fin/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/fin/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/fin/word_5gram.bin b/lms/fin/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..96037022477f30730422cf3ed15f49acbfc35394 --- /dev/null +++ b/lms/fin/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/fin/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/fra/word_5gram.arpa b/lms/fra/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..68efb13cf93f7daa2108162776afc30e88e0c9c1 --- /dev/null +++ b/lms/fra/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/fra/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/fra/word_5gram.bin b/lms/fra/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..2bb470d45966c2cb74dc38e46c8ad31ae8bf98c4 --- /dev/null +++ b/lms/fra/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/fra/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ful/word_5gram.arpa b/lms/ful/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..47d8e4e5f949971049f8a7e06aeb54dd89861301 --- /dev/null +++ b/lms/ful/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ful/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ful/word_5gram.bin b/lms/ful/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..f5bcff514e8cdc9e7736db4531eb52d5a6457ad1 --- /dev/null +++ b/lms/ful/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ful/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/gle/word_5gram.arpa b/lms/gle/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..580376b45861c48cbe6eec897be9f6d843be707b --- /dev/null +++ b/lms/gle/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/gle/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/gle/word_5gram.bin b/lms/gle/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..05d48f57b941057958f6a84a63a4640c2233c10a --- /dev/null +++ b/lms/gle/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/gle/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/glg/word_5gram.arpa b/lms/glg/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..70f5b106dc17d0a12647c1082b3e0b6d678bdb0b --- /dev/null +++ b/lms/glg/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/glg/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/glg/word_5gram.bin b/lms/glg/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..0b3e01332f4e3bcc313775d2fd4180b3226503b5 --- /dev/null +++ b/lms/glg/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/glg/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/guj/word_5gram.arpa b/lms/guj/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..e710e436e4afa0e5eaa5936d2e0a1bfa30981747 --- /dev/null +++ b/lms/guj/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/guj/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/guj/word_5gram.bin b/lms/guj/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..7533c4ca10fd4ed7279d09c49b91b8f6807c5c5b --- /dev/null +++ b/lms/guj/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/guj/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/hau/word_5gram.arpa b/lms/hau/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..8f6177d45e50be2abde51e5b89be5f2fa826cdd7 --- /dev/null +++ b/lms/hau/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/hau/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/hau/word_5gram.bin b/lms/hau/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..7a2cc56cc2ba693d0983caaa11fc72cd178c528d --- /dev/null +++ b/lms/hau/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/hau/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/heb/word_5gram.arpa b/lms/heb/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..ac42a1fac99a4f22cf2e5bc61bc446bace5ee6ef --- /dev/null +++ b/lms/heb/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/heb/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/heb/word_5gram.bin b/lms/heb/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..df3e86604edf4e910bb28853821ee5b002ce3133 --- /dev/null +++ b/lms/heb/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/heb/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/hin/word_5gram.arpa b/lms/hin/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..821bf800dbc54d36c29201e90b7e95fe2540ea82 --- /dev/null +++ b/lms/hin/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/hin/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/hin/word_5gram.bin b/lms/hin/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..dd9d09f2ad31db21aeaaed032acb26e3b4e9bfb9 --- /dev/null +++ b/lms/hin/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/hin/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/hrv/word_5gram.arpa b/lms/hrv/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..b2b854055412dfb3d04f4898290d42b3bceff60d --- /dev/null +++ b/lms/hrv/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/hrv/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/hrv/word_5gram.bin b/lms/hrv/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..12087e90f5a230fe08cabda8f902cf32fbe8e0ce --- /dev/null +++ b/lms/hrv/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/hrv/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/hun/word_5gram.arpa b/lms/hun/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..d7b83682bb27ff81f11c018d5c9e557cc53e766f --- /dev/null +++ b/lms/hun/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/hun/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/hun/word_5gram.bin b/lms/hun/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..2ea8d99ce0a55ad815543effc253de1e3fafe1d8 --- /dev/null +++ b/lms/hun/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/hun/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/hye/word_5gram.arpa b/lms/hye/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..1d1643fc2ac8f0163d7a2d57a2edc028b752b23d --- /dev/null +++ b/lms/hye/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/hye/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/hye/word_5gram.bin b/lms/hye/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..b53bbc678881b810bf98ab117a906bd53ba5af13 --- /dev/null +++ b/lms/hye/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/hye/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ibo/word_5gram.arpa b/lms/ibo/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..423d1ccace3d3efc90aa93fef7cacd6d34957b37 --- /dev/null +++ b/lms/ibo/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ibo/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ibo/word_5gram.bin b/lms/ibo/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..e90fac353812c5d1e8d7372d74e67deee4d41ba1 --- /dev/null +++ b/lms/ibo/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ibo/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ind/word_5gram.arpa b/lms/ind/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..b9fcfb5d7f01ae55e1adfed1a779fb38da210c3e --- /dev/null +++ b/lms/ind/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ind/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ind/word_5gram.bin b/lms/ind/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..2dbba8f99b89984b7e7f3e722758b8ee5ef28a0f --- /dev/null +++ b/lms/ind/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ind/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/isl/word_5gram.arpa b/lms/isl/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..c64d2be757d01c93cc5df9c4dc5ce7fa85470db9 --- /dev/null +++ b/lms/isl/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/isl/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/isl/word_5gram.bin b/lms/isl/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..626cd264f73c65f5dd6fb516fbb9f54d58122327 --- /dev/null +++ b/lms/isl/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/isl/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ita/word_5gram.arpa b/lms/ita/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..c602e5452e35145136d30ccb9a05adfc7ed427c4 --- /dev/null +++ b/lms/ita/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ita/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ita/word_5gram.bin b/lms/ita/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..f381aabe5126ba1fb055d7e5ebd27025260e9a8d --- /dev/null +++ b/lms/ita/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ita/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/jav/word_5gram.arpa b/lms/jav/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..f931c58ebd94cfc6063c5977f7e449a3b9aca50e --- /dev/null +++ b/lms/jav/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/jav/lm_5gram_prune00000_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/jav/word_5gram.bin b/lms/jav/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..11c30c1b4ef1a0551d2bc699e4c56023f5b50b55 --- /dev/null +++ b/lms/jav/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/jav/lm_5gram_prune00000_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/jpn/char_20gram.arpa b/lms/jpn/char_20gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..1e9df1d6b35f6f09648771c2e324ed57d3048fcd --- /dev/null +++ b/lms/jpn/char_20gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/jpn/lm_5gram_prune12345_limit250k_fixed.chars.arpa \ No newline at end of file diff --git a/lms/jpn/char_20gram.bin b/lms/jpn/char_20gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..d9c0aa1cb0c93599a9fc9345dee035243f04551c --- /dev/null +++ b/lms/jpn/char_20gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/jpn/lm_5gram_prune12345_limit250k_fixed.chars.bin \ No newline at end of file diff --git a/lms/kam/word_5gram.arpa b/lms/kam/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..174f3492e2dc0093ea2b6ee08d58b573bb73c33b --- /dev/null +++ b/lms/kam/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/kam/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/kam/word_5gram.bin b/lms/kam/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..4df78a0fa90e6ffc1932b8d7168581c5d7608a29 --- /dev/null +++ b/lms/kam/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/kam/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/kan/word_5gram.arpa b/lms/kan/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..0c849293c8ae8b1d0eb64efd8377d449ee98e1fd --- /dev/null +++ b/lms/kan/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/kan/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/kan/word_5gram.bin b/lms/kan/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..b095abe4a3e35d1bc0e81e066ef7cbfdcf8f9168 --- /dev/null +++ b/lms/kan/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/kan/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/kat/word_5gram.arpa b/lms/kat/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..776215bd16f90eaca22888d87aabbf69f8563c81 --- /dev/null +++ b/lms/kat/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/kat/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/kat/word_5gram.bin b/lms/kat/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..e602d78b16a4882e01b2a8c14166803348d585bc --- /dev/null +++ b/lms/kat/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/kat/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/kaz/word_5gram.arpa b/lms/kaz/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..0487f7de4acce8b49c04e158807b0abe33fd93ba --- /dev/null +++ b/lms/kaz/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/kaz/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/kaz/word_5gram.bin b/lms/kaz/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..cc5220cd52bdc071a073cac794b8478c9ffed243 --- /dev/null +++ b/lms/kaz/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/kaz/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/kea/word_5gram.arpa b/lms/kea/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..3015eb885002080f255b5a8ac3f4f645eac4c5ff --- /dev/null +++ b/lms/kea/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/kea/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/kea/word_5gram.bin b/lms/kea/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..52d3cf60ad5c710717000d754911f4413c5152d2 --- /dev/null +++ b/lms/kea/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/kea/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/khm/char_20gram.arpa b/lms/khm/char_20gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..6967a212ff6b59927d8ab8284322c0184311a3b2 --- /dev/null +++ b/lms/khm/char_20gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/khm/lm_5gram_prune01234_limit250k_fixed.chars.arpa \ No newline at end of file diff --git a/lms/khm/char_20gram.bin b/lms/khm/char_20gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..a4822ed64d32c5cb04e12f860669ef2df9f38f3b --- /dev/null +++ b/lms/khm/char_20gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/khm/lm_5gram_prune01234_limit250k_fixed.chars.bin \ No newline at end of file diff --git a/lms/kir/word_5gram.arpa b/lms/kir/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..c0c6ceaff4dec1c594fa2fa245fc5e232cf03032 --- /dev/null +++ b/lms/kir/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/kir/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/kir/word_5gram.bin b/lms/kir/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..49a80cf32b597291914194002a065c2829e8cbdf --- /dev/null +++ b/lms/kir/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/kir/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/kor/word_5gram.arpa b/lms/kor/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..b059b3ca1bbe95d328d48b08d1ce5b831e0360a2 --- /dev/null +++ b/lms/kor/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/kor/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/kor/word_5gram.bin b/lms/kor/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..a1fa565ececd87bebe6c830805638114bac238db --- /dev/null +++ b/lms/kor/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/kor/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/lao/char_20gram.arpa b/lms/lao/char_20gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..00f6489810c4176bc2c85387a1018dce8124c829 --- /dev/null +++ b/lms/lao/char_20gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/lao/lm_5gram_prune01234_limit250k_fixed.chars.arpa \ No newline at end of file diff --git a/lms/lao/char_20gram.bin b/lms/lao/char_20gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..87ac51c67d8e32c1d8467cd92c55e14517d1474b --- /dev/null +++ b/lms/lao/char_20gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/lao/lm_5gram_prune01234_limit250k_fixed.chars.bin \ No newline at end of file diff --git a/lms/lav/word_5gram.arpa b/lms/lav/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..3c5a27b585c0c57b87d16576fb5d009f67d2e8cc --- /dev/null +++ b/lms/lav/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/lav/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/lav/word_5gram.bin b/lms/lav/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..3ea04e509d1bc57ff186c6211ed0ab231690dee7 --- /dev/null +++ b/lms/lav/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/lav/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/lin/word_5gram.arpa b/lms/lin/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..8c84838600093a78c0ff708c7af59db1cf47bbd3 --- /dev/null +++ b/lms/lin/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/lin/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/lin/word_5gram.bin b/lms/lin/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..175caaaf5e5e6000aa8777b757254e7ffa7549c7 --- /dev/null +++ b/lms/lin/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/lin/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/lit/word_5gram.arpa b/lms/lit/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..d0dbf50cbf7c3b9bd488e55191149c3d263205ca --- /dev/null +++ b/lms/lit/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/lit/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/lit/word_5gram.bin b/lms/lit/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..38443b7343a4b7a277381bdfe597b56841974135 --- /dev/null +++ b/lms/lit/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/lit/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ltz/word_5gram.arpa b/lms/ltz/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..994120abb28e2d5926ddbb38ac078ffc420869e4 --- /dev/null +++ b/lms/ltz/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ltz/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ltz/word_5gram.bin b/lms/ltz/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..db2e49169ea77bcaabff97d04f8ced88f7ba73f8 --- /dev/null +++ b/lms/ltz/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ltz/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/lug/word_5gram.arpa b/lms/lug/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..8e6f99fe777dea07e51e735fcd4c732329fc1d67 --- /dev/null +++ b/lms/lug/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/lug/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/lug/word_5gram.bin b/lms/lug/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..5317775ed2bce816c55f9cdace11becfe1ff2921 --- /dev/null +++ b/lms/lug/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/lug/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/luo/word_5gram.arpa b/lms/luo/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..92e1356fcfbe93dc5f08fe63f1834af4aa33548f --- /dev/null +++ b/lms/luo/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/luo/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/luo/word_5gram.bin b/lms/luo/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..f7e0349dc56bb340679f68b2e44172fbe7a9b9cf --- /dev/null +++ b/lms/luo/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/luo/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/mal/word_5gram.arpa b/lms/mal/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..cbc55fb83b3e3030af99c8f76bbb43cb6c953e4d --- /dev/null +++ b/lms/mal/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/mal/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/mal/word_5gram.bin b/lms/mal/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..fd55f6cf1333d92f29e850fb2771aee7cea7a56d --- /dev/null +++ b/lms/mal/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/mal/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/mar/word_5gram.arpa b/lms/mar/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..414d2c5b501763ccda03e6506557153056e3d47f --- /dev/null +++ b/lms/mar/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/mar/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/mar/word_5gram.bin b/lms/mar/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..a738134c461ed089063c4be7a9b058e1bcce33e5 --- /dev/null +++ b/lms/mar/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/mar/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/mkd/word_5gram.arpa b/lms/mkd/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..39b08f0d831573180f3007d4e0f2717dec443265 --- /dev/null +++ b/lms/mkd/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/mkd/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/mkd/word_5gram.bin b/lms/mkd/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..33f58e43a15568149ab43e2c7b98efa898211d68 --- /dev/null +++ b/lms/mkd/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/mkd/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/mlt/word_5gram.arpa b/lms/mlt/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..e214c0e6f151ec9d55afb5adb976d8b12955174e --- /dev/null +++ b/lms/mlt/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/mlt/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/mlt/word_5gram.bin b/lms/mlt/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..4f008f8555d6bccda6be6b0cb7b593ba3620cec9 --- /dev/null +++ b/lms/mlt/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/mlt/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/mon/word_5gram.arpa b/lms/mon/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..21b85729615005800aa08c718c7cc7bf9fbcddfc --- /dev/null +++ b/lms/mon/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/mon/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/mon/word_5gram.bin b/lms/mon/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..ab464837a7b2beb083c423bea48978ac57c9e25b --- /dev/null +++ b/lms/mon/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/mon/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/mri/word_5gram.arpa b/lms/mri/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..218ec72b41e3ac1c9afbb550a3f169b3e7f110c2 --- /dev/null +++ b/lms/mri/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/mri/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/mri/word_5gram.bin b/lms/mri/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..75cbb11b517914bdb82a55891f998e6698785184 --- /dev/null +++ b/lms/mri/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/mri/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/mya/char_20gram.arpa b/lms/mya/char_20gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..60ae85c8a65b7b302f5d6b1611992e7d5074ac43 --- /dev/null +++ b/lms/mya/char_20gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/mya/lm_5gram_prune01234_limit250k_fixed.chars.arpa \ No newline at end of file diff --git a/lms/mya/char_20gram.bin b/lms/mya/char_20gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..eb9966c5894972c166c02a1b21cc0eb19606d154 --- /dev/null +++ b/lms/mya/char_20gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/mya/lm_5gram_prune01234_limit250k_fixed.chars.bin \ No newline at end of file diff --git a/lms/nld/word_5gram.arpa b/lms/nld/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..d566dc0a4b6d41f0c032a69abe34a3fdd0174e78 --- /dev/null +++ b/lms/nld/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/nld/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/nld/word_5gram.bin b/lms/nld/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..aa417baa69dd84bccbbe0bb34fd0e3c51462a859 --- /dev/null +++ b/lms/nld/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/nld/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/nob/word_5gram.arpa b/lms/nob/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..6174fd99673135111a12fe7341409aac270dd841 --- /dev/null +++ b/lms/nob/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/nob/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/nob/word_5gram.bin b/lms/nob/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..e52cfe81dec0d3fca4c07455c925794b1b08108b --- /dev/null +++ b/lms/nob/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/nob/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/npi/word_5gram.arpa b/lms/npi/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..5126755ec0d01a8213d052cad56288d375cdca86 --- /dev/null +++ b/lms/npi/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/npi/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/npi/word_5gram.bin b/lms/npi/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..08488eaab4ddaf37af003621b457c85bc227b29c --- /dev/null +++ b/lms/npi/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/npi/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/nso/word_5gram.arpa b/lms/nso/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..6c7716bc45badf5bb6d4001772cf65d5f0ad0741 --- /dev/null +++ b/lms/nso/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/nso/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/nso/word_5gram.bin b/lms/nso/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..2d4be60f9341312861947f48d236b65ae6e09dba --- /dev/null +++ b/lms/nso/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/nso/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/nya/word_5gram.arpa b/lms/nya/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..ffd937d36959b80be01e96fe3ecdf957a7d1dd84 --- /dev/null +++ b/lms/nya/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/nya/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/nya/word_5gram.bin b/lms/nya/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..45bdfe4a66f6d6be70dd2f74183e2f779fbd726d --- /dev/null +++ b/lms/nya/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/nya/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/oci/word_5gram.arpa b/lms/oci/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..8eaa5d2ec6e4ddf6dc63934e22b77a3779651686 --- /dev/null +++ b/lms/oci/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/oci/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/oci/word_5gram.bin b/lms/oci/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..13c025abfbb0d22053345664137c36d47f7d7bef --- /dev/null +++ b/lms/oci/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/oci/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/orm/word_5gram.arpa b/lms/orm/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..29e839dc81aa02ad269a3f1c24da0802a42ad5f9 --- /dev/null +++ b/lms/orm/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/orm/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/orm/word_5gram.bin b/lms/orm/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..c447ee204bbafc90816c0e6c9ac9783a407cea8e --- /dev/null +++ b/lms/orm/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/orm/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ory/word_5gram.arpa b/lms/ory/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..071cdea8418f2f1e9d6e82305d585b649f51d3ab --- /dev/null +++ b/lms/ory/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ory/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ory/word_5gram.bin b/lms/ory/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..16dc1f742c45582fb3926b49726725c0bc853115 --- /dev/null +++ b/lms/ory/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ory/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/pan/word_5gram.arpa b/lms/pan/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..e754966ae7140f2cd1325f397b378e12b5a746aa --- /dev/null +++ b/lms/pan/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/pan/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/pan/word_5gram.bin b/lms/pan/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..77810ae8dde67cb5a137d19bc1a51cdb35449ea9 --- /dev/null +++ b/lms/pan/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/pan/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/pol/word_5gram.arpa b/lms/pol/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..d51f668034ba4dca0521c15874f0c42486cee52b --- /dev/null +++ b/lms/pol/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/pol/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/pol/word_5gram.bin b/lms/pol/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..71c7d111eede5e7c770bfe456c0d000ada6c11b2 --- /dev/null +++ b/lms/pol/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/pol/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/por/word_5gram.arpa b/lms/por/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..b36e4252ca0fe102e780c8a568f3b5dbbf48f585 --- /dev/null +++ b/lms/por/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/por/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/por/word_5gram.bin b/lms/por/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..2366172098bd4ebcad57f6cfceb501b009e95123 --- /dev/null +++ b/lms/por/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/por/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/pus/word_5gram.arpa b/lms/pus/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..34e7e7341adde6c46044aa1dc3450cd62586b3f1 --- /dev/null +++ b/lms/pus/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/pus/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/pus/word_5gram.bin b/lms/pus/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..a16903479fc33188ad62cde4f84e4e9c6a3fab31 --- /dev/null +++ b/lms/pus/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/pus/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ron/word_5gram.arpa b/lms/ron/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..4a593902ffe2cfcd973daa4e37050a65eb305ee7 --- /dev/null +++ b/lms/ron/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ron/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ron/word_5gram.bin b/lms/ron/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..20c84e40ad100db2a174e3e1a0fa6243824475e9 --- /dev/null +++ b/lms/ron/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ron/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/rus/word_5gram.arpa b/lms/rus/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..8c8aa9b9feff03dbccdcc181500f41481fa4f389 --- /dev/null +++ b/lms/rus/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/rus/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/rus/word_5gram.bin b/lms/rus/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..94993b54a838285a6d903001d1802070cc9d1731 --- /dev/null +++ b/lms/rus/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/rus/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/slk/word_5gram.arpa b/lms/slk/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..26b3819cfc7601783bd85b9d7dfb8f9daad199c0 --- /dev/null +++ b/lms/slk/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/slk/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/slk/word_5gram.bin b/lms/slk/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..296ee272e4914f25b76b57e1931d631257dba06a --- /dev/null +++ b/lms/slk/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/slk/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/slv/word_5gram.arpa b/lms/slv/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..58ff2bcb5737dce240f63fad98fc0c8009449b26 --- /dev/null +++ b/lms/slv/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/slv/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/slv/word_5gram.bin b/lms/slv/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..32a6bcf96f7fceb4e7372f30883a8c0d78fd9797 --- /dev/null +++ b/lms/slv/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/slv/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/sna/word_5gram.arpa b/lms/sna/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..910272f990d53f15374c96c29977ee5c8ee68cae --- /dev/null +++ b/lms/sna/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/sna/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/sna/word_5gram.bin b/lms/sna/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..29869c22c06d43a5ba9ffa7e4515a83b20778429 --- /dev/null +++ b/lms/sna/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/sna/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/snd/word_5gram.arpa b/lms/snd/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..98107a1e827e630fe2cf9f5fbb9b89054afcd3ba --- /dev/null +++ b/lms/snd/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/snd/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/snd/word_5gram.bin b/lms/snd/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..21eb0b0ea607cbbbaf3d4bb661c208df30db8e71 --- /dev/null +++ b/lms/snd/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/snd/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/som/word_5gram.arpa b/lms/som/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..056bc6cc3deaff58f9b1142539c524c7fac88533 --- /dev/null +++ b/lms/som/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/som/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/som/word_5gram.bin b/lms/som/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..c6c7788fbc479116e2892e7329d3070a6b00cd9f --- /dev/null +++ b/lms/som/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/som/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/spa/word_5gram.arpa b/lms/spa/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..42543f345c1d9b97116815aa5c576dbfdde5da06 --- /dev/null +++ b/lms/spa/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/spa/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/spa/word_5gram.bin b/lms/spa/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..b0a8deee9fc8ab16db6bd84398ea5d39b41bc915 --- /dev/null +++ b/lms/spa/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/spa/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/srp-script_latin/word_5gram.arpa b/lms/srp-script_latin/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..5bb16d242b206eda006cb5876f1059297d0f77b1 --- /dev/null +++ b/lms/srp-script_latin/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/srp/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/srp-script_latin/word_5gram.bin b/lms/srp-script_latin/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..234ac768a26df6b19b9ad613d38b062bf7df3f14 --- /dev/null +++ b/lms/srp-script_latin/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/srp/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/swe/word_5gram.arpa b/lms/swe/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..051b7ae8710dbf5509544b6f465585314b654b54 --- /dev/null +++ b/lms/swe/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/swe/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/swe/word_5gram.bin b/lms/swe/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..8bf24bd3cb30f2b8612097d57cef173a35e1d56b --- /dev/null +++ b/lms/swe/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/swe/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/swh/word_5gram.arpa b/lms/swh/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..636d7c7a1d3a92f4457e1fc21cf3a27c613c73e1 --- /dev/null +++ b/lms/swh/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/swh/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/swh/word_5gram.bin b/lms/swh/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..77c6ba5c3541922596b886d3edc8d8678054545f --- /dev/null +++ b/lms/swh/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/swh/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/tam/word_5gram.arpa b/lms/tam/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..1dade7b061ed388eb0c033ea6d7631ec794551c2 --- /dev/null +++ b/lms/tam/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/tam/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/tam/word_5gram.bin b/lms/tam/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..83d2e7f7766966994215bbd190571521847068f3 --- /dev/null +++ b/lms/tam/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/tam/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/tel/word_5gram.arpa b/lms/tel/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..93539a1aa81175bb437ef20e101f7d2a995790ed --- /dev/null +++ b/lms/tel/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/tel/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/tel/word_5gram.bin b/lms/tel/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..6820594a330f9eb138c1bd312b95fc9a4389f054 --- /dev/null +++ b/lms/tel/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/tel/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/tgk/word_5gram.arpa b/lms/tgk/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..6895bb254b8b47dfca3a8485d650e37c30bc5186 --- /dev/null +++ b/lms/tgk/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/tgk/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/tgk/word_5gram.bin b/lms/tgk/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..388b68371ba2fc88f6d09d74c5431b762ca4c253 --- /dev/null +++ b/lms/tgk/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/tgk/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/tgl/word_5gram.arpa b/lms/tgl/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..bf19d278302afad02ec99a9cc70acf5d22e6fb4d --- /dev/null +++ b/lms/tgl/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/tgl/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/tgl/word_5gram.bin b/lms/tgl/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..4792742fb70929ac0583dedb49212223d407170b --- /dev/null +++ b/lms/tgl/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/tgl/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/tha/char_20gram.arpa b/lms/tha/char_20gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..8f7739a8fe5bc3ac96b83801971af256308a5e5f --- /dev/null +++ b/lms/tha/char_20gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/tha/lm_5gram_prune12345_limit250k_fixed.chars.arpa \ No newline at end of file diff --git a/lms/tha/char_20gram.bin b/lms/tha/char_20gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..739d527680fa8a4d3e95f52ae6a66d59a5f040f6 --- /dev/null +++ b/lms/tha/char_20gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/tha/lm_5gram_prune12345_limit250k_fixed.chars.bin \ No newline at end of file diff --git a/lms/tur/word_5gram.arpa b/lms/tur/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..5f4ec167fd42fdcbb7a95d974b813f250af960de --- /dev/null +++ b/lms/tur/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/tur/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/tur/word_5gram.bin b/lms/tur/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..e5dfdbc354fc44c8628138ba00b75af3ab8d21aa --- /dev/null +++ b/lms/tur/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/tur/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/ukr/word_5gram.arpa b/lms/ukr/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..fef6ba45ae5d5e150877a8f0f6dfd58371e1f69d --- /dev/null +++ b/lms/ukr/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ukr/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/ukr/word_5gram.bin b/lms/ukr/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..baadc17093b865b250106e0124d9ff80909072b5 --- /dev/null +++ b/lms/ukr/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ukr/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/umb/word_5gram.arpa b/lms/umb/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..49ec63f9e01c05988c36c4ef5b833ef5c23340b3 --- /dev/null +++ b/lms/umb/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/umb/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/umb/word_5gram.bin b/lms/umb/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..fc6fb5b707f822620e4f2fa0ad60f930af320d62 --- /dev/null +++ b/lms/umb/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/umb/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/urd-script_arabic/word_5gram.arpa b/lms/urd-script_arabic/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..39b978470871c35e4bdc9ce5d66d2247373d2ccb --- /dev/null +++ b/lms/urd-script_arabic/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/urd/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/urd-script_arabic/word_5gram.bin b/lms/urd-script_arabic/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..f3ed861b50c5bbfaec555d9beb8f8c43a4620215 --- /dev/null +++ b/lms/urd-script_arabic/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/urd/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/uzb-script_latin/word_5gram.arpa b/lms/uzb-script_latin/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..96f37426550a51c74d8dbb99e43a53864591819a --- /dev/null +++ b/lms/uzb-script_latin/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/uzb/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/uzb-script_latin/word_5gram.bin b/lms/uzb-script_latin/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..dfe5439db40002d0b1bc1d6bd71747fafc2e4454 --- /dev/null +++ b/lms/uzb-script_latin/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/uzb/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/vie/word_5gram.arpa b/lms/vie/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..5b780192519593d8dee9d50f3b9b4e1837bf5656 --- /dev/null +++ b/lms/vie/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/vie/lm_5gram_prune12345_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/vie/word_5gram.bin b/lms/vie/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..ecda3b0d5afe491549022ddb315de5ed05a795da --- /dev/null +++ b/lms/vie/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/vie/lm_5gram_prune12345_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/wol/word_5gram.arpa b/lms/wol/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..76195e7979397fe464e7f381ba7238e11ba31a06 --- /dev/null +++ b/lms/wol/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/wol/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/wol/word_5gram.bin b/lms/wol/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..33680d42aec63af228e4514c038a12cf8cfd63e7 --- /dev/null +++ b/lms/wol/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/wol/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/xho/word_5gram.arpa b/lms/xho/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..24d8eda98b9b1a0ab7c7d4179fce14cfd06ccc24 --- /dev/null +++ b/lms/xho/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/xho/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/xho/word_5gram.bin b/lms/xho/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..a0885b3fb0cb14b4c2429b0d28efa5fb70a13a92 --- /dev/null +++ b/lms/xho/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/xho/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/yor/word_5gram.arpa b/lms/yor/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..625ed190180735de6743b9bd3038833ac1fb0b5d --- /dev/null +++ b/lms/yor/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/yor/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/yor/word_5gram.bin b/lms/yor/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..b994b8f5e42e6f9a2cf829490c6a6115877b906a --- /dev/null +++ b/lms/yor/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/yor/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/yue-script_traditional/char_20gram.arpa b/lms/yue-script_traditional/char_20gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..185134b028f700d0cf9e00aab7470714a8ca9b54 --- /dev/null +++ b/lms/yue-script_traditional/char_20gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/yue/lm_5gram_prune12345_limit250k_fixed.chars.arpa \ No newline at end of file diff --git a/lms/yue-script_traditional/char_20gram.bin b/lms/yue-script_traditional/char_20gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..b9fe234d9e5c39366140fde805cbbab00a68dd0b --- /dev/null +++ b/lms/yue-script_traditional/char_20gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/yue/lm_5gram_prune12345_limit250k_fixed.chars.bin \ No newline at end of file diff --git a/lms/zlm/word_5gram.arpa b/lms/zlm/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..2d46548f827302c2e07acddc2a210be9e0693f7e --- /dev/null +++ b/lms/zlm/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/zlm/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/zlm/word_5gram.bin b/lms/zlm/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..1a635eb5cc1f016aa23b94f8fff2eecd57541e48 --- /dev/null +++ b/lms/zlm/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/zlm/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/lms/zul/word_5gram.arpa b/lms/zul/word_5gram.arpa new file mode 120000 index 0000000000000000000000000000000000000000..5fe117782da8e6402e5d95437c4227d613a4022e --- /dev/null +++ b/lms/zul/word_5gram.arpa @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/zul/lm_5gram_prune01234_limit250k_fixed.arpa \ No newline at end of file diff --git a/lms/zul/word_5gram.bin b/lms/zul/word_5gram.bin new file mode 120000 index 0000000000000000000000000000000000000000..652e1524debf6d6ed620130925853d494e64326a --- /dev/null +++ b/lms/zul/word_5gram.bin @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/zul/lm_5gram_prune01234_limit250k_fixed.bin \ No newline at end of file diff --git a/mms-1b-all/afr/lexicon.txt b/mms-1b-all/afr/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..49ce1cc2ff27399b8502faed3e5f00492907ccc2 --- /dev/null +++ b/mms-1b-all/afr/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/afr/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/afr/tokens.txt b/mms-1b-all/afr/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..e948ba8fe0a4a3bb3df309b7354c554855ed0971 --- /dev/null +++ b/mms-1b-all/afr/tokens.txt @@ -0,0 +1,73 @@ +| 22961 +e 19676 +i 9317 +a 8840 +n 8757 +s 7269 +r 7160 +o 6923 +t 6554 +d 6107 +l 4470 +g 3996 +k 3537 +m 2886 +v 2460 +u 2428 +w 2010 +p 1923 +h 1908 +b 1764 +y 1065 +f 963 +' 569 +j 397 +0 250 +c 214 +- 186 +1 183 +ë 171 +ê 105 +2 103 +9 81 +3 71 +4 71 +. 68 +5 68 +6 64 +‘ 61 +8 47 +z 44 +7 37 +x 25 +ï 23 +: 23 +, 15 +/ 15 +q 15 +á 10 +’ 9 +” 8 +; 6 +$ 5 +% 5 +ń 4 +– 4 +í 3 +¥ 3 +ó 2 +ϊ 2 ++ 2 +& 2 +[ 2 +] 2 +² 1 +" 1 +ö 1 +ç 1 +ł 1 +£ 1 +° 1 +ͦ 1 +ú 1 +é 1 diff --git a/mms-1b-all/amh/lexicon.txt b/mms-1b-all/amh/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..5cbdcdc39b4586e27d358089b6b4b0a399e29646 --- /dev/null +++ b/mms-1b-all/amh/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/amh/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/amh/tokens.txt b/mms-1b-all/amh/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..141d64fbc2ed112e0c85d7507b431bfadc73c565 --- /dev/null +++ b/mms-1b-all/amh/tokens.txt @@ -0,0 +1,335 @@ +| 85715 +ን 22176 +በ 11256 +ት 10395 +የ 8670 +ው 10849 +ስ 9576 +ር 10440 +መ 7239 +ል 8332 +ም 12644 +ች 5926 +ይ 7267 +አ 9765 +ለ 9133 +ተ 6544 +እ 11473 +ነ 7507 +ያ 5215 +ና 5732 +ላ 5013 +ከ 5275 +ደ 5557 +ድ 4511 +ብ 4890 +ማ 3266 +ረ 2961 +ግ 4963 +ገ 4498 +ወ 4130 +ታ 3547 +ራ 2968 +ሰ 4459 +ጥ 1832 +ሚ 2132 +ክ 2491 +ባ 2662 +ዎ 955 +ቀ 2619 +ሉ 2613 +ሪ 485 +ጠ 1938 +ካ 1230 +ሳ 1903 +ሆ 2602 +ፍ 1648 +ቅ 1955 +ህ 4031 +ዳ 1704 +ዋ 1001 +ጋ 1775 +ቸ 2155 +ቶ 1460 +ኛ 1032 +ሮ 865 +ጣ 1328 +ፈ 1991 +ሊ 563 +ሎ 937 +ሁ 5544 +0 789 +ሩ 1186 +ቱ 1367 +ቃ 1150 +ዝ 833 +ኖ 767 +ሲ 556 +ዘ 1373 +ቢ 668 +፣ 639 +ዊ 308 +ዲ 1737 +ጊ 783 +1 585 +ጉ 376 +ሞ 1243 +ቦ 206 +ዜ 763 +ሌ 795 +ኑ 899 +ዛ 746 +ጀ 393 +ሽ 292 +ዙ 554 +ኮ 125 +ሜ 169 +ዓ 367 +ዚ 2878 +ቡ 645 +ፊ 513 +ዶ 329 +ሃ 405 +ጎ 231 +ቤ 578 +ቻ 516 +ኒ 59 +ዩ 346 +ዕ 512 +ጨ 313 +ቹ 389 +2 365 +ቁ 483 +ፋ 684 +ቆ 431 +ዱ 783 +ጅ 553 +ኪ 145 +ሬ 336 +ሱ 2664 +a 334 +ቲ 205 +ኤ 222 +ቋ 113 +ሙ 858 +ኔ 632 +ሶ 206 +ፓ 307 +ሻ 221 +ፕ 4 +ሀ 98 +ኝ 855 +ፖ 281 +ኢ 1000 +ጫ 120 +ዮ 392 +ቴ 169 +ሕ 1289 +ጭ 209 +ሥ 1162 +ጃ 115 +ሴ 354 +o 254 +ኩ 240 +ሄ 397 +. 245 +9 245 +ጓ 70 +n 233 +- 232 +ኦ 38 +5 223 +ሸ 222 +e 222 +ቂ 175 +ፎ 162 +4 209 +ኋ 769 +ጡ 417 +i 198 +3 196 +ኘ 205 +ፒ 189 +ጽ 568 +6 181 +r 178 +8 175 +t 165 +ፉ 291 +ጦ 275 +ኙ 114 +s 162 +l 158 +ጂ 233 +ዞ 128 +ቪ 152 +ቷ 56 +ፅ 145 +ጆ 269 +ኃ 3 +c 135 +ሠ 399 +u 129 +ኬ 36 +ዴ 210 +ፃ 81 +ኞ 269 +7 119 +ኳ 185 +m 111 +p 109 +ሂ 50 +ሏ 81 +h 97 +ፀ 67 +። 95 +ሐ 478 +ቫ 89 +ጸ 569 +ሺ 55 +ጤ 41 +, 80 +ፌ 59 +ሯ 62 +g 73 +፡ 72 +ጻ 337 +ዌ 13 +ዥ 44 +ጁ 154 +d 62 +ፐ 61 +b 61 +ቨ 59 +ሹ 61 +/ 57 +ሾ 96 +ቧ 27 +ቬ 52 +ሟ 33 +ቭ 51 +” 50 +ፔ 1 +ጪ 12 +k 46 +f 44 +ቄ 103 +ቺ 41 +y 37 +ፏ 8 +ሷ 96 +ጄ 64 +ዬ 127 +w 34 +v 34 +ሣ 381 +ሦ 122 +ጌ 816 +ዣ 18 +ቾ 17 +ሔ 1155 +ጩ 13 +ሑ 11 +ጧ 31 +ኗ 24 +j 26 +ጮ 84 +ዐ 599 +ቼ 119 +" 24 +: 24 +ኚ 2 +ዦ 25 +ዷ 23 +፤ 20 +ዢ 2 +ዑ 10 +ዉ 19 +ኅ 116 +ኣ 5 +ፆ 3 +ጾ 41 +x 16 +$ 16 +ኡ 1 +ጢ 364 +ቮ 13 +« 13 +» 13 +ፑ 3 +ጹ 127 +% 10 +ዟ 10 +z 9 +ዡ 30 +ጵ 43 +— 7 +ፁ 53 +ጿ 3 +ዪ 53 +? 6 +’ 6 +ሼ 18 ++ 5 +፥ 5 +ዒ 5 +° 5 +ሿ 6 +ዖ 85 +ጐ 125 +õ 4 +– 4 +ኸ 186 +; 4 +ã 3 +¥ 3 +ኺ 1 +ጴ 147 +ጳ 242 +ኜ 26 +' 3 +ጇ 12 +ጺ 2 +‘ 2 +! 2 +² 2 +ቿ 22 +ኰ 47 +ፗ 2 +q 2 +& 2 +£ 2 +ኟ 8 +[ 1 +] 1 +፦ 1 +ኀ 395 +ቈ 221 +ጒ 187 +ኵ 138 +ሓ 103 +ቍ 102 +ቊ 81 +ሡ 63 +ጲ 40 +ጶ 31 +ኹ 28 +ኾ 24 +ሤ 24 +ጕ 21 +ኽ 20 +ኻ 16 +ዔ 14 +ጬ 9 +ጼ 8 +ኄ 5 +ዤ 5 +ኆ 4 +ሖ 3 +ዠ 3 +ጯ 2 +ጱ 2 +ጰ 2 +ሢ 1 +ፄ 1 +ሒ 1 diff --git a/mms-1b-all/ara/lexicon.txt b/mms-1b-all/ara/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..d74100e76eca6d92b6954db0ee6acb8ee2aab915 --- /dev/null +++ b/mms-1b-all/ara/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ara/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ara/tokens.txt b/mms-1b-all/ara/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff8ac6f9127202a96f24b3d2c7616831b7e3523d --- /dev/null +++ b/mms-1b-all/ara/tokens.txt @@ -0,0 +1,117 @@ +| 753464 +ا 414915 +ل 366105 +ي 243376 +م 220617 +ن 218976 +و 200525 +ت 107267 +ر 115296 +ب 117382 +ة 49395 +ع 103441 +د 77178 +ف 85639 +س 85645 +ك 100464 +أ 111253 +ق 62850 +ه 126602 +ح 59914 +ج 42045 +ط 21998 +ى 30642 +ش 25690 +ص 21207 +خ 28556 +إ 35733 +ث 16532 +ض 20734 +ذ 40384 +ً 1092 +ز 11052 +غ 9208 +ء 14858 +ئ 11563 +ّ 614 +ظ 6501 +ُ 340 +0 14 +1 255 +ٍ 236 +آ 6722 +، 179 +ؤ 4278 +2 157 +ِ 131 +9 108 +4 1 +5 6 +3 79 +6 74 +َ 71 +8 70 +ٌ 65 +7 62 +- 168 +. 41 +a 4 +/ 30 +" 28 +n 7 +o 7 +e 6 +: 23 +s 1 +t 10 +٠ 21 +p 19 +g 5 +d 2 +c 1 +l 3 +, 14 +i 9 +١ 11 +m 2 +r 10 +v 10 +b 9 +% 9 +u 3 +h 1 +؛ 6 +٩ 6 +w 1 +— 1 +k 4 +ـ 4 +٣ 4 +x 3 +f 3 +٨ 2 +z 2 ++ 2 +õ 2 +j 2 +٧ 2 +y 1 +چ 1 +٢ 1 +٦ 1 +٪ 1 +× 1 +٥ 1 +ٰ 62 +ی 60 +ۖ 34 +ک 21 +ۚ 19 +ۗ 6 +ۛ 2 +ھ 1 +' 344 +ڨ 1 +ۘ 1 +☭ 1 +– 4 diff --git a/mms-1b-all/asm/lexicon.txt b/mms-1b-all/asm/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..b0d11e7c06fad038f3a0686a56d34fea540b261f --- /dev/null +++ b/mms-1b-all/asm/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/asm/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/asm/tokens.txt b/mms-1b-all/asm/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..e790a579d188991291b1a3307b873cf268c11d65 --- /dev/null +++ b/mms-1b-all/asm/tokens.txt @@ -0,0 +1,134 @@ +| 124409 +ৰ 50675 +া 49888 +ি 36290 +্ 26337 +ক 37696 +ে 37636 +ত 31682 +ন 27205 +ব 18900 +য 19023 +ল 25377 +প 16522 +স 13837 +ম 15821 +ু 15586 +হ 11198 +ো 22070 +় 10202 +ট 2504 +ছ 4560 +আ 12049 +দ 9901 +ী 6339 +ই 10144 +গ 5310 +জ 6968 +ৱ 4817 +চ 6058 +ৈ 5753 +ষ 3666 +শ 6163 +ভ 3640 +ণ 3092 +এ 4180 +অ 3115 +থ 3705 +ধ 3802 +খ 2877 +ও 7575 +ড 504 +ূ 1800 +উ 1439 +' 2700 +ং 602 +ঁ 7651 +ফ 684 +" 882 +ৃ 1250 +। 573 +0 8 +ঠ 930 +ঘ 774 +র 1953 +1 473 +ঞ 594 +৷ 303 +2 1 +- 1220 +ঙ 402 +9 196 +ৎ 409 +5 148 +3 136 +ঢ 254 +ৌ 478 +4 128 +8 125 +6 120 +০ 119 +a 107 +, 100 +7 96 +. 89 +১ 68 +ঃ 33 +s 64 +২ 56 +৪ 53 +e 52 +c 51 +i 49 +’ 48 +u 48 +n 44 +p 43 +: 43 +m 43 +o 41 +৬ 38 +৫ 38 +t 36 +l 35 +৩ 34 +r 34 +ʼ 29 +/ 28 +g 27 +d 27 +b 24 +৮ 24 +h 22 +৯ 22 +‌ 20 +৭ 18 +ঐ 28 +v 14 +” 13 +$ 12 +ঝ 1 +ঔ 1 +f 11 +; 10 +% 10 +k 9 +z 6 +w 6 +y 6 +— 1 +j 6 +! 5 +x 5 +– 5 +ঈ 1487 +ঋ 10 +õ 4 +° 3 +q 3 +& 2 +̇ 2 +² 2 +£ 2 ++ 2 +‍ 1 diff --git a/mms-1b-all/ast/lexicon.txt b/mms-1b-all/ast/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..ce665bcdd169e499a6ffd862aa65d8e88abb7685 --- /dev/null +++ b/mms-1b-all/ast/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ast/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ast/tokens.txt b/mms-1b-all/ast/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f860f7d701e1937ac6e4bdbb75a10419c222e1c --- /dev/null +++ b/mms-1b-all/ast/tokens.txt @@ -0,0 +1,76 @@ +| 51249 +e 33720 +a 27331 +s 19765 +n 19159 +o 17534 +r 16422 +i 16065 +l 15857 +u 13965 +t 11631 +d 11270 +c 10540 +m 7463 +p 7013 +b 3063 +y 2912 +g 2830 +f 2690 +v 2641 +q 2368 +á 2003 +ó 1929 +x 1830 +h 1733 +í 1565 +’ 1407 +é 861 +z 774 +' 755 +0 627 +ñ 512 +1 440 +ú 380 +k 368 +2 303 +. 265 +9 203 +w 199 +- 187 +4 179 +5 173 +3 162 +8 139 +6 139 +j 124 +” 102 +7 90 +ü 88 +: 57 +, 38 +/ 27 +; 25 +– 17 +ḥ 15 +! 13 +¿ 12 +? 12 +¡ 10 +% 10 +º 9 +$ 6 +— 6 +õ 6 +& 4 +ç 3 +¥ 3 +ś 3 +ª 3 ++ 2 +² 2 +ý 2 +[ 1 +] 1 +ö 1 +ł 1 diff --git a/mms-1b-all/azj-script_latin/lexicon.txt b/mms-1b-all/azj-script_latin/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..ad82bd8099c8716663558cbeda8b322907d290a3 --- /dev/null +++ b/mms-1b-all/azj-script_latin/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/azj/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/azj-script_latin/tokens.txt b/mms-1b-all/azj-script_latin/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..2920b2f28cf8f7db9ea9f7a37ad7df55c60a97bc --- /dev/null +++ b/mms-1b-all/azj-script_latin/tokens.txt @@ -0,0 +1,66 @@ +| 99874 +a 62356 +ə 57493 +i 51292 +n 45766 +r 37176 +l 36643 +d 33035 +t 13791 +s 18886 +ı 22592 +m 21796 +y 16817 +e 13311 +k 11097 +b 16339 +u 14388 +o 11683 +ü 13021 +q 10161 +v 7460 +ş 7795 +z 11258 +h 9682 +ç 5276 +c 4343 +x 4381 +f 1705 +g 5890 +ö 6008 +p 2190 +ğ 3906 +- 758 +0 50 +1 18 +̇ 1645 +2 13 +9 205 +4 8 +5 173 +j 143 +3 158 +8 152 +6 1 +7 113 +. 87 +: 57 +w 36 +, 35 +" 24 +/ 18 +% 15 +' 7 +° 5 ++ 4 +$ 4 +õ 4 +? 2 +[ 2 +] 2 +— 2 +– 103 +ú 1 +² 1 +; 1 +! 1 diff --git a/mms-1b-all/bel/lexicon.txt b/mms-1b-all/bel/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..531dd80bfd7c78c927c81509acba8036297dac67 --- /dev/null +++ b/mms-1b-all/bel/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/bel/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/bel/tokens.txt b/mms-1b-all/bel/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ca9b45f66b7e10fe7492c05896c8e9f850992e5 --- /dev/null +++ b/mms-1b-all/bel/tokens.txt @@ -0,0 +1,91 @@ +| 2382442 +а 2103745 +н 827503 +і 646274 +ы 601477 +р 563290 +с 512691 +т 523900 +е 533265 +к 470754 +л 497013 +о 461338 +я 463462 +д 419227 +м 446632 +п 368943 +ц 392162 +в 388956 +у 399480 +з 337760 +ў 252458 +г 257230 +б 233428 +ь 210829 +э 169318 +ч 181420 +х 146477 +й 127439 +ш 152224 +ж 100739 +ю 83906 +ё 71268 +ф 29047 +» 595 +« 589 +0 523 +- 10596 +' 10312 +1 395 +2 248 +. 240 +9 170 +5 147 +3 116 +— 153 +6 114 +8 111 +7 97 +4 96 +: 66 +a 52 +i 248 +o 35 +e 34 +r 34 +c 31 +l 29 +n 1 +s 25 +, 25 +p 22 +t 18 +v 18 +g 18 +d 17 +/ 15 +h 14 +b 12 +m 8 +y 7 +k 7 +² 6 +и 5 +! 5 +; 5 +° 4 +% 4 +– 12 +[ 3 +] 3 ++ 3 +x 2 +u 2 +f 2 +z 2 +õ 2 +ú 2 +? 2 +j 1 +‒ 19 +ґ 1 diff --git a/mms-1b-all/ben/lexicon.txt b/mms-1b-all/ben/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..bffc1f93744141840b99c8993fc8fb0d9c68303f --- /dev/null +++ b/mms-1b-all/ben/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ben/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ben/tokens.txt b/mms-1b-all/ben/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..11a64fd344aa85fec25f3941c367092eb122f09d --- /dev/null +++ b/mms-1b-all/ben/tokens.txt @@ -0,0 +1,132 @@ +| 782576 +া 369621 +র 331604 +ে 362833 +্ 197431 +ি 183411 +ন 185762 +ক 179387 +য 145869 +ব 137621 +ত 188910 +স 96140 +ল 102490 +প 82930 +় 72348 +ম 120731 +ট 18536 +ু 67112 +দ 94815 +এ 34960 +হ 59459 +ো 56838 +জ 40457 +গ 35681 +শ 49111 +ছ 46256 +ী 38411 +ই 49640 +থ 28233 +ভ 22241 +অ 18714 +আ 58155 +ং 13400 +ষ 26406 +চ 22006 +ড 12217 +ধ 22635 +ও 26296 +ণ 16379 +খ 31419 +উ 12875 +ফ 5948 +" 798 +ূ 9779 +0 130 +ৃ 7143 +। 597 +ঘ 3297 +1 189 +ঠ 5145 +ঁ 19028 +- 5811 +ঞ 2385 +2 123 +ৈ 1471 +ঙ 3935 +ৌ 3203 +9 39 +5 50 +4 69 +ৎ 2298 +3 52 +ঝ 1613 +০ 11 +6 33 +8 30 +, 117 +' 641 +7 23 +a 87 +১ 59 +. 83 +ঃ 1005 +c 1 +৪ 23 +: 44 +২ 48 +৯ 12 +ঐ 851 +t 38 +s 37 +/ 36 +m 20 +p 36 +৬ 11 +i 3 +৫ 21 +৮ 9 +n 32 +৩ 30 +ঢ 819 +৭ 13 +u 27 +o 1 +r 24 +d 20 +h 19 +e 18 +v 18 +l 7 +g 16 +b 13 +w 12 +ঈ 10229 +; 11 +y 10 +k 10 +x 9 +‍ 8 +% 8 +f 7 +! 6 +ঔ 20 +¥ 6 +õ 6 +ঊ 68 +ঋ 129 +° 5 +q 5 +í 4 ++ 4 +ü 4 +z 4 +[ 3 +] 3 +£ 3 +৷ 2 +? 2 +j 1 +ú 1 +á 1 +— 173 diff --git a/mms-1b-all/bos/lexicon.txt b/mms-1b-all/bos/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..d4a5512f6b8377f5dc175e1901c80eba0e1d08cd --- /dev/null +++ b/mms-1b-all/bos/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/bos/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/bos/tokens.txt b/mms-1b-all/bos/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ff87bc1553244db18487c41a361073d3aca03d5 --- /dev/null +++ b/mms-1b-all/bos/tokens.txt @@ -0,0 +1,73 @@ +| 60397 +a 36731 +i 32041 +o 29407 +e 29102 +n 21057 +j 16921 +r 16610 +t 15604 +s 14974 +u 14511 +k 11280 +l 10852 +d 10840 +v 10706 +m 10301 +p 9301 +z 6121 +g 5526 +b 4526 +č 3011 +c 3010 +š 2461 +h 2298 +ž 1900 +ć 1573 +f 1061 +. 971 +đ 746 +0 729 +1 540 +2 327 +- 268 +9 233 +y 218 +5 194 +w 188 +4 177 +3 166 +8 154 +6 152 +7 111 +: 97 +/ 53 +, 45 +x 40 +q 32 +; 23 +? 20 +! 18 +– 17 +' 16 +% 11 +ü 10 +á 9 +$ 5 +í 5 +é 4 ++ 4 +õ 4 +— 4 +‘ 3 +’ 3 +° 3 +ó 2 +& 2 +ç 2 +ã 2 +ú 2 +² 2 +[ 2 +] 2 +ł 1 diff --git a/mms-1b-all/bul/lexicon.txt b/mms-1b-all/bul/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..c182b0ace7edc418f765b254b6360ffaf9aec07e --- /dev/null +++ b/mms-1b-all/bul/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/bul/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/bul/tokens.txt b/mms-1b-all/bul/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..eeb36a6124388c6f53489fc57070c8a61b3d5958 --- /dev/null +++ b/mms-1b-all/bul/tokens.txt @@ -0,0 +1,92 @@ +| 480961 +а 253357 +и 193780 +о 209323 +е 199247 +т 161025 +н 118245 +р 80643 +с 109822 +в 103688 +к 68705 +д 79897 +л 53701 +п 53354 +з 53492 +м 50721 +я 41649 +ъ 36560 +г 38915 +б 34659 +у 28203 +ч 26070 +ж 15723 +ц 10587 +щ 16842 +й 17394 +х 24363 +ш 13353 +ф 1100 +- 1102 +0 735 +. 639 +1 554 +ю 2123 +2 350 +a 1 +r 220 +9 212 +o 1 +e 1 +5 194 +n 189 +4 180 +i 1 +3 163 +6 160 +t 157 +8 150 +s 148 +c 131 +7 122 +: 120 +p 115 +l 107 +m 103 +g 97 +u 85 +h 67 +d 54 +ь 28 +– 1856 +k 48 +/ 44 +, 41 +; 41 +b 39 +v 38 +y 30 +w 28 +f 28 +! 18 +" 17 +? 15 +ѝ 246 +% 14 +' 9 +q 8 +z 7 +x 7 +j 6 ++ 4 +° 2 +[ 2 +] 2 +á 2 +õ 2 +№ 2 +² 2 +— 873 +` 5 +̀ 16 +ѐ 11 diff --git a/mms-1b-all/cat/lexicon.txt b/mms-1b-all/cat/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..73d2bfffc42c341530c12dc69ca4c79a360a5eca --- /dev/null +++ b/mms-1b-all/cat/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/cat/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/cat/tokens.txt b/mms-1b-all/cat/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbfe84b8bee372eba83974dfa5fc772612162bf8 --- /dev/null +++ b/mms-1b-all/cat/tokens.txt @@ -0,0 +1,102 @@ +| 279102 +e 163821 +a 132466 +s 104262 +r 82135 +i 76435 +n 73875 +l 79307 +t 66732 +o 56213 +d 43484 +u 58576 +c 32368 +m 34012 +p 32062 +v 23355 +g 16599 +b 14416 +' 10239 +f 10561 +q 19389 +h 10706 +é 10902 +x 5777 +ó 3914 +à 5960 +í 2769 +è 3535 +j 6769 +y 2909 +ò 3843 +0 484 +z 467 +1 344 +k 13882 +ç 1240 +- 2946 +2 239 +ú 2288 +w 9364 +· 152 +9 145 +ï 500 +5 136 +4 129 +8 109 +3 107 +6 106 +. 100 +ü 164 +7 53 +: 33 +/ 30 +, 24 +" 10 +% 8 +— 7 +á 494 +² 6 +; 5 +! 4 +– 4 +” 4 +¥ 3 +’ 3 +[ 2 +] 2 +º 2 +ö 3 +& 2 +“ 2 +õ 2 ++ 1 +? 1 +ã 7 +° 1 +ñ 188 +– 85 +— 2016 +ø 21 +ô 18 +ř 15 +ū 14 +ž 8 +č 7 +ı 7 +ş 6 +š 5 +ë 4 +ł 3 +ţ 3 +å 3 +ė 3 +ň 3 +ō 2 +ù 2 +ā 2 +ä 1 +ő 1 +ì 1 +ὑ 1 +ð 1 diff --git a/mms-1b-all/ceb/lexicon.txt b/mms-1b-all/ceb/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..b196ebfff28fa96609e13b74ca6ec9ee006d76f1 --- /dev/null +++ b/mms-1b-all/ceb/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ceb/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ceb/tokens.txt b/mms-1b-all/ceb/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f4248c1236ec29e56f98018889cf98547ab005a --- /dev/null +++ b/mms-1b-all/ceb/tokens.txt @@ -0,0 +1,70 @@ +a 170262 +| 150956 +n 89528 +g 73203 +i 62645 +s 42778 +o 46715 +m 27981 +t 26723 +u 31550 +k 29074 +l 24460 +p 16452 +e 5237 +d 18591 +r 6301 +y 18837 +b 14055 +h 13075 +w 7040 +- 3046 +c 956 +0 126 +f 136 +v 76 +1 70 +. 543 +2 49 +j 1845 +9 9 +5 11 +z 51 +4 41 +8 3 +3 7 +6 6 +x 11 +7 4 +, 112 +: 87 +' 182 +q 64 +; 49 +/ 45 +" 44 +? 21 +! 18 +$ 14 +ü 13 +— 78 +– 12 +á 12 +% 11 +í 7 +ç 6 ++ 5 +& 5 +° 4 +ú 4 +£ 3 +¥ 3 +[ 3 +] 3 +ã 3 +’ 2 +â 2 +õ 2 +ó 2 +² 1 +é 1 diff --git a/mms-1b-all/ces/lexicon.txt b/mms-1b-all/ces/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..8c9971ac6b413879dc73c10ca9040515c903863f --- /dev/null +++ b/mms-1b-all/ces/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ces/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ces/tokens.txt b/mms-1b-all/ces/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..5849b3b26c206e5e4bb691126cb8f3f13e1db9b9 --- /dev/null +++ b/mms-1b-all/ces/tokens.txt @@ -0,0 +1,82 @@ +| 105027 +o 48988 +e 43999 +n 37790 +a 38664 +t 32056 +s 28673 +i 24958 +v 27128 +l 25729 +r 22145 +k 20219 +d 20333 +p 19262 +m 18512 +u 18322 +í 18123 +c 14287 +h 13806 +z 12868 +á 13199 +y 11300 +j 12323 +b 10377 +ě 10195 +é 7908 +ř 6642 +ž 4894 +ý 5066 +č 5165 +š 4440 +ů 3082 +g 1793 +f 1730 +0 664 +1 501 +ú 843 +2 306 +w 231 +x 350 +9 217 +. 215 +5 194 +4 167 +ň 331 +6 163 +8 160 +3 155 +ó 141 +7 117 +- 20 +ť 137 +ď 121 +/ 50 +: 42 +q 13 +, 38 +– 20 +% 13 +' 4 +" 8 +° 7 +õ 6 +⁠ 4 +ö 9 +! 3 +— 3 +[ 3 +] 3 +ü 5 +; 3 +& 2 ++ 2 +× 2 +² 2 +ç 1 +ā 1 +ã 1 +ł 1 +è 3 +ä 2 +ï 1 diff --git a/mms-1b-all/ckb/lexicon.txt b/mms-1b-all/ckb/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..5f0d8a2c6d5e4bd48bae62abcd6aa510e230acb8 --- /dev/null +++ b/mms-1b-all/ckb/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ckb/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ckb/tokens.txt b/mms-1b-all/ckb/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..309b5c785384194d1c5ad93905bdf560fd69d913 --- /dev/null +++ b/mms-1b-all/ckb/tokens.txt @@ -0,0 +1,107 @@ +| 32097 +ە 24430 +ی 15154 +ا 14932 +و 12597 +ن 11393 +ر 8273 +ک 7984 +ت 7330 +د 6567 +ب 6764 +ێ 6025 +ل 3314 +ۆ 3483 +س 3707 +م 6342 +ش 2549 +گ 2150 +ه 4293 +ئ 2677 +پ 1710 +ز 2061 +ڕ 1650 +ڵ 1849 +خ 1668 +چ 1417 +ج 796 +ژ 677 +ف 572 +ك 43 +ق 621 +، 561 +ڤ 124 +٠ 368 +0 327 +١ 298 +ح 412 +1 223 +٢ 182 +‎ 160 +٩ 155 +2 143 +. 129 +٨ 105 +٥ 96 +5 96 +غ 67 +٤ 93 +- 19 +6 83 +4 79 +3 79 +٣ 79 +٦ 75 +9 70 +a 65 +ھ 303 +8 53 +ع 267 +٧ 50 +/ 49 +7 43 +c 38 +o 34 +: 31 +" 30 +p 27 +n 27 +t 27 +s 26 +d 26 +h 23 +, 22 +' 21 +i 20 +r 19 +v 19 +u 17 +f 17 +e 16 +ى 9 +[ 14 +] 14 +l 13 +m 13 +؛ 12 +g 11 +‏ 11 +b 10 +x 9 +k 8 +w 7 +% 6 +! 6 +y 6 +j 4 +؟ 3 +‌ 2 +ـ 2 ++ 2 +õ 2 +ء 2 +q 1 +z 1 +ي 80 +ؤ 1 +ث 1 diff --git a/mms-1b-all/cmn-script_simplified/lexicon.txt b/mms-1b-all/cmn-script_simplified/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/mms-1b-all/cmn-script_simplified/lexicon.txt @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/mms-1b-all/cmn-script_simplified/tokens.txt b/mms-1b-all/cmn-script_simplified/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..5435b271a8cc66889988bed39d13da019157d07a --- /dev/null +++ b/mms-1b-all/cmn-script_simplified/tokens.txt @@ -0,0 +1,4491 @@ +的 13574 +在 2652 +一 5821 +是 5720 +有 2085 +和 1875 +了 1065 +人 3013 +这 1036 +a 1 +0 793 +为 3880 +大 2312 +地 1832 +国 3787 +可 890 +他 1267 +1 615 +以 1280 +不 1045 +时 1225 +会 1247 +能 578 +e 554 +到 663 +上 1097 +中 2948 +个 3275 +行 936 +们 395 +n 507 +o 502 +r 492 +于 2651 +发 783 +多 919 +生 1087 +年 1437 +s 433 +来 845 +要 795 +其 837 +斯 1728 +里 714 +i 385 +2 373 +家 1132 +成 1062 +动 934 +用 963 +些 327 +法 758 +出 1111 +t 344 +方 666 +l 337 +对 628 +过 519 +后 1260 +公 871 +物 1359 +之 1107 +也 856 +亚 1218 +或 323 +种 1466 +被 733 +学 1760 +常 448 +前 614 +现 791 +分 863 +自 705 +部 1300 +利 1186 +都 604 +但 237 +最 742 +得 501 +通 534 +而 627 +者 462 +9 271 +子 991 +h 265 +经 570 +因 487 +该 627 +你 91 +就 340 +尔 1604 +特 957 +如 157 +车 608 +5 244 +所 814 +并 386 +作 929 +比 405 +间 567 +表 588 +c 1 +很 221 +它 344 +世 495 +与 982 +加 840 +更 168 +当 426 +进 744 +然 204 +克 1008 +u 218 +区 1926 +球 624 +g 213 +下 1702 +度 413 +名 1295 +场 537 +小 576 +开 477 +体 613 +同 720 +d 209 +语 342 +提 277 +4 205 +3 205 +m 205 +重 453 +称 391 +从 264 +数 453 +意 264 +月 210 +道 693 +定 579 +机 549 +面 382 +美 1154 +新 569 +日 766 +使 493 +文 671 +8 193 +6 192 +将 419 +没 226 +本 1142 +受 258 +目 494 +理 613 +拉 977 +天 569 +入 370 +德 1031 +果 186 +力 420 +西 1268 +旅 73 +罗 701 +我 256 +第 1015 +次 425 +游 243 +队 432 +达 334 +英 347 +手 272 +着 280 +性 464 +由 687 +赛 437 +长 723 +马 736 +起 271 +式 432 +类 371 +示 566 +主 1192 +化 903 +内 700 +说 282 +位 1680 +相 426 +事 656 +此 608 +p 2 +正 343 +水 429 +员 844 +还 320 +电 601 +全 393 +布 562 +海 676 +司 449 +非 529 +应 295 +原 426 +尼 682 +期 556 +外 451 +空 263 +高 619 +议 244 +7 143 +k 142 +战 453 +兰 645 +明 609 +活 238 +教 767 +保 317 +至 304 +管 334 +只 187 +关 481 +情 188 +心 337 +点 342 +样 163 +量 271 +岛 293 +许 189 +那 261 +已 260 +需 98 +约 217 +联 477 +b 130 +认 255 +报 173 +向 245 +变 749 +制 815 +往 150 +节 220 +件 243 +去 191 +军 715 +基 496 +接 282 +城 697 +建 926 +格 540 +选 279 +等 379 +民 977 +看 113 +每 120 +纪 149 +运 402 +科 1875 +奥 409 +代 833 +巴 709 +界 293 +金 413 +航 144 +二 737 +产 415 +据 138 +市 1589 +工 491 +业 733 +阿 739 +合 435 +组 346 +统 346 +计 293 +处 213 +任 878 +持 191 +知 274 +立 467 +实 276 +身 415 +供 131 +风 180 +周 158 +系 507 +政 886 +北 842 +及 716 +几 121 +南 1052 +让 118 +程 293 +三 851 +造 217 +洲 231 +山 851 +获 183 +指 460 +气 168 +少 191 +十 966 +流 267 +星 257 +交 301 +问 140 +尽 29 +f 101 +y 100 +务 264 +好 199 +两 474 +型 228 +州 1716 +影 367 +印 163 +住 131 +无 256 +始 233 +举 201 +速 116 +做 122 +图 652 +查 207 +回 189 +直 236 +维 411 +太 321 +飞 107 +带 245 +光 296 +必 64 +安 539 +己 126 +证 150 +况 64 +今 175 +结 244 +标 185 +击 147 +先 163 +形 345 +解 201 +群 182 +存 203 +取 138 +设 537 +路 569 +平 401 +决 148 +士 737 +近 249 +传 293 +卡 384 +项 184 +网 190 +随 157 +言 143 +感 100 +题 155 +放 178 +伊 382 +头 207 +参 329 +未 155 +口 719 +病 111 +致 164 +确 88 +客 112 +么 81 +雪 56 +包 208 +季 92 +足 264 +历 506 +土 173 +万 206 +导 228 +信 228 +线 429 +像 125 +何 125 +响 124 +规 137 +且 104 +预 69 +座 328 +号 237 +别 202 +各 248 +打 99 +船 56 +想 106 +站 700 +欧 164 +元 289 +普 379 +视 249 +领 241 +括 124 +v 71 +收 147 +助 116 +强 150 +纳 354 +共 453 +字 386 +总 340 +史 425 +神 254 +服 169 +米 221 +府 566 +记 162 +消 68 +古 369 +冰 38 +洛 239 +威 168 +居 284 +票 64 +w 69 +推 112 +堡 176 +引 134 +质 188 +官 317 +告 87 +显 100 +观 188 +萨 364 +河 411 +热 139 +院 487 +律 110 +状 144 +演 307 +离 107 +东 852 +台 570 +局 204 +即 111 +求 104 +越 158 +习 113 +酒 78 +远 90 +警 95 +支 174 +治 664 +曾 500 +份 144 +才 109 +塔 255 +圣 401 +夫 318 +难 71 +希 202 +织 164 +乐 357 +见 175 +把 85 +火 130 +镜 36 +反 221 +野 134 +失 80 +片 201 +抗 91 +门 305 +价 66 +细 101 +首 382 +波 292 +论 200 +医 141 +准 98 +, 61 +容 128 +根 205 +品 278 +试 208 +极 123 +步 96 +林 442 +话 113 +争 150 +班 136 +四 512 +j 58 +级 321 +具 156 +较 119 +完 138 +富 103 +测 116 +园 229 +案 106 +权 154 +转 158 +候 97 +创 175 +续 126 +单 285 +爱 173 +适 48 +费 117 +孩 47 +例 73 +术 228 +商 222 +易 133 +边 157 +户 125 +伤 54 +塞 221 +音 274 +划 114 +石 324 +则 245 +团 253 +故 179 +器 199 +严 64 +给 145 +潜 26 +博 234 +独 127 +识 75 +仅 88 +王 466 +围 130 +装 148 +思 100 +卫 193 +华 478 +须 75 +察 147 +控 114 +复 165 +展 226 +食 143 +早 179 +帮 53 +落 102 +降 58 +便 68 +声 127 +油 69 +义 332 +调 151 +低 80 +互 67 +什 113 +书 291 +签 32 +快 68 +滑 40 +备 129 +染 43 +诉 24 +照 72 +改 264 +命 213 +险 45 +陆 156 +投 107 +验 76 +轻 62 +条 191 +源 180 +育 192 +请 68 +坦 122 +营 149 +望 81 +老 117 +超 90 +防 108 +景 154 +毒 65 +切 90 +态 107 +列 522 +际 166 +域 178 +初 174 +庭 81 +构 220 +广 381 +苏 260 +众 115 +洋 130 +闻 88 +某 59 +甚 33 +她 258 +象 143 +附 146 +护 146 +办 161 +阳 212 +读 99 +模 185 +移 87 +女 414 +伦 164 +色 487 +千 112 +考 108 +哥 169 +射 113 +研 261 +究 241 +诺 206 +核 80 +乎 39 +鲁 178 +牙 136 +资 214 +播 123 +精 86 +走 61 +龙 293 +算 105 +仍 85 +令 118 +除 93 +留 109 +息 108 +盟 117 +房 84 +瓦 286 +花 400 +底 71 +母 250 +张 211 +埃 201 +愿 35 +白 219 +坏 26 +修 143 +连 171 +艇 14 +录 165 +断 67 +志 115 +死 120 +集 220 +虽 26 +限 133 +送 46 +伯 161 +店 100 +雷 271 +姆 176 +晚 67 +胜 88 +写 90 +济 156 +澳 88 +乘 53 +暴 41 +托 181 +弗 134 +功 124 +夏 70 +再 106 +够 50 +专 266 +汽 33 +宗 254 +绝 56 +索 154 +境 155 +著 87 +施 137 +整 97 +密 154 +止 60 +帕 142 +社 276 +- 3 +沙 194 +钟 48 +注 93 +担 226 +素 119 +耳 124 +距 28 +满 85 +黑 187 +哈 182 +蒂 117 +穿 40 +久 86 +协 149 +跑 20 +微 74 +危 31 +检 75 +庙 69 +效 143 +环 175 +宣 123 +真 165 +缺 45 +置 191 +露 30 +舰 98 +属 1739 +馆 218 +角 227 +追 51 +瑞 131 +序 67 +却 37 +技 134 +翰 41 +洞 37 +败 47 +破 60 +监 118 +俄 195 +亡 63 +念 104 +顿 153 +访 37 +勒 264 +杰 88 +似 103 +奇 131 +增 76 +右 65 +词 80 +另 124 +骑 30 +登 257 +害 60 +徒 47 +换 52 +养 58 +湾 255 +赫 81 +束 52 +焦 23 +校 320 +戈 75 +遗 93 +驶 19 +盖 114 +帝 182 +松 124 +舱 9 +尤 52 +z 31 +木 190 +皮 175 +震 58 +树 116 +典 130 +救 41 +迪 139 +左 90 +驾 19 +炸 31 +码 52 +侵 36 +师 233 +岁 49 +段 124 +势 71 +脑 48 +拿 114 +遇 43 +饮 24 +雨 22 +待 44 +背 77 +找 57 +免 45 +族 312 +短 117 +吃 49 +责 94 +艺 153 +率 97 +麦 85 +激 49 +沿 68 +架 50 +银 107 +评 124 +又 172 +欢 62 +排 95 +购 57 +买 60 +终 127 +毛 233 +简 69 +胞 50 +坐 38 +清 334 +皇 135 +丹 122 +判 56 +摄 58 +儿 191 +x 27 +停 79 +承 115 +半 105 +餐 26 +税 23 +叫 42 +估 16 +临 71 +戏 162 +额 41 +讲 70 +积 82 +植 272 +双 140 +港 249 +袭 43 +恐 33 +频 65 +略 46 +温 126 +宿 34 +五 361 +版 156 +闭 31 +轮 61 +腊 60 +捕 43 +亿 1 +莱 269 +邮 37 +料 110 +夜 52 +订 20 +眼 81 +批 67 +胁 13 +鞋 7 +块 31 +委 280 +摩 101 +森 215 +曼 133 +拥 82 +紧 35 +优 74 +卢 90 +患 37 +症 52 +练 75 +阅 7 +省 631 +味 44 +镇 835 +迫 29 +私 57 +攻 80 +恶 25 +挥 47 +觉 70 +桥 109 +窗 20 +谷 75 +疫 22 +职 229 +援 45 +蒙 184 +障 22 +吉 211 +返 31 +灯 55 +训 59 +湖 219 +输 69 +负 89 +疗 61 +继 121 +择 57 +层 122 +男 147 +竞 37 +授 136 +疾 32 +压 60 +扎 54 +寻 32 +否 48 +佳 58 +% 23 +亲 224 +余 71 +街 187 +唯 77 +冷 25 +菌 64 +午 18 +友 101 +靠 37 +智 44 +贝 195 +趣 31 +罪 42 +厅 57 +疑 32 +依 80 +. 22 +升 132 +毁 21 +般 103 +画 206 +享 35 +福 458 +板 55 +裂 47 +值 61 +筑 186 +岩 76 +六 266 +述 121 +异 106 +艾 74 +农 240 +触 31 +京 293 +箱 12 +署 47 +益 44 +爆 42 +丁 108 +审 52 +乔 68 +肯 171 +旋 55 +冠 76 +尚 104 +液 44 +范 136 +杂 54 +歌 165 +俗 23 +探 44 +坡 44 +熟 24 +奖 186 +退 55 +虑 10 +货 46 +冬 36 +寺 125 +端 91 +采 106 +犯 40 +针 30 +屿 22 +钱 66 +脉 55 +冲 57 +胡 90 +藏 118 +童 48 +虫 158 +督 108 +邻 43 +兵 147 +迎 38 +载 65 +鸟 65 +朗 143 +党 379 +泡 50 +牧 27 +听 43 +七 182 +岸 73 +急 26 +莫 148 +喜 73 +玩 49 +刀 16 +猫 25 +跨 27 +武 245 +吸 56 +撞 27 +恩 155 +耶 142 +媒 55 +室 149 +付 35 +巨 75 +笔 44 +幕 54 +佛 117 +彩 30 +损 29 +拍 55 +九 198 +避 30 +绕 25 +纽 98 +逐 39 +宇 42 +漏 10 +墓 83 +封 100 +突 66 +禁 38 +邦 179 +泰 178 +汗 35 +深 105 +烈 60 +差 61 +芬 60 +屋 65 +陡 2 +赢 26 +血 75 +困 32 +编 145 +污 18 +裁 38 +截 15 +床 29 +陷 27 +驱 26 +迹 47 +健 30 +梵 9 +艘 11 +拒 16 +览 38 +顶 76 +征 93 +散 64 +盛 77 +融 33 +挪 13 +按 60 +归 80 +陵 70 +副 108 +充 42 +剧 207 +延 70 +辆 36 +途 40 +执 59 +隆 189 +透 45 +猎 24 +址 100 +搜 17 +涉 27 +毕 183 +折 31 +俱 65 +遭 56 +弱 12 +兴 167 +句 19 +菜 109 +妇 23 +减 35 +违 11 +翻 33 +敦 62 +” 17 +殿 82 +夺 38 +雅 81 +策 74 +卷 44 +茨 88 +含 69 +禽 16 +坠 5 +占 58 +释 45 +朝 574 +脚 38 +聚 56 +携 7 +呼 37 +百 201 +姿 8 +坚 48 +弓 11 +补 46 +派 210 +售 39 +隐 41 +秘 50 +刚 35 +财 54 +逊 53 +促 23 +杀 81 +席 90 +峡 28 +启 38 +暂 19 +逃 29 +稳 32 +齐 80 +乱 24 +既 7 +飓 10 +幸 35 +尝 13 +彼 23 +柏 44 +磁 41 +彻 19 +灵 92 +红 164 +枪 42 +睡 14 +舒 17 +呈 45 +寄 29 +庆 135 +泄 3 +撒 23 +租 15 +婚 59 +鹿 31 +库 169 +汤 32 +侧 90 +草 345 +溜 15 +刃 2 +稀 18 +瀑 6 +迅 21 +霍 87 +舞 64 +宪 51 +漫 58 +荷 80 +碎 14 +饰 42 +瓜 39 +绩 21 +悉 12 +努 47 +描 65 +脏 16 +菲 79 +革 88 +韦 106 +丝 46 +犹 48 +弹 61 +香 287 +永 121 +允 27 +配 112 +啸 4 +桶 1 +牛 104 +狱 17 +弯 10 +痛 19 +: 14 +嫌 11 +恢 20 +云 120 +耐 5 +软 70 +李 206 +橄 4 +榄 8 +宝 86 +阻 34 +宽 37 +络 57 +父 279 +鱼 230 +浓 20 +倍 12 +汉 273 +暗 40 +盘 53 +贵 80 +康 135 +休 45 +仪 49 +狮 15 +豹 18 +赞 36 +符 44 +村 292 +顾 46 +礼 99 +销 44 +汇 42 +错 37 +措 17 +抵 31 +尿 15 +猛 10 +覆 27 +墙 38 +峭 7 +壁 33 +惧 6 +穴 16 +恒 33 +繁 35 +倾 14 +您 13 +赖 53 +横 45 +赌 8 +泽 92 +操 45 +锡 51 +乡 306 +惯 16 +衡 30 +醒 6 +诸 43 +君 68 +干 113 +碍 18 +溯 9 +顺 90 +诊 6 +抢 9 +羊 42 +琴 45 +妻 68 +款 54 +宜 27 +舍 56 +良 71 +守 102 +梦 39 +葬 41 +锦 57 +课 50 +峰 57 +浪 34 +固 42 +填 10 +阶 70 +译 41 +默 54 +涂 12 +叶 292 +均 102 +曲 197 +档 23 +旧 88 +摧 5 +斜 16 +跟 53 +荣 77 +刺 84 +牌 59 +隔 28 +咖 16 +麻 62 +骨 74 +绘 34 +振 30 +苦 33 +廷 72 +遥 13 +剩 14 +榜 27 +泛 55 +申 54 +喷 12 +阵 45 +奏 53 +纸 24 +稍 9 +铁 303 +昂 62 +鉴 12 +贸 33 +垒 12 +炎 25 +韩 96 +堂 253 +劣 9 +八 252 +丰 90 +殖 40 +煤 11 +撤 25 +祝 19 +龄 14 +毫 7 +虐 4 +殊 27 +烧 26 +黎 64 +尺 20 +蚊 4 +寨 33 +忧 11 +祸 5 +末 84 +楼 113 +慢 20 +柔 22 +旗 87 +穆 84 +厢 6 +央 141 +刻 49 +丘 34 +烦 5 +薄 19 +啡 11 +惠 38 +肺 6 +净 6 +磅 3 +键 33 +辑 72 +扫 8 +墨 45 +乏 12 +缘 45 +静 30 +叠 11 +闲 9 +善 65 +厄 43 +丽 98 +唱 69 +骗 10 +怀 45 +讼 3 +爪 14 +寒 19 +讯 61 +旺 29 +粒 25 +借 22 +削 10 +拜 66 +哪 12 +恰 14 +滩 13 +镖 4 +赏 25 +秀 53 +祖 172 +鸦 6 +衣 38 +秒 5 +洪 55 +谈 23 +柬 17 +埔 24 +径 41 +乌 153 +钉 3 +假 67 +脱 30 +迟 8 +玻 20 +璃 16 +础 42 +娜 42 +辐 46 +爵 65 +籍 173 +谢 84 +拟 39 +寓 9 +敲 3 +暖 15 +宁 157 +硬 35 +楚 40 +冈 63 +廉 27 +沉 29 +桑 169 +凯 65 +药 72 +苹 8 +鼓 19 +尊 25 +糖 34 +颁 34 +缓 15 +/ 9 +狼 21 +燃 22 +藻 27 +储 26 +剪 9 +斗 75 +宾 50 +蚁 5 +吧 15 +' 5 +闹 2 +丧 16 +疼 3 +珀 17 +怎 21 +杉 41 +矶 24 +刑 31 +誉 38 +扩 43 +仰 24 +敏 33 +掉 22 +亦 196 +摆 6 +怕 12 +役 55 +癌 16 +í 8 +氏 332 +泳 17 +跃 26 +颗 22 +培 40 +惊 18 +幅 22 +详 32 +昨 4 +概 40 +堵 5 +懂 5 +框 7 +莎 48 +册 24 +川 222 +伽 14 +猩 10 +绿 50 +姻 15 +葡 58 +萄 56 +辛 54 +剥 2 +晓 15 +拘 6 +误 36 +潮 55 +伸 18 +掷 3 +厚 27 +夷 20 +浮 26 +泊 32 +拨 7 +熊 26 +玛 70 +灾 24 +押 7 +替 26 +• 8 +剑 27 +巢 16 +堆 15 +粗 46 +珍 34 +谋 23 +宴 4 +湿 23 +丢 7 +堕 2 +敌 27 +答 13 +盗 13 +酸 80 +鲜 80 +崎 40 +衔 27 +乳 53 +妮 17 +虎 87 +荡 9 +荐 10 +匈 22 +虚 23 +钢 34 +垫 4 +饭 29 +旁 36 +逮 10 +膀 4 +椅 6 +宙 18 +q 7 +佐 44 +棒 40 +朋 23 +榻 1 +戴 45 +夕 9 +企 58 +肩 9 +材 48 +讨 41 +坎 25 +搭 18 +匹 36 +旦 16 +劳 82 +吐 10 +踪 11 +娱 22 +梅 137 +腰 9 +沃 159 +贯 18 +棕 22 +鹰 19 +齿 54 +忙 18 +励 19 +貌 14 +曝 3 +握 21 +旨 19 +厉 7 +罚 13 +偷 5 +圭 15 +矛 21 +篮 45 +靴 6 +胎 12 +挑 25 +逝 29 +詹 18 +渡 39 +梁 72 +缆 4 +畅 10 +藤 95 +仁 67 +剂 31 +箭 39 +荒 15 +奶 16 +苗 25 +崇 47 +页 21 +吗 22 +瓶 8 +黄 252 +窑 6 +烤 4 +伏 32 +拼 7 +奎 14 +胶 33 +芯 8 +挤 2 +昆 46 +氮 12 +献 40 +闪 12 +坟 15 +朱 60 +颜 46 +歉 6 +巧 12 +陪 6 +臭 8 +裙 5 +怪 19 +罕 12 +贴 13 +弃 23 +缝 7 +隙 5 +裔 41 +拖 8 +纷 16 +杯 42 +郁 6 +亨 29 +帐 8 +池 45 +蛋 38 +欣 13 +悬 23 +鼠 57 +阀 5 +兹 79 +偏 38 +掌 26 +塑 17 +铺 12 +鸡 17 +屠 16 +宰 13 +咬 4 +虹 5 +— 4 +寸 5 +锻 3 +炼 9 +媲 6 +壳 57 +凉 17 +币 39 +捷 39 +疆 39 +厂 71 +巡 54 +塌 3 +亮 43 +憾 2 +秃 1 +õ 6 +烹 1 +饪 1 +斤 5 +睛 7 +刊 19 +谎 3 +瑟 51 +擅 14 +凌 11 +轴 33 +枚 13 +甲 292 +葱 11 +讽 6 +混 42 +° 5 +伐 95 +孙 96 +辨 7 +绪 43 +滋 9 +斐 14 +御 66 +掠 12 +惜 5 +雇 11 +吊 8 +丛 16 +循 24 +贪 8 +嘲 5 +豪 38 +挡 7 +羽 60 +栋 19 +脸 11 +驼 11 +棵 2 +旱 4 +聊 9 +贺 26 +诗 58 +勘 4 +予 49 +疏 33 +遍 27 +卸 5 +伴 19 +咎 2 +奔 6 +镑 5 +赤 41 +钳 5 +迁 56 +陶 37 +袋 23 +昼 3 +妨 3 +· 5 +坑 28 +囚 2 +辞 18 +仇 9 +氢 26 +佩 65 +粉 59 +吞 10 +残 29 +跌 1 +贾 26 +践 13 +涵 10 +逻 20 +尘 7 +邓 26 +坍 5 +坛 22 +腐 14 +% 5 +鞍 9 +晨 5 +悠 18 +噬 3 +榈 9 +轨 56 +歧 12 +酋 14 +糟 4 +糕 19 +仿 23 +刷 16 +庞 20 +忆 17 +逗 3 +崩 5 +悟 4 +粘 7 +兼 63 +沫 13 +姓 56 +溪 79 +爬 17 +跳 33 +攀 10 +青 199 +滤 7 +缠 5 +圈 41 +笑 20 +鹅 24 +卵 16 +娅 5 +届 85 +肝 13 +祭 23 +驻 103 +怖 12 +忘 7 +抽 17 +篇 20 +抑 11 +谨 8 +慎 7 +诚 14 +聪 12 +戒 10 +腹 49 +盎 4 +腿 9 +壮 32 +暑 4 +抚 26 +漠 11 +麋 4 +幽 13 +掩 4 +倒 19 +贡 58 +厌 7 +涸 4 +宅 24 +犒 4 +肠 19 +氛 4 +猪 32 +逆 11 +碳 19 +添 11 +偶 22 +妹 46 +田 198 +勤 28 +尸 7 +抛 5 +裹 2 +谐 4 +犬 16 +氧 50 +涛 4 +锅 8 +夸 12 +仔 27 +鸭 10 +驯 3 +晰 3 +饱 7 +遮 6 +缩 21 +嘴 14 +尾 114 +鲍 11 +咨 4 +询 12 +仑 17 +竟 4 +陌 3 +轿 6 +兑 1 +润 14 +拔 32 +擦 7 +袖 18 +铅 6 +愤 3 +卉 4 +贫 22 +滚 17 +壤 14 +洁 14 +迈 33 +啮 4 +斑 36 +春 104 +奢 3 +穷 14 +扰 19 +茶 68 +鲨 14 +镫 4 +涯 34 +樱 11 +扬 32 +栏 11 +蜃 1 +颇 10 +蛇 29 +宠 15 +氰 6 +鼎 15 +滕 27 +阱 5 +删 8 +哺 25 +摔 5 +雄 77 +夹 16 +疲 3 +竭 7 +蹈 13 +眨 3 +颠 7 +碰 2 +歇 23 +捉 5 +兄 70 +弟 113 +疟 3 +腺 26 +裕 22 +僧 15 +囊 11 +勋 26 +丈 17 +捐 11 +赠 19 +泥 19 +寂 5 +芝 38 +臣 68 +肥 23 +函 31 +茜 33 +俚 3 +踢 3 +魅 3 +庄 85 +辈 13 +奉 49 +鼻 14 +窃 2 +奋 5 +趁 3 +盾 46 +盲 18 +渔 16 +阪 22 +姐 32 +劫 4 +彰 12 +泵 11 +溢 1 +鞭 13 +狩 5 +徙 7 +沼 10 +枝 37 +饲 7 +: 3 +藉 15 +歪 2 +搞 6 +杜 142 +雕 32 +凹 21 +抓 4 +邀 19 +獭 1 +柜 8 +é 3 +屏 19 +吓 4 +陨 15 +瓣 20 +遣 27 +慨 3 +若 54 +罄 3 +竹 85 +琳 22 +搬 21 +锁 17 +隧 11 +燥 9 +废 48 +纵 14 +忒 4 +弥 14 +驳 7 +卧 8 +眠 9 +勾 4 +丑 18 +宛 9 ++ 3 +邸 7 +唐 140 +麾 2 +氨 19 +驰 3 +喉 6 +咙 3 +缪 1 +碑 34 +焰 4 +搁 1 +浅 22 +介 115 +吁 4 +堤 4 +漂 10 +诈 4 +撰 15 +渐 38 +裸 6 +锋 34 +瞩 3 +矩 23 +猿 10 +牵 14 +冒 10 +撑 3 +狗 30 +炮 44 +插 25 +紊 1 +蚀 2 +蜿 2 +蜒 2 +弄 19 +股 69 +柱 44 +亏 4 +愚 2 +蠢 3 +伟 27 +厕 5 +缭 2 +喱 2 +椰 5 +笨 1 +侯 34 +拴 2 +趾 6 +渊 10 +蒸 7 +抄 7 +摸 5 +勿 5 +捶 2 +碾 2 +滥 2 +翅 24 +诞 12 +吻 16 +巷 3 +厘 3 +县 1649 +吴 90 +耗 9 +萦 2 +婆 40 +愈 12 +陀 21 +螺 97 +剖 3 +吾 22 +淘 13 +汰 11 +胆 30 +膝 4 +叹 3 +绵 14 +拓 18 +灭 40 +乞 8 +狂 7 +炉 13 +鸫 2 +炭 8 +综 28 +帜 4 +橇 1 +帅 15 +掘 7 +淌 2 +漆 4 +趟 2 +招 19 +募 3 +阴 38 +腓 8 +硕 25 +铜 30 +洒 3 +呕 2 +彗 6 +岭 33 +岖 1 +梯 25 +俯 2 +瞰 2 +崖 17 +铃 24 +ó 2 +亥 21 +串 15 +卖 29 +挟 2 +狄 21 +醉 6 +飘 5 +珠 48 +遐 1 +迩 2 +烟 26 +拐 2 +裤 1 +掏 1 +揭 11 +纱 5 +猞 2 +猁 2 +& 2 +慈 24 +诱 6 +扔 3 +赶 13 +赴 22 +喀 25 +弦 16 +璀 2 +璨 2 +欺 4 +淹 3 +禄 23 +遂 16 +纹 49 +馊 2 +喙 6 +耸 2 +泻 3 +诅 2 +咒 6 +杞 2 +沮 2 +粮 17 +佣 4 +稻 15 +蔗 4 +阔 14 +浩 21 +瀚 4 +垠 2 +绸 4 +勃 27 +赔 4 +吼 2 +逼 4 +聘 8 +撼 1 +蜂 15 +亭 18 +映 38 +; 2 +兔 13 +羚 1 +卓 20 +番 22 +晴 6 +啤 6 +撕 3 +肆 3 +懒 5 +鹦 7 +鹉 8 +盆 17 +吵 2 +痴 1 +洽 1 +柯 25 +迄 8 +召 32 +、 2 +糊 7 +肿 12 +瘤 19 +诀 2 +章 97 +隶 100 +堪 74 +摇 16 +婪 1 +津 75 +魁 17 +拦 2 +猜 3 +敛 1 +洗 17 +哦 1 +勇 19 +敢 8 +馈 1 +缴 8 +喘 1 +肴 2 +孔 64 +雀 36 +蜗 21 +榛 1 +毯 1 +债 8 +券 16 +蓄 6 +豁 2 +贷 8 +辅 28 +扭 6 +肢 16 +挽 6 +疽 2 +枢 25 +寰 1 +顷 1 +膨 5 +胀 10 +旬 1 +玉 84 +柿 9 +豆 119 +盔 5 +甄 3 +湍 1 +喂 2 +擎 12 +甘 62 +菊 94 +奸 4 +扮 12 +甩 2 +篷 5 +茵 5 +瑙 16 +崔 31 +呢 11 +匣 2 +缉 3 +桃 60 +棍 3 +篡 2 +挖 7 +翱 2 +翔 12 +忽 6 +摘 3 +伍 32 +镳 2 +拷 1 +逛 2 +薪 8 +骚 5 +敞 3 +钻 14 +抱 11 +韵 6 +艳 8 +抿 2 +肉 47 +磷 10 +硫 14 +狭 15 +窄 4 +莨 2 +菪 2 +躲 9 +欲 10 +肚 6 +猴 12 +凭 13 +棱 13 +臂 12 +贬 7 +尖 45 +哭 12 +屈 25 +栅 4 +桩 2 +诩 2 +涅 53 +锐 10 +杆 18 +蝎 1 +哇 6 +胺 12 +坷 2 +巅 1 +榨 4 +汁 9 +昔 9 +赁 1 +酱 8 +甜 8 +蝽 20 +哲 45 +秩 6 +侦 12 +虔 2 +遏 2 +垃 3 +圾 3 +割 9 +垩 6 +侈 2 +纯 17 +粹 15 +挂 12 +礁 12 +俘 14 +虏 1 +铀 3 +涌 2 +棘 26 +凶 6 +冻 5 +泪 7 +盈 4 +眶 1 +熨 2 +婴 14 +媚 2 +宏 12 +株 14 +砂 18 +盐 42 +氯 11 +钙 4 +套 38 +乃 24 +筋 8 +醇 16 +浴 7 +祇 1 +汀 8 +贿 5 +筒 25 +莉 29 +雌 16 +孟 42 +榴 5 +姬 12 +汐 8 +磨 17 +《 1 +》 1 +艰 4 +泉 53 +牺 4 +牲 6 +零 40 +暹 3 +跋 7 +贼 11 +惕 4 +蚂 4 +惬 1 +惨 4 +惑 3 +骸 9 +ł 1 +踵 1 +遵 19 +郊 32 +罂 10 +粟 12 +碱 10 +蔡 29 +乍 3 +橱 1 +钥 3 +匙 3 +畜 5 +颅 6 +凡 18 +宫 130 +墟 2 +蹄 36 +牢 2 +蕾 11 +扶 14 +噱 1 +疯 1 +烫 1 +袜 2 +吹 6 +祈 5 +祷 3 +巩 3 +冯 19 +蔓 7 +嫩 7 +钛 2 +妙 11 +迷 24 +姊 8 +淡 29 +棺 2 +江 403 +辖 318 +郡 211 +蕨 183 +嘉 172 +蛛 167 +陈 140 +蟹 130 +宋 113 +杨 109 +刘 103 +町 91 +昌 90 +靖 87 +紫 83 +圆 82 +浙 80 +滨 80 +凤 74 +魏 73 +郎 70 +奈 69 +畔 69 +蝶 68 +晋 67 +禾 65 +鹃 64 +徽 63 +董 61 +柳 61 +赵 59 +藓 59 +郑 57 +蓝 56 +井 55 +棋 55 +鳞 54 +翼 53 +绍 52 +徐 51 +吕 51 +乙 50 +灰 50 +秦 50 +辽 49 +仙 49 +叙 48 +皆 48 +秋 48 +虾 47 +阮 47 +陕 47 +幼 46 +乾 45 +熙 45 +析 45 +郭 44 +仓 44 +沟 43 +莲 43 +唇 43 +纲 42 +矿 41 +讷 41 +栖 40 +芒 40 +弘 39 +浦 39 +贞 38 +潘 38 +寿 38 +阁 37 +珊 37 +甸 36 +苣 36 +鳍 36 +翁 35 +缅 35 +苔 35 +奴 35 +曹 35 +茂 34 +蛾 34 +卒 33 +姜 33 +忠 33 +藩 33 +敬 33 +侍 32 +桂 32 +仕 31 +蚤 31 +萧 31 +瑚 31 +龟 30 +溶 30 +恋 30 +厦 30 +堇 30 +昭 29 +胸 29 +晶 29 +孝 29 +祥 29 +桐 29 +契 28 +穗 28 +塘 28 +彭 28 +岳 27 +咸 27 +累 27 +辰 27 +蒲 27 +葛 27 +肃 27 +卿 26 +械 26 +麟 26 +碘 26 +杭 25 +玄 25 +柄 25 +沈 25 +茅 25 +蜱 25 +萤 24 +膜 24 +魔 24 +兽 24 +蛤 24 +茛 24 +鳅 24 +茎 23 +薇 23 +叉 23 +谱 23 +汝 23 +腔 23 +雍 23 +垂 22 +脂 22 +鹤 22 +彦 22 +伞 22 +链 22 +仲 22 +酶 22 +尉 21 +萼 21 +扁 21 +傅 21 +丙 21 +燕 21 +脊 21 +祠 21 +笼 21 +荆 21 +蝠 21 +葵 21 +娘 21 +槭 21 +芳 20 +孢 20 +冕 20 +霸 20 +蜜 20 +妈 20 +瓷 20 +蔷 19 +廊 19 +鬼 19 +铭 19 +卑 19 +慕 19 +柴 19 +椒 19 +蒋 19 +佑 19 +蜥 19 +姚 19 +肌 19 +耀 19 +樟 19 +赐 19 +岗 19 +谦 19 +蒿 19 +忍 19 +拱 19 +凰 18 +檗 18 +钦 18 +辉 18 +琉 18 +锥 18 +袁 18 +桔 18 +戟 18 +拳 18 +蚜 18 +狸 18 +钩 17 +扑 17 +芦 17 +甫 17 +绳 17 +贤 17 +潭 17 +隋 17 +俊 17 +拆 17 +汪 17 +叛 17 +巫 17 +妃 17 +娃 17 +肖 17 +卯 17 +梨 17 +蚓 17 +迦 17 +赋 16 +绣 16 +递 16 +戌 16 +屡 16 +兆 16 +尹 16 +爸 16 +浏 16 +谓 16 +蓬 16 +淮 16 +蜡 16 +坊 16 +催 16 +绒 15 +丞 15 +壬 15 +嫁 15 +芹 15 +儒 15 +蛄 15 +沧 15 +卜 15 +滇 15 +鼩 15 +芋 14 +幻 14 +鸣 14 +舌 14 +戊 14 +腾 14 +侄 14 +雾 14 +肾 14 +颈 14 +湘 14 +慧 14 +嗣 14 +铸 14 +邑 14 +橙 14 +鸿 14 +颖 14 +薹 14 +鲤 14 +屯 14 +坞 13 +酯 13 +诏 13 +叔 13 +恭 13 +褐 13 +梭 13 +衍 13 +坝 13 +岑 13 +薯 13 +邵 13 +蛙 13 +毅 13 +癸 13 +苞 13 +榆 13 +芥 13 +丸 13 +鲷 13 +祯 13 +硝 13 +蕉 13 +蛱 13 +仆 12 +坪 12 +祀 12 +眉 12 +庚 12 +闸 12 +坂 12 +氟 12 +別 12 +阜 12 +噶 12 +纺 12 +妳 12 +稣 12 +蝉 12 +稚 12 +禅 12 +橡 12 +苯 12 +绥 12 +帽 12 +虱 12 +鞘 12 +鳗 12 +烷 12 +札 11 +衰 11 +翠 11 +赚 11 +朔 11 +蜀 11 +薛 11 +蓼 11 +姑 11 +渠 11 +皿 11 +渥 11 +茄 11 +披 11 +魂 11 +荨 11 +镶 11 +吏 11 +侨 11 +羌 11 +娶 11 +铝 11 +闽 11 +逸 11 +苑 10 +琼 10 +啊 10 +稿 10 +绑 10 +朴 10 +喝 10 +荠 10 +淀 10 +妖 10 +槐 10 +苍 10 +埋 10 +邱 10 +凝 10 +灌 10 +耆 10 +趋 10 +肤 10 +厥 10 +蚕 10 +沪 10 +萝 10 +砖 10 +喇 10 +埠 10 +霉 10 +冶 10 +倡 10 +怒 10 +茹 10 +沂 10 +纤 10 +槟 10 +烛 10 +醛 10 +帆 9 +栗 9 +罢 9 +狐 9 +橘 9 +酰 9 +扇 9 +峨 9 +惟 9 +犁 9 +玫 9 +弼 9 +矮 9 +耕 9 +怡 9 +晖 9 +钝 9 +杖 9 +鲸 9 +霞 9 +慰 9 +谟 9 +岐 9 +舟 9 +伪 9 +菩 9 +踏 9 +垦 9 +蛭 9 +牡 9 +蕈 9 +墅 9 +楠 9 +邪 9 +荚 9 +蕊 9 +淳 9 +谭 9 +晃 9 +蚌 9 +鲈 9 +孤 9 +垣 9 +芽 9 +凸 9 +谊 9 +蒜 9 +烯 9 +烃 9 +肽 9 +浆 9 +悦 9 +裴 9 +殉 9 +棠 9 +酉 8 +盒 8 +碧 8 +寅 8 +漳 8 +襄 8 +淫 8 +饼 8 +谁 8 +粤 8 +竺 8 +芭 8 +澜 8 +驹 8 +贩 8 +彝 8 +蚬 8 +锯 8 +莹 8 +鄂 8 +笙 8 +梗 8 +忌 8 +椭 8 +悼 8 +酮 8 +讳 8 +皱 8 +鼱 8 +椎 8 +邢 8 +恨 8 +钠 8 +菱 8 +戎 8 +毗 8 +淄 8 +筹 8 +痕 8 +壶 8 +劝 8 +圳 8 +厝 8 +菀 8 +庐 8 +蟾 8 +藨 8 +蓟 7 +嵩 7 +莓 7 +庶 7 +枯 7 +廖 7 +尧 7 +侠 7 +庸 7 +酿 7 +孚 7 +羁 7 +犀 7 +胃 7 +斩 7 +褶 7 +豫 7 +匠 7 +锤 7 +蝇 7 +骤 7 +泌 7 +弧 7 +蛮 7 +苋 7 +蕴 7 +擢 7 +窝 7 +硅 7 +拂 7 +矾 7 +澎 7 +腭 7 +肋 7 +芪 7 +乂 7 +蔬 7 +冢 7 +悲 7 +禹 7 +铠 7 +纠 7 +準 7 +溧 7 +涡 7 +畸 7 +晕 7 +舜 7 +榕 7 +挝 7 +芙 7 +鳄 7 +噜 7 +庇 7 +鳕 7 +莆 7 +滴 7 +糙 7 +妆 7 +辟 7 +欠 7 +樽 7 +嗜 7 +麓 7 +熔 7 +逍 7 +醚 7 +哩 7 +柽 7 +瑜 7 +沁 7 +螯 7 +芩 7 +藜 7 +溴 7 +濑 6 +煮 6 +邨 6 +肛 6 +缕 6 +鹘 6 +妾 6 +藿 6 +镰 6 +葫 6 +蓉 6 +饶 6 +蕃 6 +檬 6 +黛 6 +琦 6 +沛 6 +酷 6 +琪 6 +伙 6 +孕 6 +芸 6 +渝 6 +彬 6 +窟 6 +罐 6 +坤 6 +涩 6 +畴 6 +茱 6 +赣 6 +桌 6 +浑 6 +禧 6 +妥 6 +笛 6 +蝙 6 +廿 6 +莪 6 +闵 6 +轩 6 +骏 6 +酬 6 +湄 6 +萍 6 +沅 6 +啄 6 +厨 6 +陇 6 +谴 6 +釜 6 +誓 6 +灼 6 +劲 6 +剿 6 +兖 6 +谏 6 +荫 6 +栽 6 +幢 6 +蜘 6 +桨 6 +匡 6 +斥 6 +谣 6 +庵 6 +谥 6 +肇 6 +笋 6 +虞 6 +肪 6 +抬 6 +梓 6 +斛 6 +瑰 6 +俞 6 +啦 6 +铎 6 +绛 6 +喻 6 +诛 6 +嫡 6 +衙 6 +鬣 6 +桤 6 +辩 6 +嵌 6 +蜍 6 +磐 6 +蟒 6 +酵 5 +匝 5 +梢 5 +荀 5 +阐 5 +饿 5 +惹 5 +縻 5 +黔 5 +婿 5 +沽 5 +纶 5 +昊 5 +懋 5 +隍 5 +翌 5 +竖 5 +赦 5 +皂 5 +傲 5 +翊 5 +秉 5 +黏 5 +煌 5 +窦 5 +楝 5 +笠 5 +履 5 +咏 5 +衷 5 +泾 5 +俭 5 +樊 5 +袍 5 +佥 5 +岱 5 +鹏 5 +晤 5 +娄 5 +懿 5 +毙 5 +枣 5 +柑 5 +寮 5 +轰 5 +爷 5 +劾 5 +阎 5 +岬 5 +淋 5 +酥 5 +桓 5 +豚 5 +桉 5 +蝴 5 +凿 5 +鳖 5 +淆 5 +耿 5 +鹭 5 +臧 5 +稽 5 +摊 5 +桦 5 +萌 5 +咕 5 +谍 5 +榧 5 +丫 5 +澄 5 +籽 5 +纂 5 +嘛 5 +矢 5 +跆 5 +锈 5 +颌 5 +棉 5 +於 5 +斋 5 +葶 5 +苈 5 +胤 5 +冀 5 +甬 5 +扣 5 +睦 5 +兜 5 +蔚 5 +晏 5 +侣 5 +隼 5 +钧 5 +睾 5 +獐 5 +帘 5 +畿 4 +菅 4 +鸠 4 +愉 4 +幌 4 +焊 4 +蔽 4 +鼹 4 +骼 4 +垄 4 +鸻 4 +匪 4 +劈 4 +烽 4 +斧 4 +岷 4 +媛 4 +涨 4 +侃 4 +雏 4 +甥 4 +柠 4 +舶 4 +葆 4 +琅 4 +铨 4 +酚 4 +溃 4 +朵 4 +锚 4 +羟 4 +昏 4 +笃 4 +腕 4 +颉 4 +畏 4 +椿 4 +镧 4 +隅 4 +扈 4 +碲 4 +廓 4 +谬 4 +戚 4 +涧 4 +墩 4 +槽 4 +炀 4 +т 4 +萸 4 +聂 4 +颤 4 +祺 4 +栩 4 +诃 4 +鑫 4 +秆 4 +匿 4 +爽 4 +斌 4 +颂 4 +稠 4 +奠 4 +偿 4 +敷 4 +荸 4 +鲫 4 +寇 4 +倪 4 +檐 4 +刹 4 +驴 4 +肼 4 +梾 4 +鸥 4 +聋 4 +惩 4 +牟 4 +菝 4 +葜 4 +孜 4 +哀 4 +菁 4 +凋 4 +炽 4 +傣 4 +矣 4 +僚 4 +柃 4 +碟 4 +勉 4 +莘 4 +镀 4 +翡 4 +歙 4 +邯 4 +苎 4 +銮 4 +瑛 4 +鲹 4 +哉 4 +旭 4 +凑 4 +韶 4 +锂 4 +喔 4 +勐 4 +驿 4 +琮 4 +唤 4 +伺 4 +玮 4 +霖 4 +辗 4 +韧 4 +髓 4 +胄 4 +淑 4 +猬 4 +汛 4 +麝 4 +舅 4 +梧 4 +嫔 4 +汶 4 +羧 4 +辣 4 +谜 4 +钮 4 +渭 4 +鲣 4 +溥 4 +闯 4 +乒 4 +椤 4 +呀 4 +戍 4 +蜕 4 +绅 4 +碗 4 +帛 4 +钯 4 +钾 4 +蔺 4 +谅 4 +釉 4 +崴 4 +鲡 4 +豌 4 +溲 4 +箬 4 +敖 4 +沸 3 +烂 3 +觅 3 +濒 3 +曷 3 +泗 3 +鸽 3 +嘎 3 +汾 3 +嘧 3 +啶 3 +扯 3 +臀 3 +兀 3 +筛 3 +谤 3 +拯 3 +祁 3 +胱 3 +荔 3 +玖 3 +戮 3 +灶 3 +嗓 3 +胰 3 +鲃 3 +妓 3 +佰 3 +匐 3 +剌 3 +枕 3 +玲 3 +孪 3 +掖 3 +谕 3 +孵 3 +憩 3 +舄 3 +捞 3 +凄 3 +菇 3 +颊 3 +宕 3 +稷 3 +泣 3 +脓 3 +镁 3 +纬 3 +肄 3 +卦 3 +幡 3 +浊 3 +龈 3 +晟 3 +叻 3 +郃 3 +膳 3 +挛 3 +竿 3 +慌 3 +皖 3 +璋 3 +腥 3 +莺 3 +濂 3 +瘦 3 +钕 3 +唑 3 +炬 3 +燮 3 +簇 3 +殷 3 +赈 3 +饥 3 +丕 3 +耧 3 +珂 3 +滞 3 +疣 3 +簧 3 +谛 3 +俩 3 +祚 3 +鞑 3 +靼 3 +羲 3 +蚶 3 +仗 3 +棚 3 +溞 3 +蒴 3 +鹳 3 +牻 3 +缔 3 +腧 3 +雉 3 +覃 3 +砌 3 +瓢 3 +狡 3 +纭 3 +棣 3 +歼 3 +帧 3 +珙 3 +莴 3 +亩 3 +蛹 3 +钰 3 +寞 3 +弊 3 +镊 3 +峻 3 +翟 3 +悖 3 +拾 3 +靛 3 +钵 3 +矫 3 +菏 3 +憧 3 +憬 3 +慷 3 +芮 3 +倭 3 +媳 3 +雯 3 +烨 3 +逢 3 +郸 3 +揖 3 +溺 3 +俨 3 +晁 3 +弈 3 +韬 3 +辙 3 +胖 3 +宦 3 +屑 3 +敕 3 +枫 3 +阙 3 +圻 3 +鄞 3 +犍 3 +灿 3 +琰 3 +鸾 3 +霜 3 +祕 3 +翘 3 +缮 3 +钓 3 +鲑 3 +炒 3 +螈 3 +舆 3 +晒 3 +烘 3 +陋 3 +昀 3 +萱 3 +暨 3 +澧 3 +缚 3 +篱 3 +瑶 3 +筱 3 +柚 3 +衅 3 +毋 3 +檀 3 +闍 3 +赉 3 +渗 3 +郓 3 +鳔 3 +葎 3 +橐 3 +迭 3 +殴 3 +缨 3 +溉 3 +陁 3 +钜 3 +莼 3 +邕 3 +乓 3 +襟 3 +瘿 3 +栈 3 +桫 3 +沦 3 +瘾 3 +跖 3 +砍 3 +吨 3 +熬 3 +漕 3 +僵 3 +穹 3 +蛳 3 +绶 3 +牦 3 +粪 3 +漪 3 +曰 3 +洱 3 +湛 3 +苴 3 +黧 3 +鲳 3 +颓 3 +蜈 3 +蚣 3 +铬 3 +匀 3 +饷 3 +胚 3 +珧 3 +螨 3 +匍 3 +罩 3 +蛏 3 +栉 3 +抒 3 +瞭 3 +脐 2 +辇 2 +疮 2 +萎 2 +咽 2 +厍 2 +峙 2 +赭 2 +馅 2 +熄 2 +璜 2 +捆 2 +贰 2 +驸 2 +鲻 2 +焕 2 +褒 2 +蔻 2 +黯 2 +铟 2 +噩 2 +渤 2 +侗 2 +蜴 2 +蜊 2 +莅 2 +砻 2 +傀 2 +儡 2 +鹑 2 +骆 2 +浇 2 +颐 2 +琢 2 +渣 2 +査 2 +莒 2 +̌ 2 +啰 2 +诽 2 +赂 2 +轸 2 +赟 2 +谶 2 +褚 2 +莽 2 +兮 2 +奘 2 +汴 2 +彧 2 +夭 2 +绰 2 +卞 2 +铳 2 +丟 2 +恼 2 +鹋 2 +鹩 2 +鄣 2 +镒 2 +脖 2 +鍊 2 +窥 2 +幂 2 +镛 2 +皋 2 +嘌 2 +呤 2 +禺 2 +栾 2 +潢 2 +笏 2 +潞 2 +珩 2 +呆 2 +滁 2 +稗 2 +瞬 2 +侏 2 +吟 2 +漩 2 +诟 2 +貂 2 +鼬 2 +涪 2 +渎 2 +绯 2 +楞 2 +沆 2 +蛟 2 +蠕 2 +壸 2 +砗 2 +磲 2 +寡 2 +锌 2 +叭 2 +琥 2 +讶 2 +孺 2 +祗 2 +佃 2 +柝 2 +蜻 2 +蜓 2 +菠 2 +缀 2 +鞅 2 +昙 2 +捧 2 +悚 2 +箴 2 +妊 2 +娠 2 +侬 2 +涟 2 +栓 2 +邳 2 +磡 2 +砷 2 +哨 2 +僖 2 +闾 2 +焉 2 +匕 2 +苷 2 +膦 2 +箕 2 +и 2 +̃ 2 +羞 2 +辱 2 +硼 2 +廪 2 +蛎 2 +惰 2 +酌 2 +拮 2 +缇 2 +蘑 2 +刈 2 +蛸 2 +悄 2 +鳌 2 +桧 2 +砸 2 +巳 2 +瑾 2 +抹 2 +喊 2 +杠 2 +瞻 2 +郦 2 +痒 2 +馨 2 +煜 2 +懈 2 +哮 2 +闫 2 +麒 2 +咀 2 +殃 2 +琨 2 +沾 2 +枋 2 +毓 2 +猕 2 +崛 2 +睿 2 +嵯 2 +颍 2 +邺 2 +瑀 2 +淇 2 +噪 2 +鏊 2 +诫 2 +锺 2 +灏 2 +烁 2 +悔 2 +汞 2 +淯 2 +鲱 2 +魄 2 +郝 2 +霆 2 +巾 2 +邹 2 +蓣 2 +榔 2 +壹 2 +秤 2 +閒 2 +荛 2 +夥 2 +谒 2 +掸 2 +莩 2 +糯 2 +皎 2 +坳 2 +喵 2 +弢 2 +凳 2 +啧 2 +琵 2 +匆 2 +阇 2 +猗 2 +腮 2 +淅 2 +肘 2 +疃 2 +蕲 2 +翎 2 +痉 2 +冥 2 +眷 2 +罹 2 +澡 2 +缸 2 +婷 2 +恬 2 +狙 2 +汲 2 +渴 2 +邬 2 +茉 2 +奕 2 +诡 2 +荽 2 +鲢 2 +芃 2 +肟 2 +谚 2 +赃 2 +洮 2 +脆 2 +诵 2 +陲 2 +窜 2 +绢 2 +炔 2 +蕗 2 +鸮 2 +吠 2 +粥 2 +掳 2 +琚 2 +荥 2 +忻 2 +闳 2 +摹 2 +峄 2 +湳 2 +婕 2 +妤 2 +庹 2 +汕 2 +泱 2 +佟 2 +氓 2 +苟 2 +珞 2 +婵 2 +硒 2 +钡 2 +憍 2 +椹 2 +鱿 2 +槲 2 +骠 2 +淖 2 +锰 2 +寥 2 +辜 2 +璇 2 +婉 2 +岚 2 +账 2 +哔 2 +锆 2 +曙 2 +涞 2 +铂 2 +绎 2 +杏 2 +愕 2 +姒 2 +铰 2 +亳 2 +荃 2 +衬 2 +楸 2 +琛 2 +楷 2 +柰 2 +栻 2 +骄 2 +铋 2 +辍 2 +遁 2 +靶 2 +璧 2 +桢 2 +薮 2 +狠 2 +镐 2 +诠 2 +渲 2 +赎 2 +雁 2 +夯 2 +膺 2 +苇 2 +茫 2 +衢 2 +堰 2 +冉 2 +昕 2 +瞧 2 +殆 2 +芎 2 +祉 2 +惘 2 +铆 2 +腈 2 +羰 2 +渚 2 +苁 2 +郯 2 +铵 2 +睹 2 +恕 2 +巽 2 +泸 2 +簪 2 +伎 2 +贻 2 +醋 2 +虻 2 +卤 2 +疍 2 +睫 2 +嗲 2 +鄱 2 +珑 2 +炕 2 +μ 2 +肱 2 +梳 2 +裘 2 +愁 2 +瞄 2 +洵 1 +盱 1 +眙 1 +箩 1 +筐 1 +翥 1 +蟑 1 +螂 1 +漾 1 +叩 1 +胭 1 +腻 1 +沌 1 +煎 1 +琊 1 +勍 1 +徬 1 +惶 1 +疡 1 +渌 1 +泠 1 +胍 1 +酪 1 +浔 1 +歆 1 +镥 1 +傍 1 +佚 1 +俳 1 +塾 1 +鹌 1 +掞 1 +酐 1 +馀 1 +奚 1 +苳 1 +竈 1 +瞎 1 +鳃 1 +霈 1 +榉 1 +唾 1 +俸 1 +柩 1 +荦 1 +ㄟ 1 +昱 1 +捩 1 +慑 1 +忱 1 +瞳 1 +椋 1 +纻 1 +隗 1 +晔 1 +鮎 1 +郴 1 +谠 1 +轶 1 +諲 1 +陂 1 +抖 1 +沔 1 +叟 1 +觚 1 +唛 1 +钌 1 +痢 1 +蹶 1 +偕 1 +喽 1 +骷 1 +髅 1 +膑 1 +蚺 1 +怜 1 +歹 1 +稔 1 +刮 1 +郫 1 +捣 1 +垢 1 +捏 1 +茧 1 +鹂 1 +踩 1 +娟 1 +哄 1 +虬 1 +栃 1 +钲 1 +焮 1 +邾 1 +泼 1 +泮 1 +骂 1 +焚 1 +黾 1 +圩 1 +驭 1 +掛 1 +胪 1 +艮 1 +濡 1 +盼 1 +巯 1 +骰 1 +偲 1 +嫣 1 +睢 1 +鳚 1 +繄 1 +荼 1 +聿 1 +朕 1 +辕 1 +珐 1 +绂 1 +筷 1 +洙 1 +圪 1 +扉 1 +桡 1 +鹛 1 +擂 1 +骁 1 +浉 1 +骥 1 +俣 1 +盂 1 +榖 1 +廆 1 +砾 1 +讚 1 +腱 1 +螟 1 +踊 1 +祋 1 +娣 1 +諡 1 +仄 1 +曜 1 +靳 1 +嘟 1 +妲 1 +矗 1 +検 1 +蒟 1 +泯 1 +瑮 1 +倘 1 +袓 1 +洣 1 +搏 1 +绮 1 +铼 1 +濠 1 +圜 1 +涝 1 +弁 1 +甾 1 +魇 1 +杓 1 +屉 1 +揩 1 +坯 1 +娆 1 +侮 1 +罔 1 +躯 1 +坜 1 +缜 1 +竣 1 +蟥 1 +搔 1 +朽 1 +姨 1 +炊 1 +岔 1 +暝 1 +舫 1 +咆 1 +挠 1 +艷 1 +熠 1 +鋆 1 +茁 1 +拧 1 +瓯 1 +闰 1 +镍 1 +镉 1 +崧 1 +酃 1 +佬 1 +濮 1 +桝 1 +儋 1 +纮 1 +谯 1 +煲 1 +呔 1 +膻 1 +筮 1 +帷 1 +幔 1 +緁 1 +県 1 +儆 1 +庾 1 +荁 1 +菉 1 +汜 1 +傕 1 +膈 1 +胯 1 +鄙 1 +蝌 1 +蚪 1 +鸲 1 +跻 1 +玠 1 +蝾 1 +龚 1 +镎 1 +阖 1 +乖 1 +佤 1 +萃 1 +扼 1 +佼 1 +鸵 1 +笆 1 +呐 1 +峥 1 +帚 1 +埕 1 +蘸 1 +诙 1 +孖 1 +琐 1 +螣 1 +傩 1 +殒 1 +舵 1 +阕 1 +轲 1 +汹 1 +埈 1 +讫 1 +峒 1 +沐 1 +籴 1 +琶 1 +彪 1 +坨 1 +鼷 1 +睁 1 +攸 1 +圹 1 +诹 1 +譬 1 +栀 1 +娥 1 +畋 1 +仞 1 +喹 1 +啉 1 +蹴 1 +憨 1 +俵 1 +髎 1 +铲 1 +焱 1 +皓 1 +鋐 1 +枞 1 +咪 1 +苓 1 +贮 1 +忤 1 +羯 1 +拇 1 +褓 1 +籁 1 +飒 1 +缙 1 +専 1 +诬 1 +泓 1 +哼 1 +雑 1 +簿 1 +熈 1 +讹 1 +浚 1 +绞 1 +攒 1 +俐 1 +韭 1 +蔑 1 +侪 1 +禑 1 +淼 1 +缵 1 +湜 1 +耽 1 +苛 1 +侂 1 +勺 1 +峯 1 +薨 1 +耍 1 +祜 1 +铽 1 +唆 1 +砀 1 +嚓 1 +屹 1 +悫 1 +砵 1 +糠 1 +獴 1 +膛 1 +鲎 1 +隈 1 +钊 1 +脲 1 +呋 1 +喃 1 +玹 1 +厩 1 +溅 1 +犷 1 +灞 1 +杵 1 +镂 1 +禛 1 +扳 1 +枥 1 +挫 1 +呵 1 +込 1 +暎 1 +懔 1 +遑 1 +偃 1 +璹 1 +蠹 1 +佘 1 +妡 1 +紑 1 +暄 1 +喧 1 +叅 1 +螃 1 +惺 1 +蛊 1 +槛 1 +刍 1 +钿 1 +褧 1 +镓 1 +镠 1 +陟 1 +璆 1 +牕 1 +帖 1 +寀 1 +綦 1 +奣 1 +嵗 1 +谌 1 +嫖 1 +挚 1 +钴 1 +赘 1 +荪 1 +怨 1 +惪 1 +鹪 1 +璎 1 +藳 1 +暮 1 +潇 1 +薖 1 +沤 1 +伛 1 +娇 1 +頴 1 +旌 1 +オ 1 +カ 1 +ヤ 1 +ド 1 +舘 1 +栢 1 +耻 1 +嵋 1 +哑 1 +冼 1 +瘰 1 +骘 1 +鼢 1 +狯 1 +霹 1 +雳 1 +萁 1 +芗 1 +峪 1 +臻 1 +赡 1 +攥 1 +泷 1 +呻 1 +瘫 1 +痪 1 +蒺 1 +笮 1 +钹 1 +嫉 1 +榙 1 +骈 1 +鸩 1 +馯 1 +葳 1 +玶 1 +綖 1 +崁 1 +蔵 1 +罘 1 +钪 1 +镗 1 +柞 1 +瓛 1 +裬 1 +蛲 1 +擒 1 +倚 1 +鼐 1 +餵 1 +蠋 1 +嘻 1 +颞 1 +冤 1 +瓮 1 +岫 1 +醮 1 +鲀 1 +澥 1 +婺 1 +楮 1 +倻 1 +酢 1 +苡 1 +阈 1 +硖 1 +硎 1 +倩 1 +腩 1 +寝 1 +撇 1 +宵 1 +撷 1 +疹 1 +徘 1 +徊 1 +薷 1 +斟 1 +斝 1 +邰 1 +尨 1 +槱 1 +沥 1 +锑 1 +尻 1 +皕 1 +饯 1 +搅 1 +拌 1 +钨 1 +铯 1 +唁 1 +轭 1 +萘 1 +稼 1 +霄 1 +嬤 1 +俅 1 +芰 1 +貘 1 +墀 1 +杙 1 +栎 1 +桄 1 +蓍 1 +砝 1 +傉 1 +幺 1 +恍 1 +掰 1 +锶 1 +埗 1 +禀 1 +蓑 1 +蝮 1 +湫 1 +熏 1 +渺 1 +锣 1 +阉 1 +枭 1 +昵 1 +绫 1 +樾 1 +轧 1 +鬃 1 +捍 1 +嵖 1 +岈 1 +绚 1 +鸯 1 +謇 1 +昴 1 +钗 1 +恽 1 +玭 1 +偈 1 +闼 1 +蛉 1 +槚 1 +脾 1 +畲 1 +瓘 1 +涓 1 +钫 1 +浸 1 +胛 1 +烺 1 +篦 1 +珰 1 +剃 1 +嵴 1 +斡 1 +鎏 1 +芷 1 +霓 1 +氖 1 +馔 1 +蟆 1 +驷 1 +阏 1 +愔 1 +淤 1 +眩 1 +爹 1 +铮 1 +剀 1 +濯 1 +炫 1 +塬 1 +顽 1 +砥 1 +樵 1 +倦 1 +郤 1 +玎 1 +沭 1 +琬 1 +嬴 1 +猄 1 +宥 1 +拢 1 +氦 1 +姮 1 +挞 1 +胳 1 +膊 1 +疤 1 +鹬 1 +蟋 1 +蟀 1 +嶷 1 +盏 1 +埇 1 +烙 1 +莞 1 +秧 1 +捡 1 +匮 1 +妍 1 +樨 1 +妒 1 +浞 1 +臼 1 +奂 1 +靡 1 +蜑 1 +臯 1 +谪 1 +黜 1 +痼 1 +懦 1 +棁 1 +笄 1 +牂 1 +牁 1 +褪 1 +簸 1 +湟 1 +佯 1 +瘠 1 +跪 1 +躺 1 +隘 1 +涿 1 +屐 1 +藔 1 +匾 1 +衫 1 +洐 1 +龛 1 +鳟 1 +膏 1 +饴 1 +杻 1 +醯 1 +晦 1 +鞣 1 +荧 1 +嚣 1 +炜 1 +氙 1 +榭 1 +怵 1 +猝 1 +伫 1 +翃 1 +婢 1 +胥 1 +壽 1 +珈 1 +淞 1 +铣 1 +胼 1 +痘 1 +噌 1 +馥 1 +箨 1 +钼 1 +羡 1 +棨 1 +圃 1 +趴 1 +揆 1 +饵 1 +彤 1 +飙 1 +悌 1 +揉 1 +闱 1 +撮 1 +戛 1 +秽 1 +麂 1 +孛 1 +䴕 1 +濉 1 +袒 1 +瑭 1 +愧 1 diff --git a/mms-1b-all/cym/lexicon.txt b/mms-1b-all/cym/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..d6b3f6cd1d700c0d4cb65605704fbfe0af957995 --- /dev/null +++ b/mms-1b-all/cym/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/cym/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/cym/tokens.txt b/mms-1b-all/cym/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3f7177b2745d5d0717b4c4729427814fec76875 --- /dev/null +++ b/mms-1b-all/cym/tokens.txt @@ -0,0 +1,79 @@ +| 283373 +a 108040 +d 133852 +n 94923 +e 88233 +y 108946 +i 82967 +r 74069 +o 61519 +l 46299 +w 59650 +h 55187 +g 36279 +f 37847 +t 35683 +s 27999 +u 30526 +c 31354 +m 23422 +b 20198 +' 16052 +p 9059 +0 847 +â 2904 +1 627 +k 45 +ô 971 +2 385 +- 318 +9 279 +j 628 +5 235 +v 77 +4 221 +3 204 +6 196 +8 187 +. 167 +ŵ 442 +7 131 +, 102 +z 6 +ï 246 +ê 106 +x 40 +î 2 +á 204 +q 1 +: 40 +ŷ 490 +/ 31 +$ 18 +é 7 +ü 13 +% 11 +; 8 +í 6 +¥ 6 +õ 6 +ë 8 +ö 4 +ú 4 ++ 4 +" 4 +ç 3 +£ 3 +ò 1 +ã 3 +ì 2 +ä 2 +ó 4 +̇ 2 +° 2 +[ 2 +] 2 +û 76 +à 4 +– 1 +— 210 diff --git a/mms-1b-all/dan/lexicon.txt b/mms-1b-all/dan/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..42fcbca9f288bf21834148f79b525fe76f12b699 --- /dev/null +++ b/mms-1b-all/dan/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/dan/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/dan/tokens.txt b/mms-1b-all/dan/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..54d56e44729554a78bc8467cc5bbb94dd0a5c221 --- /dev/null +++ b/mms-1b-all/dan/tokens.txt @@ -0,0 +1,71 @@ +| 14954 +e 11605 +r 5646 +n 5315 +t 4655 +a 3757 +d 4868 +i 3895 +s 4023 +l 3568 +o 3060 +g 3438 +m 2187 +k 2497 +f 1397 +v 1971 +u 1399 +b 959 +p 861 +h 1496 +å 843 +æ 779 +y 396 +ø 652 +j 667 +c 141 +0 510 +1 408 +- 18 +. 282 +w 2 +2 247 +9 163 +5 147 +4 139 +3 136 +z 7 +6 119 +8 117 +7 83 +x 3 +' 4 +/ 41 +é 4 +q 33 +, 32 +: 31 +$ 12 +​ 10 +% 10 +í 7 +ü 6 +; 6 +" 5 +õ 4 +– 3 +— 9 +° 3 +ö 3 +¥ 3 +& 3 +” 2 +ç 2 +ã 2 +[ 2 +] 2 +á 1 +ł 1 +² 1 ++ 1 +ó 1 diff --git a/mms-1b-all/deu/lexicon.txt b/mms-1b-all/deu/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..43255eb74957be693f1974d314d63f447bf8dfc9 --- /dev/null +++ b/mms-1b-all/deu/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/deu/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/deu/tokens.txt b/mms-1b-all/deu/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..e78f1905975981c9ff99ea39db796a66491bab3b --- /dev/null +++ b/mms-1b-all/deu/tokens.txt @@ -0,0 +1,190 @@ +| 159217 +e 131270 +n 80138 +i 59280 +r 54730 +s 52175 +t 49879 +a 41708 +d 40800 +h 42513 +u 32710 +l 26384 +g 23320 +o 18665 +c 23688 +m 20035 +b 16207 +f 10936 +w 14343 +k 7364 +z 7971 +v 6891 +p 3622 +ü 5101 +ä 2938 +ö 2481 +j 3471 +0 30 +- 16 +y 212 +ß 1273 +1 7 +. 478 +2 3 +x 56 +" 245 +9 196 +5 5 +3 6 +4 11 +6 6 +8 1 +q 50 +7 92 +: 45 +/ 40 +, 29 +' 1530 +á 239 +% 11 +° 8 +í 187 +é 57 +& 5 ++ 5 +; 5 +ç 6 +‍ 4 +[ 3 +] 3 +² 3 +¥ 3 +æ 13 +ú 26 +ā 26 +ã 30 +! 2 +õ 2 +ł 53 +‌ 2 +” 2 +$ 1 +ó 150 +– 257 +ō 140 +š 91 +č 56 +ř 48 +ć 42 +â 39 +ø 37 +ž 35 +ı 33 +ş 31 +ô 31 +ý 31 +ñ 28 +ū 28 +ș 26 +́ 24 +ʿ 24 +ë 5 +ě 20 +ś 18 +œ 18 +ț 16 +ī 15 +ə 15 +ʻ 14 +đ 13 +— 12 +ń 12 +′ 12 +ê 11 +å 11 +ă 10 +ð 9 +î 9 +ą 8 +ň 8 +ğ 7 +̇ 7 +ė 7 +ï 6 +ò 5 +ż 5 +ḫ 5 +ź 4 +ů 4 +ő 4 +ę 3 +о 3 +` 3 +ơ 3 +а 3 +и 3 +à 3 +‐ 3 +ì 3 +ġ 3 +ď 2 +ť 2 +ế 2 +с 2 +е 2 +р 2 +ф 2 +м 2 +в 2 +ш 2 +û 2 +μ 2 +ṣ 2 +ả 2 +ạ 2 +辶 1 +ѹ 1 +無 1 +ǐ 1 +ན 1 +་ 1 +カ 1 +臣 1 +比 1 +支 1 +ч 1 +к 1 +ņ 1 +合 1 +̆ 1 +ʾ 1 +ŏ 1 +孙 1 +道 1 +临 1 +尣 1 +ħ 1 +ụ 1 +ắ 1 +黃 1 +城 1 +关 1 +镇 1 +è 1 +̥ 1 +毛 1 +泽 1 +东 1 +§ 1 +þ 1 +ṟ 1 +ē 1 +⟨ 1 +⟩ 1 +ù 1 +≡ 1 +ṭ 1 +ộ 1 +ễ 1 +ằ 1 diff --git a/mms-1b-all/ell/lexicon.txt b/mms-1b-all/ell/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..1bec959174717dd4a80e4d6bf931691680c191b4 --- /dev/null +++ b/mms-1b-all/ell/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ell/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ell/tokens.txt b/mms-1b-all/ell/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..8233d601929a727bc6e7a28601291c52e6f67c73 --- /dev/null +++ b/mms-1b-all/ell/tokens.txt @@ -0,0 +1,98 @@ +| 132013 +α 67615 +τ 45556 +ο 51934 +ι 38449 +ε 45522 +ν 40678 +σ 27827 +ρ 21417 +π 23770 +κ 20204 +η 14736 +ς 22501 +μ 18760 +υ 20772 +λ 15212 +ί 16363 +ό 15381 +ά 13608 +έ 12870 +γ 9887 +δ 10341 +ή 7902 +ω 7303 +χ 5131 +ύ 10316 +θ 9429 +φ 4505 +ώ 4512 +β 3171 +ξ 2010 +ζ 2110 +0 175 +a 1136 +1 43 +ψ 785 +e 994 +o 350 +r 436 +i 1388 +n 222 +2 29 +s 352 +. 327 +t 464 +« 280 +» 274 +c 269 +9 6 +l 238 +5 12 +m 451 +h 171 +4 47 +3 8 +6 8 +d 176 +8 2 +u 166 +p 57 +ϊ 29 +g 144 +b 157 +k 3080 +- 7 +7 7 +y 217 +v 84 +w 72 +: 64 +, 64 +f 60 +' 25 +/ 51 +z 35 +j 23 +x 601 +% 20 +ΐ 34 +q 18 +΄ 12 +ϋ 4 +· 9 +² 8 +[ 6 +õ 6 +] 6 +í 5 +! 4 +; 4 +– 2 +é 2 +& 2 +° 2 +$ 2 ++ 1 +́ 3 +â 1 diff --git a/mms-1b-all/eng/lexicon.txt b/mms-1b-all/eng/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..a6b735b73a217a7055c67e9bcdf563169e38fca1 --- /dev/null +++ b/mms-1b-all/eng/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/eng/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/eng/tokens.txt b/mms-1b-all/eng/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..724c0575e94910ee3bffa079f81f847cf5195d53 --- /dev/null +++ b/mms-1b-all/eng/tokens.txt @@ -0,0 +1,150 @@ +| 715078 +e 377441 +t 272576 +a 226897 +o 238560 +i 187349 +n 195266 +s 182431 +r 161181 +h 223160 +l 107967 +d 133527 +c 58150 +u 85202 +m 72055 +f 69590 +p 44051 +g 58128 +y 63065 +w 75423 +b 45594 +v 31157 +k 18042 +0 106 +x 1852 +j 9196 +1 39 +' 2091 +- 380 +z 1301 +q 880 +2 32 +9 204 +. 173 +5 2 +4 22 +6 11 +3 2 +8 134 +7 97 +, 76 +/ 35 +: 35 +% 16 +$ 13 +á 87 +í 67 +— 59 +; 6 +¥ 6 +ü 181 +é 316 ++ 5 +& 4 +° 4 +[ 4 +] 4 +– 692 +" 4 +” 4 +ç 44 +’ 3 +ã 12 +ó 71 +£ 2 +õ 1 +ú 17 +​ 2 +‘ 1 +ł 10 +ö 100 +! 1 +² 1 +̇ 1 +ä 113 +è 67 +ō 37 +â 35 +ß 30 +ñ 23 +à 22 +ï 16 +ô 15 +ë 11 +ê 11 +č 11 +š 10 +ø 8 +` 8 +́ 10 +ć 7 +ž 6 +œ 6 +î 6 +ð 5 +û 5 +ā 5 +ū 5 +ă 5 +ı 4 +ș 4 +ò 3 +α 3 +ī 3 +ř 2 +κ 2 +æ 2 +ạ 2 +ý 2 +и 2 +к 2 +ʻ 2 +ş 2 +π 2 +ń 2 +ę 1 +ő 1 +ṃ 1 +ụ 1 +å 1 +в 1 +е 1 +л 1 +й 1 +н 1 +я 1 +з 1 +ь 1 +þ 1 +时 1 +尚 1 +先 1 +生 1 +ň 1 +ə 1 +§ 1 +ě 1 +χ 1 +≡ 1 +ē 1 +а 1 +ả 1 +ị 1 +נ 1 +ע 1 +京 1 +都 1 +大 1 +阪 1 +ğ 1 diff --git a/mms-1b-all/est/lexicon.txt b/mms-1b-all/est/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..d29665ba608197761d50869a34ad5b1300692098 --- /dev/null +++ b/mms-1b-all/est/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/est/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/est/tokens.txt b/mms-1b-all/est/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..63a7a20949de29cdd041d46d228d66a050841149 --- /dev/null +++ b/mms-1b-all/est/tokens.txt @@ -0,0 +1,70 @@ +| 40392 +a 30962 +e 28079 +i 24872 +s 22127 +t 18906 +l 15513 +u 14832 +n 12552 +k 12253 +d 10238 +o 9962 +m 9419 +r 7726 +v 6198 +p 4616 +g 4878 +h 4156 +j 4286 +ä 3469 +õ 2989 +b 2300 +ü 1737 +ö 897 +f 404 +0 570 +1 17 +c 210 +- 419 +2 258 +y 89 +9 200 +" 199 +w 59 +5 160 +. 157 +4 154 +3 143 +8 136 +6 132 +' 42 +7 87 +z 22 +x 21 +š 54 +/ 39 +ž 19 +, 34 +– 29 +q 5 +: 19 +% 11 +; 10 +’ 7 +! 6 +” 5 +— 4 ++ 4 +ç 3 +ł 3 +é 3 +¥ 3 +[ 2 +] 2 +² 2 +° 2 +í 2 +ã 2 +ú 1 +ô 1 diff --git a/mms-1b-all/fas/lexicon.txt b/mms-1b-all/fas/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..4afe32547b9383135bf410a23c5f6a8332b1a877 --- /dev/null +++ b/mms-1b-all/fas/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/fas/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/fas/tokens.txt b/mms-1b-all/fas/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4c216d5649e2cfc7ff8abfe6d169bb91be36e35 --- /dev/null +++ b/mms-1b-all/fas/tokens.txt @@ -0,0 +1,114 @@ +| 285356 +ا 145657 +ی 93122 +ر 78013 +د 90966 +ن 78737 +ه 64212 +و 65098 +ت 40294 +م 60458 +ب 47120 +س 32463 +ک 18745 +ل 15851 +ش 26504 +ز 20254 +‌ 5255 +گ 14180 +ف 12247 +ق 9028 +خ 19780 +ع 11076 +ج 7140 +ح 7897 +پ 8853 +آ 12676 +ط 3975 +ص 3183 +چ 5510 +غ 1451 +ض 1709 +۰ 518 +ظ 1129 +ذ 1160 +ث 907 +۱ 386 +ً 355 +ئ 711 +ژ 133 +۲ 258 +، 239 +0 196 +۹ 163 +۵ 140 +1 122 +۳ 120 +أ 228 +۴ 104 +۶ 103 +۸ 95 +۷ 81 +4 77 +. 74 +a 4 +5 70 +- 70 +2 61 +3 57 +« 51 +» 51 +8 51 +e 3 +9 46 +6 45 +s 1 +i 6 +r 3 +ء 78 +o 2 +, 35 +n 2 +: 34 +l 5 +d 1 +t 7 +/ 24 +ِ 24 +h 3 +ؤ 129 +7 19 +p 19 +m 18 +b 2 +y 15 +u 2 +k 14 +g 1 +c 3 +v 11 +w 10 +x 2 +ّ 8 +؛ 7 +% 6 +َ 5 +f 5 +٪ 5 +q 4 +‎ 4 +õ 4 +j 3 ++ 3 +ُ 3 +ۀ 1487 +& 2 +z 2 +ي 2 +ك 10892 +ى 25 +ٔ 1651 +ة 4 +– 1 +ے 1 +' 456 diff --git a/mms-1b-all/fin/lexicon.txt b/mms-1b-all/fin/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..2edc6fe026b3e297e2ff1a6cfa9ae76e1765d8b4 --- /dev/null +++ b/mms-1b-all/fin/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/fin/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/fin/tokens.txt b/mms-1b-all/fin/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f8600311e6ce0afc1b730d8ff3cc342f824d4a8 --- /dev/null +++ b/mms-1b-all/fin/tokens.txt @@ -0,0 +1,67 @@ +| 107927 +a 78912 +i 65411 +t 60794 +n 63683 +e 60951 +s 48002 +l 37107 +o 29344 +k 34494 +u 30838 +ä 37334 +m 22813 +r 10901 +v 14546 +j 17463 +p 10018 +h 16606 +y 7997 +d 5966 +ö 2138 +g 710 +0 639 +b 411 +- 198 +c 46 +1 440 +f 434 +2 295 +9 194 +w 5 +5 165 +4 145 +6 144 +. 144 +3 142 +8 139 +7 91 +z 3 +: 69 +x 2 +q 11 +‎ 33 +, 32 +– 3 +' 241 +/ 17 +š 14 +ž 12 +ü 6 +× 6 +ł 5 +ç 5 +õ 4 +ú 4 +é 3 +" 3 +[ 3 +] 3 +í 3 +ã 3 +̇ 2 +% 2 +ë 2 ++ 2 +á 2 +ó 2 diff --git a/mms-1b-all/fra/lexicon.txt b/mms-1b-all/fra/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..373a52bd19f3fb39b5f966499736b5f4168f47e2 --- /dev/null +++ b/mms-1b-all/fra/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/fra/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/fra/tokens.txt b/mms-1b-all/fra/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..44bd943367878086dac608f47d6d62a18a5513f3 --- /dev/null +++ b/mms-1b-all/fra/tokens.txt @@ -0,0 +1,310 @@ +| 779611 +e 511720 +s 313151 +a 234502 +n 232849 +t 228011 +i 241272 +r 214878 +u 239417 +l 181838 +o 192066 +d 122108 +c 104346 +p 98231 +é 54650 +m 91185 +v 71001 +g 27141 +f 36337 +' 42200 +q 42528 +b 21415 +h 28401 +à 17744 +x 13682 +è 13132 +y 7338 +j 27249 +0 631 +z 11081 +k 311 +ê 8277 +- 9876 +1 174 +2 120 +’ 324 +« 285 +» 285 +w 17706 +9 26 +ô 1699 +5 64 +4 128 +3 30 +. 184 +6 23 +8 15 +ç 1711 +î 2526 +7 21 +â 1138 +ù 1012 +û 778 +: 53 +ï 943 +" 51 +/ 44 +, 37 +œ 1815 +% 24 +; 21 +! 12 +° 11 +ü 69 +í 906 +á 1274 +– 292 +õ 19 +? 5 +ë 313 +[ 4 +] 4 +— 874 +ú 197 ++ 3 +ã 131 +$ 3 +ł 261 +ó 783 +² 2 +ö 904 +æ 119 +ÿ 30 +ō 585 +ä 459 +š 335 +ć 309 +č 256 +ñ 224 +ū 187 +ø 163 +ā 142 +ń 123 +ž 118 +ă 117 +å 107 +ș 94 +ß 89 +́ 74 +ş 73 +ı 69 +ř 65 +ț 62 +ý 57 +ð 45 +ś 45 +ż 42 +ę 42 +đ 42 +ī 42 +ò 41 +ą 37 +ě 33 +ő 27 +ė 25 +` 25 +ğ 24 +̇ 20 +α 20 +β 20 +ē 19 +ʻ 17 +ź 17 +ì 16 +ʿ 12 +ṣ 12 +ņ 12 +þ 11 +ʼ 10 +ŏ 9 +σ 8 +π 8 +ả 8 +ň 8 +ư 7 +ʾ 7 +ţ 7 +ː 6 +ễ 6 +γ 6 +ť 6 +ơ 5 +ĩ 5 +ệ 5 +ω 5 +δ 5 +κ 5 +μ 4 +ů 4 +λ 4 +ν 4 +ṭ 4 +ạ 4 +ļ 4 +ḥ 4 +∆ 4 +ľ 4 +ầ 3 +ṇ 3 +ộ 3 +ǃ 3 +ε 3 +ј 3 +ə 3 +± 3 +ل 3 +ا 3 +г 3 +↔ 3 +ǔ 2 +ồ 2 +ề 2 +χ 2 +ờ 2 +ǹ 2 +ρ 2 +е 2 +а 2 +≥ 2 +′ 2 +® 2 +ũ 2 +ṯ 2 +ď 2 +τ 2 +ʉ 2 +ο 2 +ċ 2 +і 2 +ψ 2 +ử 2 +ي 2 +φ 2 +ن 2 +ደ 2 +̱ 2 +ị 2 +星 2 +р 2 +ǎ 2 +の 1 +ひ 1 +ổ 1 +ӌ 1 +ቀ 1 +ű 1 +к 1 +牡 1 +丹 1 +ŵ 1 +ҫ 1 +ậ 1 +я 1 +ḍ 1 +ṅ 1 +ķ 1 +ǫ 1 +ŭ 1 +ế 1 +ɨ 1 +ủ 1 +ǀ 1 +и 1 +з 1 +м 1 +н 1 +め 1 +や 1 +џ 1 +ц 1 +ч 1 +∞ 1 +ẵ 1 +ⱎ 1 +ⱅ 1 +ứ 1 +§ 1 +⋅ 1 +ĺ 1 +☉ 1 +宇 1 +津 1 +保 1 +⊨ 1 +υ 1 +п 1 +ι 1 +ό 1 +ς 1 +西 1 +甌 1 +ከ 1 +厳 1 +三 1 +̐ 1 +い 1 +ố 1 +م 1 +ة 1 +ب 1 +ر 1 +و 1 +د 1 +ħ 1 +∅ 1 +ẓ 1 +∼ 1 +θ 1 +む 1 +も 1 +ጀ 1 +̠ 1 +ζ 1 +գ 1 +北 1 +京 1 +美 1 +术 1 +馆 1 +た 1 +つ 1 +ợ 1 +文 1 +へ 1 +ま 1 +ġ 1 +η 1 +ʽ 1 +э 1 +ớ 1 +ắ 1 +ụ 1 +ỳ 1 +杜 1 +乃 1 +扬 1 +ų 1 +∈ 1 +զ 1 +青 1 +貴 1 +う 1 +ゔ 1 +ꝑ 1 +₽ 1 +ወ 1 +‐ 1 +∨ 1 +̲ 1 +ɑ 1 diff --git a/mms-1b-all/ful/lexicon.txt b/mms-1b-all/ful/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..9a06399dfbc7967e4fb008aa624206db64d71ae6 --- /dev/null +++ b/mms-1b-all/ful/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ful/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ful/tokens.txt b/mms-1b-all/ful/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f9810ad3ba6b7af91b03fe4832f07c2cfcfc08c --- /dev/null +++ b/mms-1b-all/ful/tokens.txt @@ -0,0 +1,81 @@ +| 1053558 +a 768845 +e 457100 +i 460504 +o 524339 +n 456889 +d 170262 +u 238410 +m 276010 +t 132454 +b 65658 +r 139929 +l 169588 +h 80970 +k 166380 +j 64655 +g 99848 +w 117919 +s 73230 +y 105588 +f 46579 +ɗ 160919 +ɓ 125560 +p 17579 +c 20297 +v 8204 +0 191 +’ 712 +1 37 +. 532 +z 130 +' 46636 +2 34 +- 5826 +9 4 +5 13 +4 38 +3 12 +6 7 +8 3 +, 164 +q 336 +7 6 +” 78 +x 1 +: 68 +‘ 40 +; 40 +` 33 +/ 29 +? 20 +$ 19 +% 11 +– 12 +! 9 +ü 9 +é 5 +— 4919 +õ 4 +á 4 +× 4 +ã 3 +[ 3 +] 3 +ı 3 ++ 3 +í 3 +& 3 +£ 3 +ç 3 +ş 3 +¥ 3 +è 2 +ŋ 1946 +ó 2 +ƴ 12323 +° 2 +_ 2 +ú 2 +ɲ 2128 +‐ 70 diff --git a/mms-1b-all/gle/lexicon.txt b/mms-1b-all/gle/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..6f017de19a1b12de91c88d04dcd7b643464aeb6c --- /dev/null +++ b/mms-1b-all/gle/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/gle/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/gle/tokens.txt b/mms-1b-all/gle/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0dc3c4fca916975288cf0981bed5380f4504ddb --- /dev/null +++ b/mms-1b-all/gle/tokens.txt @@ -0,0 +1,74 @@ +| 4007 +a 2801 +i 1608 +n 1367 +h 1438 +r 1006 +t 796 +e 840 +s 829 +c 732 +o 747 +l 734 +d 566 +g 588 +u 449 +m 533 +á 434 +í 358 +b 550 +é 293 +f 243 +ú 134 +ó 151 +p 95 +- 51 +0 591 +1 448 +' 17 +k 1 +v 4 +2 287 +y 5 +9 184 +w 2 +. 178 +5 152 +j 2 +6 141 +4 137 +8 127 +3 121 +7 88 +z 76 +x 70 +​ 66 +, 56 +/ 48 +: 42 +q 36 +$ 12 +ü 10 +’ 8 +; 8 +% 7 +— 7 +" 5 +ç 5 +õ 4 ++ 4 +& 4 +ö 3 +” 3 +£ 3 +‘ 3 +° 3 +– 3 +¥ 3 +̇ 2 +ã 2 +! 2 +² 1 +[ 1 +] 1 +ł 1 diff --git a/mms-1b-all/glg/lexicon.txt b/mms-1b-all/glg/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..f350d3a7e8f20432379c23948b9b4eef3fcc70a9 --- /dev/null +++ b/mms-1b-all/glg/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/glg/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/glg/tokens.txt b/mms-1b-all/glg/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3351149e000bd212e6f305b249af69e78c76ae9 --- /dev/null +++ b/mms-1b-all/glg/tokens.txt @@ -0,0 +1,72 @@ +| 25658 +a 16122 +e 14834 +o 13238 +s 9715 +n 9315 +r 8612 +i 8457 +d 6413 +t 6327 +c 5875 +u 4672 +l 4165 +m 3896 +p 3476 +b 1597 +v 1465 +g 1350 +f 1333 +x 1043 +q 811 +h 847 +ó 925 +í 882 +á 870 +é 828 +z 504 +ú 362 +ñ 402 +0 500 +1 362 +k 6 +2 238 +y 6 +w 3 +« 156 +» 152 +9 145 +5 133 +4 124 +3 121 +j 3 +8 111 +6 105 +. 102 +7 68 +: 56 +- 6 +/ 28 +, 27 +' 2 +% 22 +ü 2 +; 16 +º 12 +! 5 +° 5 +& 4 +ª 3 +ç 3 +¥ 3 +? 2 +" 2 +– 9 +ł 2 ++ 2 +ã 1 +ö 1 +[ 1 +] 1 +² 1 +— 19 diff --git a/mms-1b-all/guj/lexicon.txt b/mms-1b-all/guj/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..e63793d0f1ddebde72189f50b67cbdf8884d23a4 --- /dev/null +++ b/mms-1b-all/guj/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/guj/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/guj/tokens.txt b/mms-1b-all/guj/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..1074b386fb9b9449761959b8caf00c993fc5af45 --- /dev/null +++ b/mms-1b-all/guj/tokens.txt @@ -0,0 +1,120 @@ +| 128150 +ા 57941 +ે 53681 +ર 31768 +ન 32329 +્ 25417 +ક 21291 +ી 21790 +ત 32763 +મ 23524 +વ 17406 +ો 18805 +સ 14274 +ં 20817 +ય 13943 +પ 17675 +િ 9067 +લ 6771 +ુ 14440 +જ 10323 +હ 10368 +ટ 3053 +થ 9681 +છ 7412 +અ 5431 +શ 7720 +ગ 4010 +બ 4455 +દ 6233 +ણ 7700 +આ 5712 +ડ 2160 +એ 5360 +ધ 3335 +ઓ 5659 +ચ 2481 +ૂ 2634 +ફ 1083 +ળ 2778 +ખ 3576 +ભ 3088 +ષ 1962 +ઈ 4877 +ઇ 437 +ઉ 1315 +0 762 +" 744 +ઝ 257 +ઘ 992 +1 573 +. 483 +ૃ 699 +2 365 +- 15 +9 251 +ઠ 700 +ૌ 64 +ૈ 189 +ૉ 219 +5 215 +, 208 +3 208 +4 199 +ૅ 193 +8 174 +6 174 +7 123 +ઢ 243 +ઊ 497 +ઑ 98 +ઞ 333 +: 57 +i 52 +a 45 +s 45 +p 44 +/ 41 +' 550 +n 37 +e 34 +o 32 +c 31 +ઍ 26 +m 25 +t 25 +ઔ 23 +d 23 +ઃ 130 +g 21 +r 21 +u 20 +b 18 +; 15 +v 15 +f 13 +ઋ 8 +$ 12 +x 12 +ઐ 2 +w 11 +h 10 +k 7 +% 7 +l 7 +y 6 +! 6 ++ 5 +[ 3 +] 3 +j 3 +¥ 3 +° 2 +z 2 +q 2 +& 2 +઼ 2 +૮ 2 +૪ 2 +૬ 2 +õ 2 diff --git a/mms-1b-all/hau/lexicon.txt b/mms-1b-all/hau/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..f0f76df2197f15d4eab9bd6698fb776c30f747b4 --- /dev/null +++ b/mms-1b-all/hau/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/hau/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/hau/tokens.txt b/mms-1b-all/hau/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ade827d5ef618b04fe0b9f91fa38a9de9a7c435 --- /dev/null +++ b/mms-1b-all/hau/tokens.txt @@ -0,0 +1,80 @@ +a 501625 +| 468882 +n 158466 +i 153679 +k 109232 +u 131202 +r 62705 +s 90952 +d 70888 +y 71421 +m 80827 +t 43958 +e 50621 +w 44242 +o 31212 +b 49339 +h 39114 +g 29002 +l 30986 +c 22641 +f 14587 +z 17982 +j 11483 +ƙ 10464 +' 3158 +ɗ 12547 +0 760 +p 61 +1 589 +- 460 +2 370 +v 21 +9 1 +ɓ 2581 +5 224 +4 202 +3 198 +8 178 +6 6 +. 158 +7 2 +, 93 +x 4 +’ 68 +/ 53 +q 3 +: 51 +” 30 +$ 18 +ü 13 +; 12 +ƴ 17 +% 11 +‘ 10 +á 8 +í 1 +õ 6 +° 5 +[ 5 +] 5 ++ 4 +< 4 +> 4 +ç 3 +¥ 3 +é 3 +ã 3 +£ 3 +² 2 +ó 2 +! 2 +ú 2 +& 2 +ā 2104 +ö 1 +ʻ 9 +— 10 +ˈ 5770 +ă 2449 +ū 133 diff --git a/mms-1b-all/heb/lexicon.txt b/mms-1b-all/heb/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..3ad314801ff21d51fdce90bff183bda5b078494a --- /dev/null +++ b/mms-1b-all/heb/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/heb/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/heb/tokens.txt b/mms-1b-all/heb/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3e1c692afc113ddad9cae1ad13d2c135313bae5 --- /dev/null +++ b/mms-1b-all/heb/tokens.txt @@ -0,0 +1,86 @@ +| 178590 +י 70684 +ו 77246 +ה 61333 +ל 53178 +ר 34064 +ת 36763 +מ 33943 +ב 34892 +א 61945 +ש 36729 +ם 34431 +נ 25399 +ע 22838 +ד 20085 +ק 9648 +ח 16695 +כ 23175 +פ 8814 +ס 6628 +ג 7043 +ט 4381 +צ 6053 +ן 9521 +ז 5422 +ך 5010 +0 795 +- 102 +1 579 +ף 1291 +' 320 +2 378 +" 373 +ץ 898 +a 244 +9 240 +5 216 +4 208 +. 208 +n 188 +6 184 +3 177 +8 166 +o 152 +r 150 +e 147 +t 140 +7 123 +i 109 +l 106 +p 99 +s 91 +, 90 +c 88 +d 73 +g 72 +m 71 +u 71 +h 60 +: 53 +b 50 +־ 32 +v 31 +k 30 +/ 26 +f 24 +y 19 +w 18 +% 16 +j 9 +z 8 +x 8 +; 6 +õ 4 +$ 4 +ּ 3 ++ 3 +— 591 +[ 3 +] 3 +¥ 3 +q 2 +! 2 +– 2 +? 1 +í 1 diff --git a/mms-1b-all/hin/lexicon.txt b/mms-1b-all/hin/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..e34a3e32bd5c2c74e6c6f202e80188e1fa8e052c --- /dev/null +++ b/mms-1b-all/hin/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/hin/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/hin/tokens.txt b/mms-1b-all/hin/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..9842e8ce3061580a1749a19037eb58e2a00d1aa4 --- /dev/null +++ b/mms-1b-all/hin/tokens.txt @@ -0,0 +1,123 @@ +| 477390 +ा 141913 +क 112716 +े 133876 +र 104411 +् 67425 +स 75384 +ि 62096 +न 79200 +ं 54898 +ह 91419 +ी 60431 +त 65279 +म 65207 +ो 58317 +ल 37165 +य 45862 +प 41575 +व 37047 +ज 29217 +ै 25288 +द 29589 +ब 27459 +ग 23241 +ट 5235 +ु 39388 +ए 15249 +़ 12501 +श 19255 +अ 12741 +ड 7893 +ू 13642 +च 13773 +थ 12342 +आ 10285 +इ 8143 +भ 12904 +औ 12910 +ध 6326 +फ 4892 +ख 10172 +उ 19546 +ष 4452 +ई 3718 +ण 2631 +छ 4979 +0 93 +ॉ 24 +ौ 2543 +1 22 +- 1836 +ओ 2672 +घ 1400 +2 19 +ठ 3217 +ृ 1073 +ढ 1447 +9 133 +5 132 +4 14 +ऐ 1523 +झ 3230 +ऑ 12 +3 3 +6 104 +8 2 +ञ 782 +। 79 +. 68 +a 93 +7 61 +, 57 +o 83 +ँ 9708 +ऊ 1118 +c 27 +t 44 +l 4 +n 32 +e 89 +i 96 +r 57 +/ 27 +p 46 +s 45 +m 29 +: 24 +​ 20 +f 35 +ः 303 +' 541 +g 20 +u 11 +d 19 +b 24 +” 12 +h 2 +% 8 +w 44 +y 11 +¥ 6 +x 8 ++ 4 +v 25 +j 6 +$ 3 +‍ 4794 +! 3 +; 3 +á 3 +° 2 +ú 2 +k 26 +z 2 +õ 2 +— 15 +q 1 +£ 1 +ऋ 6 +ॅ 12 +` 6 +ॆ 2 +ऩ 1 diff --git a/mms-1b-all/hrv/lexicon.txt b/mms-1b-all/hrv/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..17fb7ab4c1fa6ee41e4a16b1732bde4d5b77fc3e --- /dev/null +++ b/mms-1b-all/hrv/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/hrv/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/hrv/tokens.txt b/mms-1b-all/hrv/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..e465c5af1b26be7e75179d0371564070b76882e9 --- /dev/null +++ b/mms-1b-all/hrv/tokens.txt @@ -0,0 +1,72 @@ +| 284886 +a 177712 +i 160492 +e 138045 +o 152040 +n 105258 +j 80907 +r 87770 +t 79334 +s 75968 +u 69947 +l 46697 +k 52771 +v 56145 +d 54249 +m 56665 +p 50821 +z 29092 +g 25714 +b 20213 +c 14118 +č 6972 +h 11600 +š 6948 +ž 5480 +ć 4335 +f 3522 +0 858 +. 835 +đ 1984 +" 816 +1 641 +2 406 +y 405 +w 438 +9 283 +- 265 +5 245 +4 222 +3 208 +6 204 +8 190 +7 139 +: 100 +/ 66 +x 179 +, 47 +q 10 +' 3 +​ 16 +; 15 +– 12 +% 11 +ü 8 +í 8 +° 8 +! 7 +á 3 +¥ 6 +ç 6 +õ 6 +ö 1 +” 5 ++ 5 +é 4 +ú 3 +[ 3 +] 3 +? 2 +ł 2 +² 2 +ó 2 diff --git a/mms-1b-all/hun/lexicon.txt b/mms-1b-all/hun/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..af7049976121323f4ee2a60494af27eae89bfd8a --- /dev/null +++ b/mms-1b-all/hun/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/hun/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/hun/tokens.txt b/mms-1b-all/hun/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..990d4da413632972429c3d412bc0b94beb68c973 --- /dev/null +++ b/mms-1b-all/hun/tokens.txt @@ -0,0 +1,72 @@ +| 107422 +e 66431 +a 51633 +t 54173 +l 32252 +s 32826 +n 36048 +k 35824 +o 21820 +r 20066 +z 24615 +i 24601 +á 17697 +g 21773 +é 20502 +m 24249 +y 12637 +b 9398 +v 10581 +d 13876 +h 10004 +ö 6306 +j 8046 +ó 4435 +u 5919 +p 4722 +f 4358 +ő 5050 +c 2091 +í 3759 +ü 3701 +ú 1459 +0 679 +ű 670 +1 562 +- 205 +2 326 +. 320 +" 285 +w 73 +9 241 +5 208 +4 196 +3 180 +6 174 +8 173 +x 20 +7 112 +/ 55 +, 40 +q 6 +” 35 +: 34 +' 2 +% 17 +° 8 +õ 6 +– 133 +— 5 +! 4 +; 4 ++ 4 +ç 3 +ã 3 +​ 3 +[ 3 +] 3 +² 2 +̇ 2 +? 2 +× 1 +û 1 diff --git a/mms-1b-all/hye/lexicon.txt b/mms-1b-all/hye/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..c7eb977fb120aa3fbfe3122ec00f04c3f452c70a --- /dev/null +++ b/mms-1b-all/hye/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/hye/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/hye/tokens.txt b/mms-1b-all/hye/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..d62f27a152ccc1cd59a808ff8edc709550bae295 --- /dev/null +++ b/mms-1b-all/hye/tokens.txt @@ -0,0 +1,99 @@ +| 3739 +ա 3846 +ն 2452 +ր 1991 +ո 1758 +ե 1837 +ի 1256 +ւ 1266 +մ 988 +կ 915 +տ 765 +յ 620 +վ 523 +ս 508 +լ 403 +հ 461 +ց 373 +թ 357 +ք 308 +ը 371 +դ 323 +գ 312 +ղ 310 +պ 272 +բ 287 +է 366 +ծ 227 +շ 223 +ռ 147 +և 2606 +զ 160 +խ 170 +չ 140 +ջ 149 +փ 71 +ձ 82 +ժ 81 +օ 38 +ճ 70 +0 757 +- 611 +1 568 +ֆ 23 +2 350 +: 343 +9 234 +5 206 +4 192 +3 179 +8 176 +6 170 +՝ 158 +։ 154 +7 119 +» 116 +, 110 +« 105 +. 102 +n 81 +a 71 +o 62 +/ 60 +i 58 +e 55 +p 46 +r 45 +c 41 +d 41 +` 36 +t 35 +g 32 +s 25 +h 24 +' 23 +l 23 +u 21 +v 21 +․ 19 +% 17 +f 15 +b 13 +՛ 12 +m 10 +w 10 +k 7 +x 7 +’ 6 +j 5 ++ 5 +õ 4 +y 4 +​ 4 +! 3 +° 3 +— 2 +[ 2 +] 2 +í 2 +ó 2 diff --git a/mms-1b-all/ibo/lexicon.txt b/mms-1b-all/ibo/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..f087960790746066442f6e0a27b9bf65bda001ce --- /dev/null +++ b/mms-1b-all/ibo/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ibo/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ibo/tokens.txt b/mms-1b-all/ibo/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..35bd5cc96a1abfd9ca3575b0428842b5168e0ae3 --- /dev/null +++ b/mms-1b-all/ibo/tokens.txt @@ -0,0 +1,85 @@ +| 65791 +a 41220 +e 27079 +n 24818 +i 17595 +k 13133 +ụ 12672 +r 12164 +m 11779 +o 11655 +ọ 10071 +u 9666 +h 9326 +g 9315 +b 8863 +ị 8688 +t 7958 +w 7547 +d 7044 +s 5853 +l 4658 +y 4090 +p 3652 +' 3088 +c 3051 +- 2359 +z 2078 +j 1898 +f 1541 +0 568 +1 430 +v 375 +2 276 +9 208 +5 159 +4 159 +6 151 +3 147 +8 126 +. 120 +’ 109 +7 89 +x 64 +, 48 +: 34 +q 31 +/ 27 +ñ 17 +$ 17 +̄ 14 +ò 13 +; 12 +è 12 +à 8 +ü 6 +– 5 +ṅ 5 +ç 5 +” 5 ++ 4 +õ 4 +é 4 +ù 4 +á 4 +° 3 +¥ 3 +% 3 +ì 3 +ã 3 +í 3 +[ 3 +] 3 +— 2 +ū 2 +" 2 +ó 2 +_ 2 +ú 2 +£ 2 +! 2 +̀ 1 +ē 1 +ł 1 +& 1 +² 1 diff --git a/mms-1b-all/ind/lexicon.txt b/mms-1b-all/ind/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..e8b4faafb1ad5cb5492f91857c18e974f875e1e1 --- /dev/null +++ b/mms-1b-all/ind/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ind/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ind/tokens.txt b/mms-1b-all/ind/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2a28e8e3bba943cf1ebc43a018d9ad091baef74 --- /dev/null +++ b/mms-1b-all/ind/tokens.txt @@ -0,0 +1,76 @@ +a 1147055 +| 902988 +n 482968 +e 402349 +i 371728 +t 248815 +r 238344 +u 308020 +k 301752 +s 197167 +m 243638 +d 226055 +g 197000 +l 176332 +p 134437 +b 133899 +h 159530 +o 54896 +y 113394 +j 45894 +c 14674 +w 15253 +f 3798 +- 35375 +v 2 +0 66 +1 24 +2 8 +z 1938 +. 171 +9 170 +5 2 +3 148 +4 32 +8 131 +6 10 +7 88 +x 82 +/ 36 +, 32 +' 352 +ã 32 +" 31 +q 37 +â 28 +€ 15 +: 14 +¡ 14 +% 9 +; 7 +¥ 6 +° 5 +­ 5 +$ 4 +£ 4 +§ 3 +º 3 +” 3 +© 2 +² 2 +& 2 +ä 2 ++ 1 +• 1 +µ 1 +¶ 1 +! 1 +— 2910 +\ 4 +á 3 +– 380 +é 2 +ō 1 +ł 1 +ń 1 +ʼ 619 diff --git a/mms-1b-all/isl/lexicon.txt b/mms-1b-all/isl/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..e219ba90e68d322ab81357706d3666458fb72e87 --- /dev/null +++ b/mms-1b-all/isl/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/isl/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/isl/tokens.txt b/mms-1b-all/isl/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..40a2433d7a0cd2770883879089c9b289e6eee828 --- /dev/null +++ b/mms-1b-all/isl/tokens.txt @@ -0,0 +1,60 @@ +| 126911 +a 47752 +r 46845 +n 44307 +i 44971 +e 37549 +s 31259 +t 24328 +u 27717 +l 21858 +ð 27940 +g 25158 +m 21556 +f 14238 +k 18353 +v 12001 +o 14498 +h 14224 +d 8733 +í 8033 +á 8842 +þ 13259 +j 6344 +b 4117 +y 6085 +ó 4622 +p 3481 +ö 4169 +æ 3698 +ú 4291 +ý 1196 +é 4832 +0 1 +1 197 +c 174 +- 1 +. 142 +2 1 +9 77 +w 73 +5 58 +4 55 +3 54 +8 54 +6 47 +x 137 +7 1 +z 27 +q 17 +: 16 +/ 15 +, 10 +' 4 +% 2 +ã 2 +; 1 +² 1 +– 1 +[ 1 +] 1 diff --git a/mms-1b-all/ita/lexicon.txt b/mms-1b-all/ita/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..6591db00e44af726e913423e5db13d5a3e25914c --- /dev/null +++ b/mms-1b-all/ita/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ita/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ita/tokens.txt b/mms-1b-all/ita/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8d9719fda8eec311df4c4e090e9ffd4073cf7b5 --- /dev/null +++ b/mms-1b-all/ita/tokens.txt @@ -0,0 +1,179 @@ +| 1415014 +i 841722 +e 840501 +a 838608 +o 679059 +n 557586 +t 527514 +r 489986 +l 491893 +s 400255 +c 320129 +d 272622 +u 237248 +p 208384 +m 194767 +g 135581 +v 120820 +f 88323 +h 64899 +z 66451 +b 70318 +' 25794 +q 29111 +è 29547 +à 13426 +0 729 +k 9070 +1 492 +ù 5779 +y 9825 +2 330 +w 6893 +ò 12025 +9 192 +5 188 +4 177 +3 170 +. 166 +j 4300 +8 156 +6 155 +- 4936 +x 1970 +ì 4029 +7 104 +é 2438 +/ 50 +: 42 +% 33 +, 32 +° 20 +; 14 +’ 10 +² 10 +á 295 +" 9 +ü 8 +& 6 ++ 4 +í 260 +ç 4 +¥ 3 +š 80 +ł 18 +ª 2 +º 2 +ó 190 +[ 2 +] 2 +ú 80 +ï 22 +ō 109 +č 78 +– 65 +ū 57 +ä 55 +ñ 49 +ø 42 +ã 40 +ʿ 39 +ī 36 +ć 34 +ë 32 +ž 25 +ô 23 +å 18 +î 17 +ê 17 +ḥ 15 +ń 14 +æ 13 +ß 13 +ö 10 +ş 7 +ṣ 7 +` 6 +ř 6 +ə 5 +ě 5 +đ 5 +ė 5 +́ 5 +′ 5 +ň 4 +ё 4 +ő 4 +þ 3 +œ 3 +ð 3 +б 3 +а 3 +с 3 +ʻ 3 +ś 3 +ṭ 3 +ː 3 +ę 2 +ź 2 +ğ 2 +μ 2 +ț 2 +ı 2 +û 2 +о 2 +ʾ 2 +ā 2 +е 2 +ד 2 +ン 2 +ľ 2 +ʼ 2 +ة 2 +張 2 +三 2 +— 2 +禅 1 +家 1 +‐ 1 +ŭ 1 +旅 1 +д 1 +н 1 +ą 1 +あ 1 +ц 1 +ÿ 1 +л 1 +љ 1 +古 1 +多 1 +万 1 +ו 1 +ה 1 +ア 1 +ノ 1 +ș 1 +サ 1 +カ 1 +キ 1 +フ 1 +リ 1 +ザ 1 +ل 1 +س 1 +ص 1 +غ 1 +ي 1 +ر 1 +̨ 1 +ѐ 1 +ṛ 1 +у 1 +ễ 1 +丰 1 +峰 1 +ң 1 +ꞌ 1 +ż 1 +☆ 1 diff --git a/mms-1b-all/jav/lexicon.txt b/mms-1b-all/jav/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..283aa1f587b6bf2b09105124226aab897d2af0d5 --- /dev/null +++ b/mms-1b-all/jav/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/jav/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/jav/tokens.txt b/mms-1b-all/jav/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ab180f58f564ac7a7f1ceb50b6107514477cb1e --- /dev/null +++ b/mms-1b-all/jav/tokens.txt @@ -0,0 +1,69 @@ +a 328491 +| 281561 +n 202802 +i 102683 +e 101784 +g 117298 +k 98191 +s 66097 +u 90998 +r 64175 +t 50515 +l 46548 +m 48397 +o 39986 +d 44120 +p 50817 +h 41877 +b 29925 +w 37546 +y 19331 +j 13715 +c 5824 +- 7931 +f 562 +0 1 +v 518 +1 1 +2 1 +9 1 +5 186 +4 168 +. 168 +3 158 +8 154 +6 151 +z 165 +7 1 +x 1 +q 43 +, 40 +/ 36 +' 46 +é 27 +: 25 +" 18 +” 16 +; 12 +% 11 +á 10 +$ 8 +ü 8 +& 7 ++ 5 +— 5 +ç 5 +õ 4 +í 3 +[ 3 +] 3 +£ 3 +ã 3 +ê 3 +° 3 +² 2 +‘ 2 +! 1 +ó 1 +ö 1 +` 1 diff --git a/mms-1b-all/jpn/lexicon.txt b/mms-1b-all/jpn/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/mms-1b-all/jpn/lexicon.txt @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/mms-1b-all/jpn/tokens.txt b/mms-1b-all/jpn/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..c848376b8d4b34b27474bb82be7fd350d437295d --- /dev/null +++ b/mms-1b-all/jpn/tokens.txt @@ -0,0 +1,2264 @@ +の 6928 +に 3943 +し 4139 +は 5321 +で 3317 +た 4163 +ま 3337 +す 2738 +を 2655 +と 2715 +い 3799 +る 2876 +が 2197 +て 3083 +ー 1701 +ン 3268 +な 2285 +れ 2595 +り 1258 +こ 1447 +ス 1942 +か 1375 +ル 1675 +ら 1433 +あ 1295 +も 1256 +さ 1130 +う 819 +ト 1541 +っ 1588 +ラ 1189 +イ 1306 +ア 1259 +く 889 +リ 1411 +0 525 +1 524 +き 644 +よ 714 +ク 1131 +ッ 1164 +人 499 +ん 668 +け 554 +ド 886 +や 269 +め 433 +的 668 +国 184 +そ 1085 +フ 722 +ロ 700 +タ 658 +2 331 +シ 779 +つ 429 +せ 345 +カ 619 +地 296 +ジ 698 +ど 324 +年 112 +大 346 +場 211 +レ 653 +え 341 +テ 555 +行 232 +オ 438 +物 228 +わ 318 +者 283 +ち 348 +生 326 +日 132 +合 175 +コ 481 +バ 577 +ブ 413 +発 172 +ば 243 +要 138 +だ 566 +マ 527 +プ 583 +ニ 382 +中 270 +上 185 +サ 378 +一 388 +ィ 707 +チ 402 +グ 507 +間 178 +学 365 +ャ 419 +出 227 +時 144 +部 222 +キ 335 +手 146 +ム 520 +動 159 +最 270 +語 170 +性 281 +3 170 +分 211 +ズ 342 +入 118 +ウ 519 +メ 336 +理 144 +多 159 +9 154 +通 159 +エ 341 +5 152 +定 149 +見 216 +べ 129 +ほ 125 +ダ 311 +自 283 +パ 312 +み 191 +ュ 371 +後 276 +ビ 315 +方 151 +ェ 365 +法 112 +化 106 +会 225 +用 233 +デ 476 +前 207 +子 188 +ょ 67 +月 12 +体 133 +4 129 +ピ 160 +a 502 +ナ 314 +事 146 +常 132 +6 123 +お 275 +8 120 +所 133 +世 111 +ネ 173 +能 136 +数 117 +セ 228 +度 69 +ず 109 +報 50 +ミ 265 +本 136 +必 53 +ツ 172 +現 181 +水 75 +ハ 229 +立 192 +可 89 +力 86 +高 135 +ポ 167 +以 69 +彼 1168 +長 129 +目 96 +ケ 173 +旅 24 +ョ 304 +受 87 +外 87 +戦 100 +同 156 +当 82 +言 186 +気 128 +在 164 +光 26 +組 71 +ガ 178 +ソ 201 +機 102 +開 105 +選 87 +7 93 +氏 31 +経 61 +予 27 +空 74 +関 120 +重 71 +界 106 +文 76 +金 63 +ベ 254 +じ 153 +名 239 +向 75 +使 167 +教 158 +ノ 148 +全 131 +作 243 +げ 103 +観 67 +ギ 117 +明 99 +家 171 +約 31 +内 70 +代 89 +ボ 176 +取 105 +利 56 +先 32 +他 89 +p 127 +ザ 120 +業 127 +原 45 +成 166 +m 143 +近 94 +下 97 +々 76 +主 210 +回 41 +状 46 +、 75 +域 72 +非 73 +車 75 +意 114 +書 99 +持 125 +有 97 +ァ 146 +ゴ 127 +点 70 +電 45 +集 74 +期 93 +面 68 +番 29 +初 133 +道 92 +特 92 +ペ 140 +告 30 +設 92 +話 99 +ろ 162 +実 102 +連 73 +調 37 +保 61 +何 87 +島 34 +ホ 118 +軍 60 +始 59 +海 40 +供 78 +変 79 +決 81 +表 97 +過 55 +市 74 +確 46 +示 70 +食 74 +航 16 +小 114 +安 47 +認 41 +モ 193 +員 71 +州 142 +ワ 152 +解 63 +e 530 +速 28 +際 31 +知 162 +記 79 +考 72 +制 70 +族 47 +異 47 +呼 82 +o 393 +球 27 +流 52 +議 73 +得 48 +類 44 +北 62 +料 37 +び 152 +来 102 +問 76 +社 109 +形 72 +住 73 +c 207 +加 80 +種 54 +対 142 +米 18 +む 60 +活 57 +続 99 +民 70 +政 82 +付 111 +新 98 +別 69 +神 40 +放 34 +n 329 +位 53 +g 113 +画 134 +影 46 +公 104 +s 364 +査 17 +d 196 +r 362 +心 76 +判 45 +感 60 +験 37 +南 64 +述 19 +今 72 +直 42 +送 30 +計 64 +務 73 +件 24 +客 47 +思 88 +第 26 +隊 24 +果 66 +死 39 +i 372 +私 191 +式 59 +味 86 +t 326 +不 81 +察 23 +正 61 +ヤ 102 +提 72 +船 18 +ご 46 +野 33 +へ 105 +院 38 +交 35 +試 29 +違 35 +l 309 +習 19 +録 45 +害 28 +質 54 +進 57 +題 69 +限 47 +態 26 +起 53 +広 55 +少 94 +存 56 +王 38 +結 88 +, 43 +女 327 +曜 10 +史 45 +」 43 +楽 76 +低 31 +念 38 +込 41 +ォ 136 +紀 1 +官 26 +ヴ 106 +産 62 +「 42 +星 15 +基 48 +陸 13 +運 82 +離 41 +挙 26 +都 34 +優 26 +k 97 +乗 33 +府 25 +警 28 +建 84 +強 52 +統 57 +染 9 +撃 31 +局 33 +病 24 +ゲ 100 +情 34 +科 37 +価 39 +号 34 +響 38 +移 31 +引 53 +量 24 +険 16 +指 64 +万 6 +因 15 +準 32 +資 30 +税 7 +応 33 +器 41 +ヨ 57 +割 23 +両 57 +着 90 +止 36 +ヒ 82 +身 88 +素 39 +英 32 +落 25 +係 36 +規 30 +様 76 +領 29 +証 26 +型 18 +単 48 +収 47 +ヘ 71 +任 41 +火 26 +東 52 +信 61 +元 80 +伝 53 +終 65 +争 31 +般 58 +歴 46 +例 36 +無 64 +十 32 +備 38 +西 42 +熱 35 +断 31 +u 162 +由 47 +写 35 +降 13 +勝 34 +帯 30 +品 96 +ユ 51 +歩 34 +風 31 +負 22 +登 39 +訪 11 +然 43 +抗 14 +想 31 +転 41 +宿 5 +雨 13 +置 67 +~ 32 +亡 21 +傷 15 +育 45 +士 47 +配 41 +周 30 +再 47 +音 86 +権 33 +支 65 +潜 8 +真 58 +細 28 +洋 5 +づ 37 +個 47 +囲 36 +ぎ 45 +催 11 +共 53 +検 32 +求 28 +ぐ 55 +映 82 +視 31 +張 40 +投 45 +首 23 +親 58 +帰 15 +与 48 +恐 18 +台 31 +爆 5 +象 39 +団 42 +参 47 +危 8 +構 50 +属 33 +半 24 +波 11 +術 76 +次 44 +残 51 +済 21 +良 40 +症 21 +渡 21 +医 14 +夜 19 +相 36 +去 40 +f 90 +便 19 +防 17 +難 31 +泊 3 +頃 11 +季 7 +店 29 +被 8 +路 41 +ゥ 32 +命 34 +区 85 +ざ 52 +造 42 +ぶ 32 +識 58 +故 15 +接 41 +戻 34 +暴 9 +攻 15 +破 22 +h 191 +宇 8 +宙 11 +二 63 +裁 16 +増 24 +比 16 +管 38 +悪 34 +失 48 +適 30 +石 30 +研 36 +陽 7 +飛 37 +義 61 +復 18 +太 11 +境 50 +切 31 +艦 6 +足 28 +古 41 +極 14 +役 56 +葉 24 +施 17 +覚 25 +激 20 +融 7 +拠 21 +演 76 +b 119 +飲 20 +申 9 +職 16 +川 47 +治 59 +像 29 +導 22 +和 24 +天 21 +候 32 +庭 17 +窓 8 +遺 10 +望 22 +働 35 +鳥 11 +消 25 +究 36 +反 58 +傾 10 +滞 2 +許 21 +週 13 +圧 18 +待 16 +刑 19 +級 6 +馬 22 +捕 16 +購 7 +簡 12 +源 17 +効 33 +平 16 +欧 6 +継 27 +衆 14 +港 18 +ぼ 23 +模 15 +律 20 +徒 16 +軽 14 +千 6 +v 51 +協 20 +容 28 +口 60 +環 29 +線 34 +% 20 +論 73 +歳 1 +織 21 +雪 10 +格 32 +銀 11 +側 34 +避 7 +急 25 +況 14 +具 18 +. 19 +幅 16 +射 6 +標 24 +植 25 +値 24 +製 38 +深 25 +賞 23 +園 35 +朝 11 +端 18 +留 6 +雑 15 +障 19 +夏 9 +追 40 +屋 35 +訴 16 +疑 13 +探 15 +撮 10 +独 36 +距 9 +補 25 +票 11 +農 27 +除 26 +欠 14 +群 16 +印 12 +積 26 +商 34 +氷 8 +複 31 +療 14 +突 17 +山 43 +声 36 +ぜ 14 +委 12 +字 29 +侵 11 +岸 12 +宅 8 +宗 16 +各 15 +扱 8 +功 35 +監 30 +慣 11 +助 24 +途 8 +午 3 +系 23 +輸 12 +就 4 +為 25 +児 13 +遠 26 +達 27 +頭 33 +注 26 +衛 3 +街 14 +温 11 +震 4 +読 12 +司 18 +技 60 +微 8 +専 13 +罪 16 +含 80 +密 22 +従 21 +苦 14 +荷 4 +材 21 +満 21 +焦 6 +酒 7 +改 43 +胞 9 +覆 11 +率 16 +男 142 +ゆ 19 +営 35 +好 43 +緊 7 +競 20 +援 26 +兵 9 +総 8 +称 25 +階 21 +益 10 +勢 13 +評 44 +範 17 +余 11 +寄 17 +土 30 +・ 15 +測 19 +洞 4 +閉 18 +聞 33 +獲 13 +泳 5 +沿 14 +図 22 +訳 8 +工 34 +歌 56 +河 8 +毎 18 +町 61 +諸 7 +守 22 +盟 6 +担 12 +倒 21 +脅 6 +威 4 +門 26 +請 9 +右 10 +刻 12 +減 19 +推 14 +三 35 +殿 5 +肢 2 +券 2 +精 16 +換 15 +木 40 +寒 3 +唯 16 +省 10 +燃 10 +壊 23 +条 13 +跡 10 +程 10 +冬 5 +較 6 +校 93 +費 11 +菌 5 +露 10 +振 18 +否 36 +捜 13 +索 6 +散 18 +巻 9 +景 15 +昨 4 +鏡 12 +億 13 +説 46 +択 16 +息 46 +寺 4 +末 3 +芸 13 +師 21 +差 15 +謝 9 +婚 34 +維 18 +了 12 +届 6 +滑 10 +革 16 +逮 5 +弓 2 +延 6 +血 11 +逃 14 +倍 4 +曲 71 +触 15 +折 3 +損 10 +層 13 +怖 9 +未 22 +城 6 +並 10 +案 19 +照 19 +版 29 +厳 10 +液 4 +操 14 +給 9 +敷 6 +粒 6 +漏 2 +ぞ 28 +完 46 +携 11 +誰 33 +油 14 +署 12 +懸 3 +仕 45 +唆 7 +館 36 +党 42 +敗 19 +及 19 +邦 10 +央 13 +翻 4 +墓 19 +買 34 +ゼ 48 +紙 26 +y 88 +責 9 +授 21 +服 21 +救 4 +整 16 +依 12 +j 30 +超 16 +段 22 +我 47 +処 15 +更 25 +母 23 +橋 20 +窟 2 +売 60 +京 2 +富 6 +夫 19 +祝 7 +滝 6 +興 18 +居 16 +紛 4 +舞 9 +則 7 +打 25 +美 16 +繰 8 +底 8 +華 2 +患 12 +墜 10 +刺 13 +睡 2 +眠 6 +友 18 +隣 17 +竜 2 +辺 21 +村 40 +著 20 +惑 7 +早 11 +護 37 +左 10 +- 20 +祭 9 +勧 2 +敵 8 +横 11 +儀 6 +概 14 +ヌ 26 +批 14 +賀 10 +派 29 +魅 7 +絶 33 +花 11 +希 3 +寝 8 +走 24 +修 31 +臓 3 +尿 3 +企 17 +針 2 +籍 7 +裂 6 +盛 4 +揮 11 +披 7 +暗 17 +陥 6 +健 11 +拒 15 +典 12 +亜 8 +吸 6 +快 14 +聖 30 +将 13 +額 4 +貨 7 +遊 12 +席 15 +憲 9 +酸 4 +枚 2 +承 11 +該 9 +狭 9 +頻 11 +払 12 +伸 4 +肩 10 +姿 11 +林 15 +課 4 +労 20 +躍 5 +到 15 +善 20 +板 7 +赤 40 +博 29 +似 24 +色 99 +越 21 +答 19 +荒 3 +崩 11 +帝 13 +徴 22 +康 8 +角 15 +w 87 +築 28 +幸 9 +載 20 +弁 21 +汚 4 +頼 14 +暑 2 +根 17 +順 9 +殺 32 +室 20 +績 7 +怪 8 +津 8 +展 28 +核 6 +拡 14 +犯 15 +薬 11 +鎖 10 +清 2 +准 8 +浮 9 +婦 7 +埋 17 +黒 28 +ゾ 16 +禁 10 +飾 8 +易 12 +刷 3 +短 21 +脳 7 +賃 2 +郵 8 +ふ 15 +奪 4 +幹 6 +遅 15 +/ 7 +免 6 +湾 8 +片 7 +豊 10 +齢 7 +痛 9 +抑 5 +至 4 +還 2 +措 2 +詳 12 +握 8 +採 18 +眺 2 +柔 6 +珍 3 +征 4 +脈 13 +弾 14 +夢 6 +尊 4 +隻 7 +審 1 +固 15 +編 33 +辞 12 +脱 6 +爪 1 +ね 48 +契 13 +猫 6 +占 10 +濃 3 +毒 5 +湖 17 +抜 11 +蔵 7 +若 23 +顧 8 +列 7 +x 22 +洪 1 +貿 7 +押 10 +裕 4 +骨 13 +互 13 +葬 17 +旧 6 +肝 1 +塞 4 +盗 2 +百 12 +砂 11 +暖 5 +策 11 +等 17 +符 4 +晩 6 +ひ 52 +巨 7 +靴 4 +混 9 +努 9 +網 4 +練 12 +騎 3 +退 26 +渓 9 +谷 14 +略 16 +掘 7 +穴 5 +座 38 +削 6 +狩 2 +旬 1 +駆 6 +卒 16 +憶 10 +筋 4 +伴 4 +弱 3 +誌 9 +冷 18 +ゃ 43 +襲 11 +誕 7 +駐 9 +衝 10 +陣 5 +圏 6 +干 2 +岩 15 +背 21 +返 21 +歯 10 +叩 3 +炉 1 +眼 8 +純 11 +醸 3 +蛇 1 +香 5 +致 6 +蒸 4 +隠 9 +迅 6 +祖 10 +覧 11 +糖 2 +貫 7 +円 12 +渉 2 +厚 2 +劇 25 +布 13 +殖 4 +創 20 +装 33 +巡 6 +絡 4 +田 12 +雲 3 +蚊 5 +闘 19 +秀 3 +算 10 +藤 2 +秩 2 +序 5 +遭 3 +己 65 +馴 1 +旗 4 +牧 13 +雇 8 +炭 7 +沢 5 +白 53 +膨 6 +孔 3 +愛 27 +雷 5 +繁 16 +福 9 +熊 1 +戚 5 +喜 4 +簿 5 +栄 12 +暮 5 +阻 5 +鍵 5 +循 5 +武 13 +縦 5 +踵 5 +奏 38 +遂 2 +疲 6 +皆 3 +搬 5 +貴 7 +筆 13 +驚 12 +譲 3 +顔 18 +財 12 +仲 7 +掲 7 +斜 6 +頂 7 +礼 10 +診 2 +豪 3 +恒 2 +九 2 +秘 6 +里 1 +災 6 +壁 9 +欲 14 +腕 6 +虫 14 +借 5 +兆 2 +腹 14 +困 14 +湿 5 +硬 4 +舎 5 +耳 7 +輪 10 +介 13 +草 11 +丘 8 +逸 2 +招 3 +誘 11 +描 17 +把 6 +焼 12 +熟 6 +猛 4 +塔 6 +矛 29 +盾 30 +虐 2 +崎 4 +県 12 +佐 3 +妃 4 +宮 3 +隔 4 +戒 1 +礁 4 +殊 4 +耐 12 +揃 1 +匹 3 +掃 8 +納 4 +匂 4 +狙 4 +志 4 +停 7 +鉄 31 +鮫 4 +森 17 +封 1 +滅 8 +春 5 +秒 2 +釈 6 +飼 6 +犬 23 +砕 4 +薄 11 +妨 4 +四 12 +廷 2 +潮 4 +絞 1 +韓 6 +閲 4 +嘘 6 +芝 4 +摘 8 +妻 18 +吹 10 +霊 3 +摂 3 +冒 7 +柄 1 +聴 7 +銃 9 +添 2 +窒 2 +椅 5 +泡 4 +践 2 +藻 4 +塩 8 +討 11 +曖 4 +昧 4 +昼 3 +z 13 +父 34 +軒 4 +貧 11 +塗 5 +姫 3 +昔 6 +壇 2 +麓 3 +腫 3 +乾 9 +緩 5 +褒 3 +玉 1 +透 5 +斎 3 +昇 7 +訓 5 +拘 1 +談 4 +是 3 +仰 3 +哲 8 +顕 4 +搭 1 +督 21 +排 6 +胡 3 +錦 3 +濤 3 +猟 3 +稀 2 +鳴 3 +磁 2 +幕 2 +鮮 4 +麻 7 +祉 3 +弟 29 +怒 8 +遡 2 +勾 1 +筒 2 +養 14 +魚 9 +朗 2 +青 24 +溢 1 +丸 7 +裏 6 +囚 7 +卿 5 +盤 10 +嵐 4 +久 5 +踏 7 +哺 1 +乳 3 +: 3 +邸 3 +仮 9 +綴 2 +拓 4 +販 17 +瓶 1 +棒 4 +郊 10 +禽 3 +剤 15 +黄 17 +堤 3 +令 11 +獄 3 +尾 13 +晰 3 +募 3 +宛 3 +鋭 4 +尖 1 +冠 5 +隙 3 +普 11 +搾 3 +靭 3 +看 5 +孤 6 +勉 6 +阪 1 +枠 1 +涼 1 +講 6 +傑 3 +賊 3 +廃 12 +静 12 +束 11 +洗 4 +履 3 +既 11 +錯 3 +漁 5 +晴 12 +敬 9 +旋 1 +胆 2 +癌 2 +鉛 1 +須 2 +永 6 +掛 6 +彩 2 +隕 1 +逆 12 +庁 14 +嫌 6 +貯 6 +杯 3 +抱 10 +巣 4 +寂 3 +輩 1 +ぽ 9 +! 2 +娯 1 +腰 2 +媒 9 +℃ 2 +柱 3 +八 7 +沈 8 +替 13 +夕 5 +怯 2 +懲 2 +捨 3 +剰 6 +贅 1 +疫 3 +肉 17 +艇 2 +菓 5 +潔 1 +縮 9 +崖 3 +喉 4 +鋼 3 +促 5 +鹿 4 +× 2 +訛 2 +郷 5 +扇 2 +箱 9 +函 2 +慎 4 +彗 2 +慮 5 +宣 15 +妙 9 +綻 1 +扉 3 +罰 1 +枝 1 +噛 2 +包 6 +掌 2 +焉 1 +札 3 +徐 6 +詞 16 +棺 2 +餌 2 +曹 2 +猿 2 +緯 2 +鍛 2 +克 1 +贄 2 +捧 4 +刈 2 +亀 1 +陶 1 +羽 5 +悩 2 +唸 2 +憾 2 +井 1 +噴 2 +疽 1 +耕 1 +双 4 +桜 2 +鑑 2 +縛 2 +晶 4 +漠 1 +斉 3 +仁 2 +僧 1 +侶 1 +ぷ 3 +紅 3 +茶 21 +q 6 +擬 2 +臣 5 +瘍 1 +酔 2 +鈍 2 +姉 5 +妹 9 +顎 2 +皇 12 +遇 3 +勤 4 +棄 6 +俗 1 +鐙 2 +鞍 2 +預 2 +舗 3 +併 4 +榜 2 +庫 6 +泣 6 +叫 11 +戯 2 +仏 1 +毛 10 +雀 2 +齧 1 +濯 2 +抵 5 +節 12 +衣 7 +詩 14 +孫 9 +馳 1 +犠 4 +牲 4 +執 15 +副 10 +姦 2 +童 5 +窃 2 +乞 2 +誇 4 +胎 1 +肺 1 +雰 3 +迫 7 +忘 8 +狼 1 +需 4 +奮 5 +鄧 2 +誤 6 +尽 2 +灯 7 +拾 1 +剣 5 +辛 3 +贈 3 +涙 1 +挟 2 +宴 2 +呪 2 +讐 2 +痢 2 +寿 3 +願 9 +浴 3 +臆 2 +励 3 +絵 18 +肥 2 +唱 11 +膜 4 +句 3 +章 13 +啓 1 +蒙 1 +緒 26 +俺 11 +叶 2 +矢 1 +鎧 2 +鼻 2 +倫 3 +² 1 +浅 2 +塊 1 +捉 2 +詐 4 +欺 3 +慢 2 +疼 1 +雅 1 +佇 1 +縄 1 +勇 4 +忍 4 +翌 9 +泥 1 +賄 2 +債 4 +濡 1 +寮 1 +怠 2 +屯 4 +梅 2 +沖 1 +ぱ 20 +迷 8 +銅 3 +罠 1 +牛 7 ++ 1 +蓋 6 +浪 1 +貼 4 +株 11 +骸 4 +斗 1 +渦 1 +廊 3 +嘔 1 +吐 1 +糧 1 +硫 2 +遮 1 +涯 4 +柵 1 +杭 1 +砲 6 +没 4 +堪 1 +帥 2 +欄 2 +潤 2 +礎 5 +浸 2 +弊 1 +娘 21 +稼 2 +吊 3 +' 15 +幾 1 +傘 1 +狂 2 +墟 1 +祥 1 +豚 1 +帳 1 +賑 1 +暫 1 +淹 1 +休 9 +暇 4 +儲 1 +挫 1 +賛 9 +君 28 +偽 5 +控 6 +御 7 +盲 4 +黙 6 +床 5 +蔓 1 +訣 1 +垣 1 +迎 10 +趣 7 +膝 2 +彙 1 +疎 1 +卓 3 +凌 1 +& 1 +騙 1 +寸 1 +掻 1 +槍 1 +彷 1 +彿 1 +霧 2 +卵 7 +殻 3 +錆 1 +充 2 +偵 1 +勘 3 +屠 1 +誉 8 +雄 7 +哨 1 +煽 1 +堂 14 +聡 1 +迂 1 +撲 2 +恥 1 +七 3 +陵 1 +1 1 +甚 1 +疾 3 +幼 14 +峡 1 +峻 1 +坦 1 +舶 2 +浄 1 +裸 5 +彫 9 +伏 4 +虜 1 +訟 5 +皮 7 +膚 1 +撒 1 +瞬 10 +袋 2 +酷 3 +箋 1 +壌 4 +冊 1 +薦 1 +穏 2 +恍 1 +惚 1 +崇 5 +拝 8 +僚 3 +苗 3 +凝 3 +遣 4 +炎 3 +諦 1 +幻 4 +蜃 1 +楼 2 +懐 6 +憑 1 +苛 1 +郡 47 +兄 23 +駅 23 +緑 22 +ぬ 18 +僕 17 +架 15 +献 13 +悲 13 +笑 12 +脚 12 +灰 12 +抽 12 +髪 11 +却 11 +貢 10 +才 10 +魔 10 +析 10 +羊 10 +鉱 10 +乱 10 +恋 9 +帽 9 +老 9 +縁 9 +岐 9 +如 9 +奇 8 +麦 8 +即 8 +撤 8 +屈 8 +械 7 +棚 7 +邪 6 +凍 6 +五 6 +奴 6 +甲 6 +俳 6 +兼 6 +翼 6 +碑 6 +粘 6 +縫 6 +燥 6 +詰 6 +駄 5 +揚 5 +忙 5 +締 5 +稿 5 +騒 5 +樹 5 +軌 5 +慈 5 +隷 5 +挿 5 +粉 5 +ぺ 5 +謀 5 +陰 5 +尋 5 +漫 5 +爵 5 +冗 5 +挑 5 +菜 4 +豆 4 +煮 4 +飽 4 +飯 4 +璧 4 +甘 4 +祈 4 +鶏 4 +肌 4 +机 4 +跳 4 +揺 4 +六 4 +偶 4 +賢 4 +歓 4 +綿 4 +乏 4 +烈 4 +紹 4 +錬 4 +輝 4 +池 4 +刊 4 +項 4 +粋 4 +蓄 4 +昆 4 +戸 4 +脇 4 +均 4 +繋 4 +繊 4 +召 4 +芽 4 +奨 4 +肖 4 +笛 4 +飢 3 +頑 3 +姓 3 +巧 3 +踊 3 +ヶ 3 +漢 3 +宋 3 +癒 3 +恵 3 +宝 3 +唇 3 +褐 3 +軸 3 +堆 3 +軟 3 +桟 3 +訂 3 +陳 3 +伯 3 +枢 3 +棟 3 +闇 3 +砦 3 +衰 3 +幣 3 +擁 3 +紫 3 +垂 3 +鼓 3 +糸 3 +匿 3 +腐 3 +杉 3 +錠 3 +妊 3 +娠 3 +頸 3 +溶 3 +栓 3 +粧 3 +煙 3 +房 3 +擦 3 +惟 3 +叔 3 +悔 2 +誠 2 +溜 2 +汁 2 +舌 2 +嘆 2 +玄 2 +懇 2 +泉 2 +斧 2 +釘 2 +剃 2 +誓 2 +淡 2 +漂 2 +奈 2 +妖 2 +賂 2 +甥 2 +漕 2 +蹴 2 +畑 2 +肯 2 +腺 2 +茎 2 +紋 2 +愉 2 +枯 2 +芳 2 +轄 2 +閣 2 +附 2 +償 2 +恩 2 +弦 2 +礫 2 +獣 2 +頬 2 +臨 2 +奉 2 +謎 2 +瀬 2 +嬉 2 +江 2 +呂 2 +僅 2 +拐 2 +胸 2 +鐘 2 +隅 2 +縞 2 +倉 2 +茂 2 +魂 2 +窯 2 +犀 2 +醜 2 +倣 2 +蜂 2 +郎 2 +貸 2 +虹 2 +勃 2 +括 2 +挨 2 +拶 2 +惨 2 +随 2 +箔 2 +惹 2 +填 2 +奥 2 +沃 2 +瓦 2 +— 2 +拷 2 +穀 2 +敢 2 +径 2 +釣 2 +盆 2 +爬 2 +窮 2 +綺 2 +麗 2 +遥 2 +顆 2 +松 2 +栽 2 +培 2 +秦 2 +赦 2 +尉 2 +叙 2 +勲 2 +脂 2 +妥 2 +瞭 2 +汗 2 +摩 2 +槽 2 +皿 2 +胃 1 +抹 1 +ぁ 1 +眉 1 +謙 1 +敏 1 +枕 1 +漬 1 +些 1 +酬 1 +裾 1 +泌 1 +溺 1 +掴 1 +寛 1 +穫 1 +渇 1 +恨 1 +繕 1 +秋 1 +嫁 1 +愚 1 +– 1 +朽 1 +腎 1 +柳 1 +塀 1 +醇 1 +渋 1 +燕 1 +臼 1 +腸 1 +牡 1 +蠣 1 +岳 1 +丹 1 +遼 1 +寧 1 +陛 1 +蛋 1 +刃 1 +葦 1 +蛮 1 +汲 1 +鋤 1 +爽 1 +孵 1 +淘 1 +汰 1 +荊 1 +軻 1 +昌 1 +濁 1 +酢 1 +沼 1 +覇 1 +矯 1 +這 1 +脊 1 +椎 1 +蔗 1 +蒼 1 +鶯 1 +ぇ 1 +苔 1 +謖 1 +殉 1 +吉 1 +喪 1 +暢 1 +偉 1 +塹 1 +壕 1 +庵 1 +乙 1 +灌 1 +漑 1 +髭 1 +俣 1 +媽 1 +祠 1 +蜜 1 +蝋 1 +拍 1 +蔦 1 +茅 1 +葺 1 +忠 1 +彰 1 +遷 1 +軋 1 +憎 1 +繞 1 +閑 1 +沸 1 +胴 1 +堀 1 +罹 1 +廟 1 +耗 1 +憩 1 +厄 1 +蘇 1 +戴 1 +畜 1 +鳩 1 +竹 1 +蹄 1 +冶 1 +娼 1 +熾 1 +崔 1 +嗅 1 +膾 1 +炙 1 +橄 1 +欖 1 +剥 1 +繫 1 +〜 1 +虎 1 +膿 1 +憧 1 +其 1 +篤 1 +躁 1 +鬱 1 +痩 1 +煉 1 +偏 1 +瀕 1 +餓 1 +橙 1 +丁 1 +龍 1 +韻 1 +酵 1 +斑 1 +勅 1 +膀 1 +胱 1 +零 1 +鞭 1 +堕 1 +梨 1 +昏 1 +尻 1 +辱 1 +絨 1 +鷹 1 +毅 1 +痕 1 +硝 1 +讃 1 +拭 1 +羅 1 +磨 1 +噂 1 +坂 1 +餐 1 +葛 1 +滴 1 +鎮 1 +扮 1 +貌 1 +寡 1 +碁 1 +雌 1 +銘 1 +呈 1 +膳 1 +董 1 +采 1 +藍 1 +舟 1 +肪 1 +貞 1 +蛾 1 +牝 1 +茹 1 +呆 1 +慌 1 +厨 1 +騰 1 +叱 1 +晒 1 +愕 1 +ヅ 1 +缶 1 +曳 1 +饉 1 +糾 1 +冑 1 +恰 1 +鴨 1 +弩 1 +往 1 +楕 1 +辣 1 +膏 1 +弧 1 +股 1 +逐 1 +蝶 1 +臭 1 +汎 1 +刀 1 +杖 1 +又 1 +伐 1 +后 1 +肘 1 +殴 1 +貪 1 +酋 1 +贖 1 +蛙 1 +只 1 +膣 1 +粗 1 +虚 1 +宜 1 +錨 1 +閥 1 +喩 1 +絆 1 +駈 1 +綜 1 +斯 1 +汝 1 +丈 1 +尤 1 +朦 1 +朧 1 +或 1 +寵 1 +惜 1 +ぴ 1 +跨 1 +劣 1 +ゅ 1 +怨 1 +瞑 1 +澄 1 diff --git a/mms-1b-all/kam/lexicon.txt b/mms-1b-all/kam/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..920271ee2de68c211d096f69779c0f73692e1b80 --- /dev/null +++ b/mms-1b-all/kam/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/kam/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/kam/tokens.txt b/mms-1b-all/kam/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..3810a4fb78bf3a9faaaa4dfc004f7b24cafbbd43 --- /dev/null +++ b/mms-1b-all/kam/tokens.txt @@ -0,0 +1,64 @@ +| 68245 +a 59691 +i 35038 +n 32449 +k 20657 +e 19538 +u 19059 +m 17565 +t 16822 +w 15018 +y 14378 +o 12644 +s 11881 +l 10296 +ĩ 8673 +h 6855 +ũ 6710 +d 5968 +v 5897 +g 4754 +b 3010 +r 2672 +' 1379 +c 1259 +p 1108 +z 1004 +0 779 +1 552 +f 476 +î 404 +2 334 +j 298 +û 292 +9 248 +5 207 +4 198 +8 188 +3 182 +6 175 +. 162 +- 134 +7 113 +x 103 +, 84 +í 66 +q 59 +ú 52 +” 50 +: 40 +/ 33 +$ 11 +% 9 +ü 9 +; 8 +~ 6 ++ 5 +ì 3 +ḉ 3 +£ 3 +& 2 +° 2 +ã 2 +á 2 +! 1 diff --git a/mms-1b-all/kan/lexicon.txt b/mms-1b-all/kan/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..ba607edcdfb4974cc3d7064b3cb55c832f920b46 --- /dev/null +++ b/mms-1b-all/kan/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/kan/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/kan/tokens.txt b/mms-1b-all/kan/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..367a98e2a752b412c0bb81f16ec0c93a06ea3fad --- /dev/null +++ b/mms-1b-all/kan/tokens.txt @@ -0,0 +1,121 @@ +| 291946 +್ 184677 +ಿ 169455 +ು 174771 +ರ 122610 +ಾ 90743 +ದ 137160 +ನ 181433 +ತ 89035 +ೆ 90875 +ಗ 76054 +ವ 95579 +ಲ 75534 +ಕ 69501 +ಸ 56357 +ಯ 57531 +ಮ 54945 +ಂ 53337 +ಳ 42162 +ಪ 29542 +ಡ 29325 +ಹ 29780 +ಟ 17621 +ಬ 34579 +ೇ 45508 +ಅ 22469 +ೊ 22572 +ಚ 6223 +ೀ 16184 +ೂ 20527 +ೋ 16809 +ಜ 11777 +ಷ 9096 +ಣ 6612 +ಶ 8326 +ಆ 17583 +ಧ 8367 +‌ 1116 +ಎ 8608 +ಇ 8056 +ಭ 5862 +ಥ 3878 +ೈ 1863 +ಒ 4958 +0 85 +ಫ 1286 +ಉ 2493 +ಖ 1183 +1 153 +ಈ 3983 +2 61 +ೃ 1944 +ೌ 1701 +. 198 +9 31 +ಏ 1816 +5 44 +ಘ 231 +4 63 +- 36 +6 40 +3 45 +8 34 +7 19 +ಐ 328 +, 80 +‍ 7 +ಠ 384 +ಞ 1293 +ಓ 272 +a 66 +i 58 +c 44 +o 44 +t 44 +: 43 +m 42 +ಃ 298 +r 40 +s 37 +p 36 +/ 35 +e 30 +n 28 +u 28 +d 27 +ಔ 61 +" 26 +ಊ 663 +l 24 +ಛ 150 +b 20 +v 19 +' 144 +ಢ 225 +g 12 +ಋ 4 +ಝ 23 +h 10 +w 10 +; 9 +$ 9 +% 8 +f 7 +k 7 +x 7 +! 6 +y 6 +​ 4 +z 3 +° 3 +? 2 +õ 2 ++ 2 +– 2 +q 2 +j 2 +² 1 +೪ 1 +[ 1 +] 1 diff --git a/mms-1b-all/kat/lexicon.txt b/mms-1b-all/kat/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..917e6199f67318ec6f4c3ced9fab0d608b6924d5 --- /dev/null +++ b/mms-1b-all/kat/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/kat/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/kat/tokens.txt b/mms-1b-all/kat/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecc5062ad6714d39166cff5850cce1b39db4548c --- /dev/null +++ b/mms-1b-all/kat/tokens.txt @@ -0,0 +1,89 @@ +| 12329 +ა 12561 +ი 11584 +ე 8528 +ს 6429 +რ 5610 +ო 4737 +მ 4347 +ნ 3660 +დ 3743 +ლ 4352 +ბ 3438 +ვ 2902 +უ 2627 +თ 2383 +გ 1855 +შ 1613 +ტ 1589 +ხ 1306 +ც 972 +კ 1423 +ზ 796 +ქ 956 +წ 723 +პ 714 +ფ 724 +ყ 578 +ძ 315 +ღ 437 +ჩ 312 +- 140 +ჯ 210 +0 317 +1 228 +ჰ 112 +ჭ 98 +2 158 +9 110 +a 108 +4 94 +5 87 +n 86 +6 81 +o 77 +3 76 +e 75 +i 71 +8 67 +s 66 +ჟ 63 +r 58 +c 53 +7 51 +t 49 +l 48 +. 41 +p 37 +d 35 +g 33 +, 32 +m 27 +/ 27 +h 25 +: 23 +u 22 +v 22 +b 21 +k 16 +f 15 +— 10 +w 10 +y 10 +x 8 +; 7 +j 6 +% 5 +õ 4 +q 3 +z 3 +' 2 ++ 1 +° 1 +ú 1 +& 1 +[ 1 +] 1 +! 1 +² 1 +– 4 diff --git a/mms-1b-all/kaz/lexicon.txt b/mms-1b-all/kaz/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..f0597d2888f252fe365dbb71abd542329e2b444e --- /dev/null +++ b/mms-1b-all/kaz/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/kaz/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/kaz/tokens.txt b/mms-1b-all/kaz/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..26f6ffb960205851b9f519c4e86bf7c06e315def --- /dev/null +++ b/mms-1b-all/kaz/tokens.txt @@ -0,0 +1,104 @@ +| 115008 +а 85815 +е 57698 +н 49515 +ы 48634 +т 33366 +р 39736 +і 46991 +л 33295 +д 39605 +с 28256 +к 15665 +о 16018 +м 21355 +қ 21876 +б 18267 +у 8955 +и 7357 +ж 9655 +ғ 10962 +й 14963 +п 14126 +ң 12978 +г 7667 +з 10574 +ш 8738 +ү 6720 +ә 6751 +ө 6667 +ұ 8267 +я 1699 +в 20 +ф 152 +- 1170 +х 2580 +0 734 +ц 1 +1 569 +ь 3 +a 2 +э 3 +e 364 +2 354 +n 322 +o 316 +t 306 +r 303 +i 1 +« 288 +» 288 +s 276 +9 259 +5 218 +4 202 +l 189 +. 185 +c 2 +3 180 +8 171 +6 170 +ю 61 +ч 157 +h 142 +p 129 +d 125 +7 124 +u 111 +g 103 +m 99 +b 85 +: 76 +k 69 +f 61 +w 58 +y 55 +/ 51 +v 48 +, 34 +— 2083 +ъ 22 +x 22 +j 20 +z 16 +һ 330 +' 13 +% 9 +q 7 +; 7 +! 5 +& 4 +– 4 ++ 4 +щ 6 +° 4 +№ 3 +[ 2 +] 2 +$ 2 +ú 2 +? 2 +í 2 +ó 2 +õ 2 +ё 1 diff --git a/mms-1b-all/kea/lexicon.txt b/mms-1b-all/kea/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..e18744c4086c42081c2015907785e01e3abc9f65 --- /dev/null +++ b/mms-1b-all/kea/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/kea/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/kea/tokens.txt b/mms-1b-all/kea/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..256e0e9a0185d97493dca2b303bb1eaf2cabbe05 --- /dev/null +++ b/mms-1b-all/kea/tokens.txt @@ -0,0 +1,71 @@ +| 58435 +a 34368 +i 31237 +n 20962 +u 20245 +s 19384 +t 16447 +e 15859 +d 15595 +r 15433 +o 11847 +k 10407 +l 9178 +p 8364 +m 7733 +b 4081 +f 3277 +v 3241 +g 2971 +é 2817 +z 2412 +j 2273 +y 1699 +h 1586 +á 1379 +x 992 +í 891 +ô 714 +0 603 +ó 599 +ê 514 +c 508 +- 482 +1 432 +ú 290 +2 282 +w 246 +9 200 +5 169 +4 168 +. 164 +3 162 +6 137 +8 129 +7 91 +â 85 +: 38 +/ 36 +, 36 +q 36 +' 28 +” 28 +º 18 +ü 10 +% 9 +ã 7 +; 6 +õ 4 ++ 3 +[ 3 +] 3 +¥ 3 +$ 2 +ª 2 +& 2 +° 2 +ł 2 +ç 2 +£ 1 +² 1 +̇ 1 diff --git a/mms-1b-all/khm/lexicon.txt b/mms-1b-all/khm/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/mms-1b-all/khm/lexicon.txt @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/mms-1b-all/khm/tokens.txt b/mms-1b-all/khm/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..729784b0cf5c93ead77f35258fc6d144c779c859 --- /dev/null +++ b/mms-1b-all/khm/tokens.txt @@ -0,0 +1,148 @@ +ា 58553 +្ 72701 +ន 47710 +រ 44483 +ក 31623 +ប 34365 +ស 30594 +ម 25747 +ង 28398 +ល 22336 +ត 17740 +់ 20638 +ទ 17073 +យ 21942 +ដ 18603 +ិ 14099 +ុ 15638 +េ 12961 +ព 20357 +ំ 15700 +ច 14524 +ី 9712 +វ 8897 +ើ 15075 +ែ 10204 +ជ 11911 +គ 13141 +ូ 13278 +អ 19443 +ោ 14996 +ះ 17146 +ៅ 6299 +ួ 6075 +ហ 5203 +ណ 4996 +ញ 7779 +ធ 3082 +ខ 6557 +ថ 4566 +ផ 2542 +a 1253 +ភ 1718 +៉ 2924 +ឹ 4588 +​ 857 +e 852 +ៃ 1294 +r 791 +o 782 +n 778 +ឡ 2717 +i 699 +ឺ 1152 +s 647 +័ 1383 +៊ 1585 +l 507 +t 505 +៍ 917 +ឆ 767 +ៀ 2178 +៏ 2232 +0 374 +h 372 +c 361 +u 326 +m 308 +d 302 +ឋ 231 +1 1 +g 300 +p 257 +ឈ 589 +ឱ 115 +ៈ 641 +ៗ 224 +ឬ 444 +b 192 +k 188 +ឌ 143 +2 178 +ឃ 1185 +y 150 +ឿ 1026 +9 138 +w 121 +f 116 +។ 113 +5 109 +៌ 214 +v 104 +6 102 +4 100 +ឯ 762 +8 92 +ឧ 94 +3 88 +j 74 +ឥ 352 +ឲ 67 +7 61 +z 57 +- 8 +. 42 +x 40 +, 39 +q 1 +ឍ 22 +៖ 19 +' 19 +៧ 15 +/ 11 +” 10 +១ 10 +" 10 +% 10 +០ 9 +: 9 +ü 8 +ឪ 97 +« 7 +» 7 +ç 4 +$ 4 +á 4 +២ 3 +ឮ 271 +— 3 +& 2 +; 2 +ឫ 387 +ឦ 2 +í 2 +ឳ 2 +õ 2 +៎ 26 +! 1 +? 1 +é 1 +° 1 +៥ 1 +៣ 1 +[ 1 +] 1 +² 1 +£ 1 +ú 1 +ឭ 1 diff --git a/mms-1b-all/kir/lexicon.txt b/mms-1b-all/kir/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..1ab6d339a378b5fb7c7680c36a1e24b088daa3e0 --- /dev/null +++ b/mms-1b-all/kir/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/kir/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/kir/tokens.txt b/mms-1b-all/kir/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..edc89b9814e7e4db55db29242bd3d5cd23aa076b --- /dev/null +++ b/mms-1b-all/kir/tokens.txt @@ -0,0 +1,97 @@ +| 109564 +а 86924 +н 51156 +р 35974 +т 29877 +к 37405 +л 31793 +е 36393 +ы 40069 +и 30681 +у 25825 +д 33013 +о 19979 +ү 17899 +г 19025 +б 21864 +м 17930 +с 15737 +ө 15579 +ж 9800 +ш 12679 +п 12895 +й 15179 +ч 7819 +з 8869 +э 6618 +я 2149 +ң 4686 +в 64 +- 889 +ф 240 +0 688 +ю 599 +ц 2 +1 525 +х 456 +2 320 +ь 4 +a 231 +9 228 +e 197 +5 189 +o 188 +n 186 +4 163 +i 162 +6 157 +r 156 +3 155 +t 152 +s 149 +8 146 +. 122 +c 115 +7 108 +l 103 +p 85 +g 74 +u 74 +d 71 +h 65 +m 56 +: 53 +b 48 +f 42 +k 33 +/ 32 +, 32 +' 30 +ё 149 +— 162 +v 28 +ъ 2 +y 27 +w 24 +– 2110 +’ 17 +% 16 +j 10 +" 7 +x 7 +q 6 +z 6 +õ 6 +¥ 6 ++ 5 +№ 5 +; 4 +° 4 +щ 1 +́ 2 +& 2 +² 2 +* 2 +[ 2 +] 2 +ú 1 diff --git a/mms-1b-all/kor/lexicon.txt b/mms-1b-all/kor/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..3604fa477df078da31cfa2a5404b379124266d6a --- /dev/null +++ b/mms-1b-all/kor/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/kor/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/kor/tokens.txt b/mms-1b-all/kor/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..f69876901a982e9d3b3e7b1283c6d8a4accbfa85 --- /dev/null +++ b/mms-1b-all/kor/tokens.txt @@ -0,0 +1,1326 @@ +| 584834 +다 60609 +이 59603 +는 45737 +에 43670 +의 25035 +을 45850 +니 34020 +하 57696 +은 26444 +로 22430 +가 31488 +지 25619 +고 34905 +기 16399 +서 34247 +한 12817 +사 28988 +있 12770 +를 20313 +습 13731 +리 25588 +스 6379 +인 5233 +수 10984 +도 20058 +들 30878 +어 15022 +해 6214 +아 20955 +으 14100 +시 16140 +대 8289 +자 16628 +일 8405 +그 44709 +나 33677 +" 610 +라 15272 +전 3553 +적 1162 +했 836 +부 4349 +보 8097 +것 21030 +과 9021 +0 58 +상 3051 +만 4776 +1 97 +위 3095 +주 12600 +게 25505 +정 2370 +국 204 +되 4889 +제 5891 +었 6124 +장 2749 +여 19787 +성 5216 +동 2177 +구 3561 +a 1 +트 275 +할 4035 +와 7198 +공 439 +비 2256 +문 3146 +소 2931 +우 9978 +입 8650 +미 1667 +원 2411 +경 1671 +면 6307 +화 1141 +유 2730 +요 3082 +세 4499 +오 8358 +며 3365 +르 3083 +2 37 +드 3046 +마 7012 +거 4726 +중 1131 +합 3288 +물 3235 +러 18780 +치 2406 +관 1044 +발 827 +. 259 +행 2032 +분 12631 +야 3662 +명 2351 +년 232 +계 2337 +된 1429 +신 8617 +개 827 +안 4122 +e 230 +모 7328 +용 826 +방 1780 +려 7471 +통 939 +생 3196 +않 5845 +o 224 +조 1456 +간 1442 +i 221 +s 219 +때 5967 +진 1952 +n 214 +프 106 +역 355 +람 19620 +선 1874 +식 1673 +운 3215 +터 1358 +r 193 +영 2432 +무 5145 +당 2671 +더 3056 +후 471 +연 379 +9 185 +크 232 +반 606 +작 719 +내 10354 +바 6140 +체 826 +t 1 +차 929 +포 717 +두 4971 +른 1783 +교 1622 +양 1006 +3 38 +5 12 +학 344 +m 161 +많 1351 +종 1853 +법 1798 +타 1328 +데 6323 +4 53 +히 2708 +록 1533 +파 1038 +함 3067 +회 1742 +l 141 +음 6586 +받 5762 +불 2122 +6 13 +재 750 +호 333 +키 878 +말 14013 +실 3162 +질 803 +결 820 +건 789 +단 869 +없 4449 +또 2729 +알 2561 +점 233 +카 59 +월 177 +항 206 +속 2669 +태 482 +예 9544 +각 2084 +8 4 +력 762 +루 1849 +c 1 +d 122 +p 121 +표 336 +현 87 +달 1235 +따 1529 +약 1136 +최 12 +산 679 +번 699 +남 1543 +저 1738 +레 511 +매 1414 +았 3017 +피 977 +능 771 +h 111 +올 1152 +출 149 +언 2369 +u 107 +처 1269 +형 1664 +노 710 +강 400 +든 5013 +험 317 +버 2676 +독 509 +군 549 +디 1394 +던 1344 +코 141 +향 364 +쪽 331 +직 915 +난 1512 +였 7864 +토 192 +류 135 +같 2807 +금 1875 +심 2933 +7 11 +추 546 +열 1507 +및 20 +린 896 +필 330 +격 431 +래 2063 +증 928 +특 141 +족 672 +본 665 +티 75 +까 4631 +져 1279 +브 708 +업 152 +급 134 +g 81 +근 284 +변 298 +목 627 +민 475 +랜 22 +k 79 +졌 614 +복 1755 +배 1454 +초 317 +접 402 +몇 477 +감 1144 +준 566 +확 394 +새 1177 +투 165 +베 1621 +즈 13 +광 1238 +너 5648 +페 34 +석 427 +집 1698 +등 195 +존 287 +설 287 +케 111 +북 37 +네 2167 +날 2076 +' 2495 +움 597 +름 1421 +임 738 +론 287 +될 950 +왕 676 +곳 743 +늘 2124 +랑 2323 +온 1847 +폭 102 +망 1059 +편 1126 +승 559 +침 578 +판 1168 +탄 310 +립 466 +견 334 +활 472 +메 202 +테 68 +돌 1760 +절 829 +림 277 +란 440 +울 2512 +럽 279 +b 56 +환 511 +외 367 +워 1492 +품 342 +찰 12 +료 189 +천 1637 +겨 1040 +손 1654 +박 552 +v 51 +됩 403 +텔 50 +허 367 +령 2014 +술 227 +런 2657 +, 49 +련 433 +별 380 +착 167 +패 122 +악 1053 +순 593 +객 6 +째 682 +청 572 +취 150 +럼 503 +길 1109 +살 3700 +쟁 256 +y 45 +탈 72 +떤 1011 +- 44 +큰 1084 +갈 968 +권 1191 +눈 789 +플 10 +팀 44 +평 685 +병 686 +량 195 +막 618 +책 385 +츠 4 +응 42 +f 42 +긴 188 +충 281 +볼 360 +션 42 +걸 478 +풍 325 +뉴 41 +픽 41 +창 291 +좋 842 +왔 1036 +백 1220 +쓰 512 +염 108 +머 1167 +참 913 +득 321 +밝 117 +애 447 +극 147 +웨 117 +골 111 +커 34 +채 376 +못 2643 +택 256 +잡 859 +누 2269 +십 4791 +쉽 32 +떨 496 +친 571 +섬 534 +렌 34 +쇼 34 +육 641 +릴 769 +숙 112 +잘 967 +락 346 +께 12034 +완 433 +혀 558 +떠 1158 +잠 330 +획 128 +슬 390 +념 84 +높 366 +끝 410 +엔 7 +첫 273 +황 311 +캐 15 +w 30 +쿠 71 +암 453 +헤 430 +깨 868 +희 4169 +철 56 +녀 1393 +흔 252 +얼 610 +귀 1166 +센 74 +슷 6 +킬 74 +앞 1535 +규 75 +축 186 +줄 803 +측 50 +낮 258 +밤 344 +혹 282 +송 158 +범 182 +클 6 +웹 26 +례 1160 +협 81 +검 56 +뒤 812 +빠 386 +넘 705 +렸 662 +엘 805 +났 740 +얻 724 +먼 618 +찾 787 +겠 2652 +글 163 +륙 22 +핑 14 +팔 290 +퍼 227 +억 306 +끼 299 +/ 23 +밀 391 +턴 23 +킹 23 +벨 114 +흐 19 +블 28 +쳐 824 +셀 16 +먹 1212 +갖 414 +슨 519 +j 22 +즉 20 +씨 234 +농 87 +폴 22 +혐 4 +즐 169 +느 1459 +혼 517 +므 2181 +캠 21 +푸 151 +왜 230 +총 410 +략 24 +닌 153 +끄 258 +탐 158 +뜻 737 +뿐 497 +렴 2 +맞 605 +굴 422 +켓 20 +빛 416 +렇 2583 +x 19 +풀 459 +슈 19 +넷 45 +겁 104 +훨 37 +씬 33 +죄 2045 +답 944 +칠 260 +혔 117 +둘 572 +톱 13 +색 88 +담 426 +효 78 +링 18 +덤 220 +널 46 +괴 196 +맹 294 +휴 7 +몰 233 +뜨 170 +템 17 +벌 532 +% 17 +핵 17 +익 224 +폐 84 +냥 48 +혈 33 +빙 4 +퓨 16 +끌 483 +싱 16 +찬 685 +척 172 +압 212 +힌 155 +냈 400 +z 16 +홀 94 +층 29 +곡 157 +펜 15 +컴 15 +낼 143 +율 928 +덮 106 +싸 401 +률 462 +붙 369 +q 14 +맥 14 +곱 608 +엄 212 +덴 12 +멀 253 +릅 116 +몸 1256 +읽 158 +흑 17 +됐 14 +앙 114 +럭 14 +잔 333 +꼭 92 +벽 95 +융 13 +램 13 +멘 182 +닥 142 +즌 13 +컨 13 +텐 4 +밴 18 +냐 1475 +벗 282 +액 12 +롭 467 +~ 12 +촉 14 +룩 732 +겼 211 +혁 4 +숨 379 +쇄 12 +꺼 120 +죽 2742 +줍 81 +랐 248 +옮 64 +틀 82 +룹 4 +짧 8 +벤 16 +닝 11 +뮤 11 +칙 34 +깥 90 +렵 148 +챔 11 +랍 386 +칼 170 +랙 11 +칸 22 +힘 708 +켜 505 +봉 193 +값 109 +몽 19 +꾸 350 +둔 156 +옵 43 +폰 10 +딩 2 +롬 10 +떻 691 +홍 101 +욕 624 +쳤 360 +탕 108 +롤 6 +곤 52 +덕 138 +렉 50 +롯 342 +휘 76 +콰 10 +힐 56 +킨 22 +놓 947 +덜 37 +켰 110 +납 118 +밑 28 +웃 191 +샤 4 +콜 9 +흥 15 +셔 910 +섯 225 +얇 9 +딘 12 +틴 9 +쉬 119 +톨 9 +깃 32 +써 772 +빨 56 +냅 72 +줌 9 +콘 9 +뛰 110 +털 71 +녁 37 +닙 676 +균 9 +넣 123 +릭 46 +뤄 8 +땅 1381 +덩 43 +탱 12 +훈 207 +탑 3 +님 13602 +돈 346 +징 312 +큼 106 +갑 212 +싼 14 +캔 8 +빌 563 +몬 304 +잎 36 +팬 8 +셈 112 +칭 140 +륜 14 +캄 7 +넓 33 +찍 195 +짜 59 +탁 120 +뿌 282 +봐 19 +갔 1067 +낭 14 +윈 7 +삭 177 +잃 280 +밖 640 +겪 201 +뢰 94 +횡 7 +멸 380 +잇 20 +? 7 +맨 77 +짓 879 +텍 7 +멤 7 +숫 24 +낸 279 +뇌 7 +흩 66 +랫 36 +쓸 191 +끈 29 +곰 28 +쥐 25 +퇴 2 +궁 212 +깝 10 +논 115 +훌 96 +륭 96 +냄 17 +펴 132 +걱 137 +듯 330 +톤 6 +칩 50 +잉 23 +셰 6 +펼 16 +뱅 17 +젠 8 +뜰 38 +쌓 79 +좀 106 +옷 503 +델 12 +낙 82 +묻 239 +놀 422 +춘 6 +븐 6 +둥 158 +싶 292 +옛 114 +붕 29 +윤 6 +끔 29 +돼 46 +럴 114 +펠 5 +묵 100 +겐 7 +” 5 +흡 10 +쇠 111 +콩 5 +묘 16 +닿 44 +옆 22 +뀌 5 +깅 5 +앨 13 +춰 5 +욱 326 +픈 18 +괜 14 +찮 28 +헌 75 +햇 8 +핀 3 +졸 62 +꿈 38 +튜 5 +: 5 +뇨 4 +빅 5 +뉩 5 +뷰 5 +팽 18 +꼽 5 +팡 26 +뮬 4 +왈 4 +깊 100 +젤 4 +젝 4 +삽 15 +탠 4 +헝 4 +맷 24 +짝 82 +섭 16 +혜 1206 +빈 78 +앉 770 +렀 366 +좌 325 +밍 4 +걷 67 +돕 34 +쫒 4 +랩 4 +촬 4 +딜 4 +헬 5 +왼 64 +덧 58 +밸 4 +쨍 4 +뱀 79 +챙 11 +룸 4 +슴 97 +죠 4 +닫 273 +꽤 4 +쁜 302 +뭇 31 +늄 4 +쌍 94 +렘 757 +엽 12 +​ 4 +젼 4 +õ 4 +뜬 8 +듭 34 +헥 3 +낄 3 +콧 3 +℃ 3 +옥 584 +쾰 3 +ü 3 +굽 35 +덫 11 +좁 20 +싣 3 +썰 3 +멋 14 +멜 71 +솔 74 +멍 43 +퀴 15 +녹 54 +엠 6 +캡 3 +슐 3 +똑 217 +큄 3 +뀔 3 +깜 31 +믿 2763 +겔 12 +! 3 +훼 10 +컬 19 +틈 36 +띄 10 +튼 152 +팸 3 +짐 584 +놈 15 +댓 3 +텀 3 +쯤 113 +벡 3 +갇 200 +캘 3 +얏 3 +삶 48 +룽 3 +괄 2 +롱 82 +옹 14 +꽃 71 +룡 23 +빗 7 +긍 23 +꾼 253 +줬 3 +헨 1 +死 3 +因 3 +뽑 184 +벼 25 +깔 27 +삼 472 +빼 229 +듬 28 +딴 50 +밭 180 +깎 26 +뭅 4 +섰 64 +맛 84 +랬 76 +맡 442 +쩌 10 +껏 9 +훔 31 +걀 1 +젯 2 +쌀 8 +햄 2 +펀 2 +봄 6 +쏘 7 +긋 26 +첨 16 +돗 2 +$ 2 +꿨 2 +뼈 22 +겹 15 +췄 2 +묶 96 +혓 2 +렛 131 +뭔 2 +똥 2 +딛 4 +– 2 +젖 61 +칫 2 +뾰 2 +잊 58 +[ 2 +] 2 +곧 1222 +꼈 4 +샌 2 +쿄 2 +릎 85 +씩 89 +뻔 14 +빡 2 +셨 5452 +웰 2 +뒷 2 +앗 198 +삐 4 +웠 182 +쉴 31 +잭 2 +앱 2 +ú 2 +눌 44 +뷴 2 +잦 4 +죔 2 +샛 12 +윙 2 +킵 49 +렬 57 +° 2 +젹 2 +쁩 2 +밋 2 +휠 2 +볍 20 +멈 42 +넛 2 +둡 27 +릉 2 +쥔 2 +꼿 2 +닷 56 +렝 2 +띠 78 +섞 38 +댄 25 +팟 2 +퉁 57 +겉 144 +숭 99 +랄 64 +뚫 52 +썩 165 +낡 66 +릿 12 +떼 243 +컷 4 +딸 151 +씀 4097 +렙 9 +핌 4 +닉 10 +폼 2 +톰 2 +쏟 78 +덟 28 +뻗 26 +냉 4 +넙 2 +녕 34 +얄 2 +콥 2 +벳 98 +쿼 2 +칵 2 +맵 2 +빵 259 +몹 162 +웅 10 +쁘 248 +ç 1 +뚜 11 +렷 1 +슘 1 +흗 11 +얘 16 +윌 1 +흰 83 +샘 30 +봅 58 +됨 22 +쿡 1 +맺 270 +엇 870 +꿉 1 +뭐 1 +밥 20 +컵 1 +² 1 +늪 1 +핍 75 +헷 1 +핼 1 +팩 1 +댈 13 +넨 1 +륨 1 +갱 1 +갤 1 +툼 29 +닐 59 +랭 2 +낌 60 +섹 32 +샵 1 +붐 1 +쌉 1 +푯 1 +앵 1 +옌 1 +꿔 1 +꼴 32 +á 1 +뤼 1 +듀 1 +* 1 +믹 1 +툴 1 +쫓 273 +렁 33 +굳 320 +낀 1 +틱 1 +딱 9 +흘 308 +텨 1 +헙 1 +멧 1 +뚝 1 +솟 8 +읍 33 +겸 65 +쿨 1 +꼬 47 +끊 96 +턱 2 +팅 1 +젊 109 +춥 1 +룻 34 +룰 11 +셉 202 +뀐 1 +됭 1 +샬 1 +굉 13 +듣 1240 +뻐 457 +옳 420 +빕 416 +찌 358 +윗 342 +끗 340 +낳 261 +쁨 226 +릇 226 +갚 223 +랴 199 +곁 197 +짖 189 +낫 187 +엎 184 +헛 180 +씁 167 +둠 149 +빚 149 +흠 133 +헐 124 +셋 123 +씻 117 +얹 105 +닭 103 +썼 96 +튿 95 +둑 91 +김 87 +삯 86 +넉 84 +꿇 81 +깁 79 +몫 74 +픔 74 +쑤 74 +뜯 72 +쓴 72 +굶 72 +욥 70 +밟 70 +뉘 66 +돋 66 +낱 65 +덱 63 +앓 63 +뿔 60 +춤 59 +넬 52 +쪼 52 +돔 50 +맏 49 +홉 48 +뵙 46 +꾀 44 +씌 44 +휩 43 +닦 42 +찢 41 +녔 41 +떳 40 +헴 39 +맙 39 +뜸 36 +갓 35 +엿 34 +얽 34 +맑 33 +좇 33 +흉 32 +푼 32 +탓 32 +흙 30 +웁 28 +닢 28 +늙 28 +궤 28 +벙 28 +짠 27 +쾌 27 +귐 26 +쩔 25 +찼 25 +싹 24 +뺨 24 +눕 24 +럿 24 +쩍 24 +렐 23 +뗄 23 +늦 23 +샀 22 +샅 22 +깐 22 +촛 22 +닻 22 +꺾 22 +떡 22 +댔 22 +휼 21 +낚 20 +뚤 20 +붉 20 +꿰 20 +짊 20 +룟 20 +뵈 20 +뱃 19 +뭍 19 +뱉 19 +뎅 19 +뽐 18 +앎 18 +킴 18 +뜩 17 +엾 16 +룬 16 +엉 16 +꿀 15 +붓 15 +쏠 14 +싫 14 +딪 13 +뿜 13 +쌌 13 +탉 12 +횃 12 +겅 12 +쑥 12 +찔 12 +쌈 11 +엮 11 +밧 10 +뵐 10 +얀 10 +낯 10 +쭐 10 +탔 9 +놋 9 +짚 9 +궂 9 +쨌 9 +텅 9 +얕 8 +껴 8 +맸 8 +겟 8 +넜 8 +겜 8 +잿 8 +떱 8 +꾐 8 +큽 8 +잣 8 +뽕 8 +눔 8 +촌 7 +얌 7 +춧 7 +맘 6 +즙 6 +켈 6 +쬐 6 +덥 6 +갸 6 +숲 6 +떴 6 +꽹 6 +읊 6 +홰 6 +띤 6 +뻤 6 +빤 5 +귈 5 +밈 5 +댁 5 +굵 4 +뭉 4 +숯 4 +챈 4 +옭 4 +솜 4 +팎 4 +몄 4 +쭈 4 +멱 4 +짤 4 +갗 4 +딥 4 +깍 4 +갯 4 +핥 4 +댑 4 +잖 4 +쉰 4 +닮 4 +밉 4 +찐 4 +껄 4 +묽 3 +꽂 3 +쭉 3 +멎 3 +곽 2 +눠 2 +닛 2 +툰 2 +뺀 2 +슭 2 +츰 2 +뢸 2 +챘 2 +첩 2 +넌 2 +늉 2 +뚱 2 +젓 2 +텁 2 +탬 2 +빔 2 +팠 2 +뭄 2 +썽 2 +옴 2 +껑 2 +쩡 2 +훗 2 +륵 2 +땋 2 +뒹 1 +폈 1 +낟 1 +짙 1 +귓 1 +쉼 1 +웬 1 +벅 1 +돛 1 diff --git a/mms-1b-all/lao/lexicon.txt b/mms-1b-all/lao/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/mms-1b-all/lao/lexicon.txt @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/mms-1b-all/lao/tokens.txt b/mms-1b-all/lao/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a216e722a65a92b904b8f8247f1c1c8233ac6ac --- /dev/null +++ b/mms-1b-all/lao/tokens.txt @@ -0,0 +1,119 @@ +າ 61245 +ນ 52745 +່ 40573 +ກ 30964 +ງ 31950 +ດ 26184 +້ 42458 +ັ 28408 +ເ 39003 +ອ 27344 +ະ 24909 +ີ 16114 +ວ 26914 +ມ 18938 +ທ 13732 +ສ 11141 +ລ 16814 +ບ 13906 +ົ 31545 +ຍ 13599 +ແ 13008 +ປ 10231 +ຂ 11252 +ິ 11134 +ຫ 15602 +ຕ 10213 +ໃ 10159 +ື 9657 +ຄ 10728 +ຈ 16902 +ໄ 9218 +ພ 20320 +ຊ 9663 +ຮ 9210 +ູ 8866 +ໍ 13000 +a 1349 +ຸ 3995 +ຳ 1200 +ຼ 72 +ຖ 4033 +ໂ 4339 +ຢ 5692 +e 911 +n 903 +ຽ 1938 +r 848 +ໜ 847 +o 809 +i 754 +ຶ 3352 +ຜ 4658 +s 640 +l 588 +t 576 +h 429 +0 417 +ໝ 399 +c 366 +u 353 +d 340 +g 291 +m 290 +1 288 +ໆ 283 +ຣ 9510 +ຟ 872 +p 244 +b 236 +k 198 +2 171 +y 164 +w 136 +໌ 134 +ຝ 714 +f 123 +9 110 +5 107 +4 105 +j 103 +8 101 +v 101 +3 92 +6 86 +z 70 +. 70 +- 3 +7 61 +, 58 +x 48 +: 30 +q 20 +” 19 +/ 18 +' 69 +% 7 +; 7 +¥ 6 +໊ 5 +á 5 +ü 3 +í 3 +ç 2 +ú 2 +— 2 +[ 2 +] 2 +​ 2 +& 2 +é 2 +‘ 2 +’ 2 +! 2 ++ 2 +? 1 +– 1 +ã 1 +໋ 1 +ó 1 diff --git a/mms-1b-all/lav/lexicon.txt b/mms-1b-all/lav/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..22ca7dd49d1919c0497d6d31eb4fabcd0213481c --- /dev/null +++ b/mms-1b-all/lav/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/lav/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/lav/tokens.txt b/mms-1b-all/lav/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..25b1fe43d803af4b556ed66203bb50a604a9eb85 --- /dev/null +++ b/mms-1b-all/lav/tokens.txt @@ -0,0 +1,71 @@ +| 216902 +a 115140 +i 106949 +s 92365 +t 63812 +e 69276 +r 45852 +u 64793 +n 49816 +ā 40199 +k 36660 +o 24174 +m 39106 +l 27820 +d 37884 +p 27135 +v 38227 +j 28953 +z 20442 +ī 23287 +ē 22088 +b 19951 +g 14637 +c 13337 +š 12189 +ū 9489 +ņ 11168 +ļ 5476 +f 506 +ž 1747 +0 490 +1 364 +ģ 846 +. 338 +h 616 +ķ 1244 +č 369 +2 206 +5 142 +9 136 +6 114 +3 111 +4 103 +8 93 +7 76 +- 2 +y 40 +: 38 +/ 33 +x 2 +w 25 +, 22 +' 3 +% 17 +” 10 +¨ 10 +q 7 +– 1900 +; 5 ++ 5 +õ 4 +° 4 +— 3 +² 2 +£ 2 +" 2 +! 2 +​ 2 +ç 2 +& 2 +$ 1 diff --git a/mms-1b-all/lin/lexicon.txt b/mms-1b-all/lin/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..634d492691b96972f857d7061f1a883138cc1ef0 --- /dev/null +++ b/mms-1b-all/lin/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/lin/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/lin/tokens.txt b/mms-1b-all/lin/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..bff3840dcc2861a92a76f3bd5ab8eb4b601eb4c6 --- /dev/null +++ b/mms-1b-all/lin/tokens.txt @@ -0,0 +1,77 @@ +| 75440 +a 70693 +o 38929 +n 30252 +e 29789 +i 27655 +k 24225 +m 20774 +b 18606 +l 17719 +s 14504 +y 13834 +t 12253 +g 8996 +p 6948 +u 6643 +z 5874 +d 4163 +r 4069 +w 2139 +c 1601 +h 1218 +f 1093 +0 812 +v 670 +1 607 +. 476 +- 395 +2 357 +é 327 +j 269 +9 258 +q 254 +5 230 +4 204 +3 195 +6 185 +8 177 +7 131 +x 129 +; 86 +: 81 +è 68 +, 50 +' 49 +/ 43 +ï 29 +? 24 +! 21 +– 19 +ü 13 +— 13 +" 11 +% 11 +$ 9 +í 7 +& 7 +” 7 +á 7 +õ 6 +¥ 6 +ç 6 ++ 5 +° 5 +â 4 +ú 4 +[ 3 +] 3 +ã 3 +£ 3 +² 2 +ł 2 +à 2 +î 2 +œ 2 +ó 2 +ö 2 diff --git a/mms-1b-all/lit/lexicon.txt b/mms-1b-all/lit/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..23d3c8eaa5f06cb860fcc48ed71ab2047ac8f82c --- /dev/null +++ b/mms-1b-all/lit/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/lit/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/lit/tokens.txt b/mms-1b-all/lit/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..c36dc9459dd12ae08219188105394c27b0d113cb --- /dev/null +++ b/mms-1b-all/lit/tokens.txt @@ -0,0 +1,67 @@ +| 34046 +i 29979 +a 26510 +s 18254 +o 15288 +t 12297 +e 12796 +r 12889 +n 11440 +u 10459 +k 9483 +m 7220 +l 7834 +p 6712 +d 5732 +v 6262 +j 5442 +g 3891 +ė 3896 +y 3590 +b 3243 +ų 3337 +š 2894 +ž 1932 +ą 1262 +į 1206 +ū 1034 +č 966 +c 1047 +z 724 +f 613 +0 658 +h 343 +ę 445 +1 471 +2 328 +– 31 +. 223 +9 203 +5 180 +4 174 +3 174 +8 147 +6 146 +x 12 +7 99 +w 13 +- 77 +' 59 +: 41 +, 40 +/ 39 +q 2 +ü 9 +; 6 +% 5 ++ 5 +° 4 +õ 4 +ç 3 +ú 2 +ö 2 +² 2 +? 2 +é 1 +[ 1 +] 1 diff --git a/mms-1b-all/ltz/lexicon.txt b/mms-1b-all/ltz/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..c079b4da551528ed615f866ada707f4b19d5f7de --- /dev/null +++ b/mms-1b-all/ltz/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ltz/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ltz/tokens.txt b/mms-1b-all/ltz/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..dedca6908bcae6b1874f3552ccd9008af00a2d07 --- /dev/null +++ b/mms-1b-all/ltz/tokens.txt @@ -0,0 +1,74 @@ +| 50930 +e 44817 +n 26948 +r 19701 +t 18746 +a 18082 +i 17210 +s 16894 +d 12284 +u 11867 +l 11130 +o 10540 +h 10257 +g 9679 +c 7788 +m 7752 +f 5114 +w 5096 +k 4551 +é 4345 +v 4074 +p 4049 +b 3673 +z 3295 +ë 2445 +ä 2169 +' 1731 +j 654 +0 547 +y 458 +1 446 +- 424 +2 282 +ü 260 +9 194 +x 185 +5 170 +4 154 +8 154 +3 146 +6 131 +. 120 +; 118 +q 113 +& 110 +7 99 +/ 34 +, 32 +ö 30 +: 20 +è 15 +– 13 +° 9 +‑ 8 +% 7 +í 6 +$ 6 +" 5 +² 4 ++ 4 +æ 4 +¥ 3 +ç 3 +á 3 +ã 3 +â 2 +ï 2 +ú 2 +[ 2 +] 2 +ł 2 +ó 1 +! 1 +̇ 1 diff --git a/mms-1b-all/lug/lexicon.txt b/mms-1b-all/lug/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..808d0045d7af3623b6ccc70abfd1af49b66f29f6 --- /dev/null +++ b/mms-1b-all/lug/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/lug/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/lug/tokens.txt b/mms-1b-all/lug/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8ff1a8b4ea6f6826fd8ecee5445430ea55e1245 --- /dev/null +++ b/mms-1b-all/lug/tokens.txt @@ -0,0 +1,75 @@ +a 119662 +| 104421 +e 54337 +u 50065 +i 44649 +o 40257 +n 50614 +k 36581 +b 41376 +m 31481 +l 21817 +y 25346 +g 24234 +w 25059 +t 18519 +s 12192 +r 16713 +z 8565 +d 11270 +' 11025 +j 3525 +f 3845 +p 1022 +v 1973 +c 10 +h 814 +0 539 +1 404 +2 274 +’ 270 +. 195 +9 162 +5 161 +4 3 +6 129 +8 126 +3 124 +, 124 +” 91 +7 81 +- 56 +x 29 +q 11 +: 28 +/ 18 +ü 13 +? 13 +– 2 +$ 12 +; 11 +% 10 ++ 4 +° 4 +õ 4 +! 3 +£ 3 +— 3 +á 3 +í 3 +ç 3 +¥ 3 +ã 2 +& 2 +ŋ 606 +ú 1 +² 1 +[ 1 +] 1 +ղ 24 +` 14 +η 6 +̓ 4 +́ 4 +̔ 3 +̀ 3 diff --git a/mms-1b-all/luo/lexicon.txt b/mms-1b-all/luo/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..66751f6b8326800eb9561204d93e8b003676467b --- /dev/null +++ b/mms-1b-all/luo/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/luo/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/luo/tokens.txt b/mms-1b-all/luo/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4b85aac8f19b9dd119a97329ce6084bbe9b1f56 --- /dev/null +++ b/mms-1b-all/luo/tokens.txt @@ -0,0 +1,64 @@ +| 54815 +o 32664 +a 28294 +e 22243 +n 21176 +i 20482 +m 16893 +k 13832 +r 11506 +g 10655 +d 9086 +t 8558 +h 7817 +u 6666 +l 6634 +y 6483 +c 5025 +w 4472 +s 4037 +b 3579 +p 3387 +j 2564 +' 1818 +f 640 +0 576 +1 430 +- 398 +v 323 +2 270 +9 166 +5 153 +4 146 +3 143 +z 132 +6 124 +8 120 +. 115 +7 87 +x 71 +, 61 +q 53 +: 28 +” 18 +/ 7 +$ 7 +á 7 +õ 6 +% 5 +; 5 +ü 4 +¥ 3 +í 3 ++ 3 +° 3 +[ 2 +] 2 +& 2 +‘ 2 +’ 2 +ó 2 +̇ 2 +ú 2 +_ 1 +£ 1 diff --git a/mms-1b-all/mal/lexicon.txt b/mms-1b-all/mal/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..1f3a9d2dc8ec44f4ab22c9034f51033362df5d01 --- /dev/null +++ b/mms-1b-all/mal/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/mal/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/mal/tokens.txt b/mms-1b-all/mal/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..b00403dadc3c3c049c311470ea3dae40d0c2e254 --- /dev/null +++ b/mms-1b-all/mal/tokens.txt @@ -0,0 +1,132 @@ +് 506826 +| 351771 +ി 210024 +ക 200677 +ന 246062 +ു 212981 +ത 158926 +ാ 132022 +യ 112319 +ര 115743 +പ 92132 +ട 80316 +െ 91940 +വ 107856 +മ 63363 +റ 42441 +ല 75140 +ം 75880 +സ 45158 +ള 53688 +ച 50344 +ങ 56376 +ണ 35350 +ോ 37432 +ൽ 13890 +ർ 11339 +േ 33326 +അ 45385 +ൻ 9405 +ൾ 8684 +ദ 30447 +ഗ 9907 +ഷ 15127 +ീ 15643 +ശ 26423 +ൂ 12822 +എ 22204 +ജ 8887 +ധ 9991 +ബ 4737 +ഒ 7048 +ആ 12046 +ഞ 25805 +ഡ 437 +ഭ 8626 +ഉ 5919 +ഹ 14820 +ഇ 8110 +ഴ 8197 +ഫ 1485 +ൈ 9704 +ഥ 3858 +ൊ 14671 +0 39 +‌ 656 +" 632 +1 32 +ഏ 2111 +- 119 +. 377 +2 13 +ൃ 3600 +ൺ 60 +ഈ 2450 +ഓ 997 +ഖ 1531 +9 2 +5 200 +4 4 +ഘ 543 +3 3 +ൗ 797 +, 160 +8 157 +6 1 +ഠ 1067 +7 101 +‍ 59101 +ഐ 38 +ഔ 28 +: 43 +ഊ 102 +/ 24 +ഛ 429 +i 19 +' 128 +ഃ 240 +ൌ 1089 +m 3 +s 14 +; 13 +p 10 +c 1 +a 9 +% 9 +o 1 +b 8 +r 7 +t 6 +n 6 +e 6 +! 5 +$ 5 +‘ 5 +’ 5 +? 4 +u 4 +h 4 +l 4 ++ 3 +x 3 +f 3 +[ 3 +] 3 +k 3 +ഢ 261 +g 3 +d 3 +& 2 +y 2 +ഋ 2 +– 1 +° 1 +w 1 +v 1 +j 1 +” 1 +— 88 +൪ 4 +q 4 +ൎ 3 +൯ 2 diff --git a/mms-1b-all/mar/lexicon.txt b/mms-1b-all/mar/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..ef24f94962d65e3c100d998fc99a7fcb91b6c1a6 --- /dev/null +++ b/mms-1b-all/mar/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/mar/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/mar/tokens.txt b/mms-1b-all/mar/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..40b33ea89c2045b07e91eaf697268de5a0d3c24d --- /dev/null +++ b/mms-1b-all/mar/tokens.txt @@ -0,0 +1,142 @@ +| 379007 +ा 269905 +् 146900 +र 92147 +े 124304 +त 128703 +य 94225 +क 64097 +ी 78741 +स 54965 +ल 75474 +न 65022 +ि 47878 +व 67969 +ह 66550 +म 59969 +प 44835 +ं 38229 +च 41924 +ण 42550 +ो 43557 +आ 37382 +द 30048 +ज 26204 +ु 31383 +ग 19938 +अ 14577 +ट 8922 +श 21777 +ध 10181 +ू 23970 +ब 13308 +ड 10131 +ष 11931 +ळ 10347 +ख 8968 +ठ 8677 +थ 5871 +भ 7770 +ए 3652 +फ 2432 +उ 3680 +घ 4896 +इ 1980 +ॉ 1 +" 766 +झ 6589 +ॅ 4 +ृ 1760 +ै 1614 +0 98 +1 32 +. 436 +ऱ 2225 +ई 2739 +2 28 +ढ 2348 +ऊ 2284 +० 278 +ओ 710 +ऑ 2 +१ 208 +- 75 +ौ 1753 +ँ 6 +4 4 +9 8 +3 154 +5 145 +ञ 1052 +छ 927 +‍ 157 +6 118 +, 113 +8 110 +ः 1611 +२ 106 +९ 95 +7 6 +a 80 +: 79 +५ 78 +६ 71 +८ 67 +३ 66 +' 157 +ॲ 56 +/ 52 +४ 52 +ऐ 1356 +७ 50 +p 48 +i 46 +m 42 +c 38 +औ 5 +n 33 +d 31 +t 31 +s 27 +o 26 +v 23 +u 23 +f 19 +h 18 +w 18 +b 17 +r 17 +e 17 +l 16 +$ 15 +; 14 +g 14 +% 13 +ऋ 12 +k 10 +! 7 +— 6 +x 5 ++ 4 +° 4 +õ 4 +j 3 +¥ 3 +– 3 +z 3 +‘ 2 +’ 2 +ऍ 2 +q 2 +y 2 +? 2 +[ 2 +] 2 +& 2 +² 2 +़ 1 +ॊ 1 +ऴ 8 +ʇ 4 +ङ 2 +ʈ 2 diff --git a/mms-1b-all/mkd/lexicon.txt b/mms-1b-all/mkd/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..9e0fb8284a1cfdf5a846f24f1f35c1d55a6d96c0 --- /dev/null +++ b/mms-1b-all/mkd/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/mkd/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/mkd/tokens.txt b/mms-1b-all/mkd/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..2270077e922832e7fa6ed2e2ba78779a4888a6ba --- /dev/null +++ b/mms-1b-all/mkd/tokens.txt @@ -0,0 +1,93 @@ +| 803 +а 514 +о 405 +и 362 +е 414 +т 262 +н 252 +р 165 +с 196 +в 170 +д 153 +к 172 +л 119 +п 103 +м 114 +у 84 +з 75 +г 68 +ј 58 +б 72 +ч 33 +ш 59 +ц 26 +ж 18 +њ 13 +ф 11 +ќ 12 +0 532 +х 7 +1 397 +ѓ 4 +2 238 +- 1 +9 173 +5 158 +џ 155 +4 139 +. 129 +8 128 +3 122 +6 119 +a 105 +7 87 +e 78 +s 77 +o 76 +i 67 +n 65 +r 58 +m 51 +c 49 +d 48 +l 46 +t 43 +ѐ 2 +p 41 +, 37 +h 33 +: 31 +/ 29 +u 27 +ѕ 27 +љ 1 +g 23 +k 23 +f 20 +v 20 +b 16 +y 14 +ѝ 14 +" 13 +w 10 +' 9 +% 8 +x 7 +õ 6 +; 6 +j 6 +¥ 6 +$ 4 +z 4 +q 4 +è 3 +² 2 +& 2 +` 2 +° 2 +— 1 +[ 1 +] 1 +! 1 ++ 1 +ú 1 diff --git a/mms-1b-all/mlt/lexicon.txt b/mms-1b-all/mlt/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..2303f8f98ad572c5f2d79c00b5e7877a7352b593 --- /dev/null +++ b/mms-1b-all/mlt/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/mlt/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/mlt/tokens.txt b/mms-1b-all/mlt/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..1743bc29f99a6a8e005db1a933129cada3fcd483 --- /dev/null +++ b/mms-1b-all/mlt/tokens.txt @@ -0,0 +1,79 @@ +| 13815 +a 9465 +i 9801 +t 5196 +l 5612 +e 5527 +n 5240 +r 3673 +u 3019 +s 3035 +m 3507 +- 2169 +j 2606 +o 2448 +k 2567 +d 2835 +ħ 2427 +b 1438 +f 1483 +p 1255 +g 1676 +w 1111 +h 1205 +z 741 +ġ 722 +ż 599 +q 873 +' 764 +x 1031 +v 431 +ċ 420 +0 632 +c 172 +1 491 +à 62 +’ 319 +y 33 +2 304 +9 208 +5 182 +4 166 +3 155 +8 154 +6 142 +. 118 +7 98 +, 83 +” 71 +: 42 +/ 39 +" 15 +ü 11 +$ 11 +% 9 +; 9 +á 1 +è 1 +¥ 6 +ç 6 +í 5 +— 5 ++ 4 +õ 4 +ò 20 +ã 3 +! 3 +£ 3 +[ 3 +] 3 +° 3 +ì 3 +é 1 +ú 2 +² 2 +& 2 +ł 2 +ù 2 +– 2 +ó 1 diff --git a/mms-1b-all/mon/lexicon.txt b/mms-1b-all/mon/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..8e091e60cbf4f64ffbc65a17c802887efb723a8e --- /dev/null +++ b/mms-1b-all/mon/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/mon/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/mon/tokens.txt b/mms-1b-all/mon/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..e036e312f6d98be8210f6f8e4af49c67f79d8aa2 --- /dev/null +++ b/mms-1b-all/mon/tokens.txt @@ -0,0 +1,96 @@ +| 100380 +а 57945 +н 44164 +э 47558 +г 32663 +л 24131 +о 17861 +р 29541 +д 27880 +и 25759 +х 24997 +й 21439 +у 17828 +т 21863 +с 17637 +б 16233 +ү 24223 +ө 3128 +м 10469 +ж 6033 +в 7177 +ы 4516 +з 4946 +ч 8548 +ь 5893 +е 3044 +ц 2576 +ш 989 +к 381 +я 2212 +п 585 +0 741 +ю 1503 +1 558 +ф 253 +ё 658 +- 1 +2 348 +9 256 +5 201 +4 195 +3 184 +6 168 +8 165 +7 120 +, 105 +. 99 +i 9 +a 7 +p 9 +/ 43 +: 40 +r 9 +s 8 +c 8 +o 7 +t 6 +m 9 +u 7 +n 9 +l 10 +g 9 +e 10 +v 9 +d 8 +ъ 36 +h 6 +" 11 +? 9 +” 9 +f 9 +' 2 +% 8 +b 9 +y 7 +​ 6 +w 8 +õ 6 +k 8 +x 9 +; 5 ++ 5 +j 10 +! 3 +‘ 3 +’ 3 +– 2 +z 7 +² 2 +[ 1 +] 1 +& 1 +ѳ 14764 +щ 3015 +— 862 +q 9 diff --git a/mms-1b-all/mri/lexicon.txt b/mms-1b-all/mri/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..7fb4828c7ffe986b4ef1b43d9e8a305f6ea9e5d4 --- /dev/null +++ b/mms-1b-all/mri/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/mri/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/mri/tokens.txt b/mms-1b-all/mri/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8b8d0daa8db1173460817f3446965f85411d463 --- /dev/null +++ b/mms-1b-all/mri/tokens.txt @@ -0,0 +1,57 @@ +| 91956 +a 58153 +i 36793 +e 33271 +t 32819 +k 22543 +n 22085 +h 19727 +o 19466 +r 17420 +u 14989 +ā 12969 +g 10458 +m 10019 +w 8215 +p 6625 +ō 3110 +ē 2948 +s 1845 +ū 1549 +l 1386 +c 1059 +ī 945 +d 855 +0 785 +1 594 +b 545 +- 528 +y 449 +f 367 +2 365 +v 356 +9 255 +5 213 +4 198 +j 193 +3 191 +6 190 +8 187 +. 160 +z 123 +7 118 +x 103 +, 100 +/ 53 +q 50 +: 38 +' 35 +$ 15 +% 10 +; 6 +& 5 ++ 5 +[ 3 +] 3 +! 2 +? 1 diff --git a/mms-1b-all/mya/lexicon.txt b/mms-1b-all/mya/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/mms-1b-all/mya/lexicon.txt @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/mms-1b-all/mya/tokens.txt b/mms-1b-all/mya/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..49191e00b11ea3281db39a6b8022d0b003bae2da --- /dev/null +++ b/mms-1b-all/mya/tokens.txt @@ -0,0 +1,136 @@ +် 296429 +း 124876 +ာ 153677 +ု 144384 +င 109674 +က 118425 +ိ 134630 +ေ 123418 +သ 123739 +မ 86316 +ပ 48542 +န 52072 +တ 95646 +ြ 78562 +့ 87145 +ရ 57649 +စ 46725 +အ 55388 +ည 59361 +ှ 51436 +ျ 36501 +ွ 26474 +ခ 44834 +လ 49556 +ီ 13125 +ံ 19852 +ဖ 19363 +ါ 25745 +ဆ 14942 +ဲ 11271 +ယ 20029 +ထ 23147 +ူ 37483 +ဘ 12043 +ဟ 11016 +၏ 1343 +ဝ 7963 +္ 5726 +ဒ 3057 +ဂ 2599 +" 821 +ဉ 3087 +၎ 633 +။ 588 +a 514 +ဗ 1634 +0 478 +e 414 +၍ 411 +n 370 +1 359 +ဏ 1693 +o 354 +ဇ 3010 +r 341 +i 333 +s 322 +t 287 +ဦ 1665 +2 2 +၀ 238 +l 224 +၊ 220 +ဓ 622 +ဥ 572 +h 198 +ဤ 2570 +c 186 +၁ 176 +d 172 +u 168 +m 166 +9 163 +p 158 +g 143 +၌ 131 +5 125 +k 117 +8 117 +4 110 +b 107 +- 106 +6 99 +၂ 97 +3 94 +. 89 +y 89 +၅ 81 +7 78 +၄ 75 +w 72 +၃ 70 +f 68 +ဌ 319 +ဧ 566 +၆ 62 +၉ 61 +v 59 +, 52 +၈ 45 +ဿ 108 +ဍ 125 +/ 35 +၇ 35 +' 110 +ဩ 301 +x 22 +: 19 +ဠ 3 +j 18 +ဈ 29 +q 13 +$ 12 +ဋ 222 +% 11 +z 8 +‌ 7 +° 6 +á 6 +” 6 +õ 6 ++ 4 +ł 2 +£ 2 +— 2 +[ 2 +] 2 +– 2 +í 2 +² 1 +ú 1 +ó 1 +ၤ 1 +ဣ 233 +ဃ 99 +ဪ 1 diff --git a/mms-1b-all/nld/lexicon.txt b/mms-1b-all/nld/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..09a8bf5b6be1265facdb742d1edad4b617000f7e --- /dev/null +++ b/mms-1b-all/nld/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/nld/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/nld/tokens.txt b/mms-1b-all/nld/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..123f1ad7cfdd781f2822177b4d849c9388ad281a --- /dev/null +++ b/mms-1b-all/nld/tokens.txt @@ -0,0 +1,82 @@ +| 148968 +e 124882 +n 69931 +a 47061 +i 43535 +t 38119 +r 34075 +o 37895 +d 42010 +s 19954 +l 20586 +g 22432 +v 15940 +h 22912 +m 15413 +k 11929 +u 13588 +p 6094 +b 9104 +w 13367 +c 5386 +j 13597 +z 15499 +f 4964 +0 683 +y 195 +1 482 +' 40 +- 30 +2 313 +ë 360 +. 237 +x 21 +9 200 +5 171 +4 166 +3 152 +8 143 +6 134 +é 347 +7 107 +q 8 +, 49 +ï 28 +: 39 +/ 27 +ó 104 +á 2 +% 11 +; 7 +& 7 +ü 60 +í 19 +õ 6 +² 4 +̇ 4 +ö 72 +° 3 +ç 356 +ú 7 +$ 3 ++ 3 +è 1 +£ 3 +à 2 +ã 2 +ł 2 +[ 2 +] 2 +ê 8 +û 3654 +â 1149 +ô 896 +î 837 +ä 10 +ù 279 +æ 147 +– 113 +́ 5 +š 1 +ţ 1 +ă 1 diff --git a/mms-1b-all/nob/lexicon.txt b/mms-1b-all/nob/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..5e191930d93e5665201fc406ae412b67fe04ec86 --- /dev/null +++ b/mms-1b-all/nob/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/nob/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/nob/tokens.txt b/mms-1b-all/nob/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..65b2c4fcc22aaca45e7a538038d32578af0d6810 --- /dev/null +++ b/mms-1b-all/nob/tokens.txt @@ -0,0 +1,71 @@ +| 65146 +e 51747 +r 28213 +n 26686 +t 26445 +s 20460 +a 20170 +i 19231 +l 17380 +o 16691 +d 13875 +k 11740 +g 11290 +m 10547 +v 7702 +p 6735 +f 6713 +u 5797 +b 4938 +å 4799 +h 4321 +y 2734 +j 2647 +ø 2467 +0 868 +c 846 +æ 749 +- 722 +1 643 +2 384 +. 352 +w 293 +« 255 +» 255 +9 251 +5 224 +4 211 +3 203 +6 186 +8 165 +z 155 +7 117 +x 94 +: 80 +é 55 +q 49 +/ 36 +, 34 +– 22 +' 17 +% 15 +ü 12 +í 9 +õ 6 +! 5 +á 5 +ç 5 ++ 5 +ú 4 +; 4 +¥ 3 +& 2 +ł 2 +° 2 +ã 2 +ó 2 +[ 2 +] 2 +² 2 +" 1 +? 1 diff --git a/mms-1b-all/npi/lexicon.txt b/mms-1b-all/npi/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..4b9050774a3c2c8548b9b08611f062783d2e9d7e --- /dev/null +++ b/mms-1b-all/npi/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/npi/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/npi/tokens.txt b/mms-1b-all/npi/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..55783b1366568989981ff8a6b40bd4815554e886 --- /dev/null +++ b/mms-1b-all/npi/tokens.txt @@ -0,0 +1,120 @@ +| 59548 +ा 31826 +् 29600 +र 26829 +न 19921 +क 18397 +ि 16528 +स 12269 +ो 12267 +े 11537 +म 11228 +ल 10607 +त 9933 +य 9490 +ह 9127 +प 8851 +ग 6842 +ु 6436 +व 6084 +ी 5570 +द 5189 +ू 4888 +ब 4354 +ट 4277 +ज 3968 +छ 3802 +भ 3777 +ए 2964 +अ 2654 +ै 2465 +थ 2349 +उ 2342 +ध 2247 +श 2238 +च 2085 +ड 2044 +ष 1884 +ई 1880 +फ 1606 +ण 1579 +आ 1554 +ँ 1532 +ख 1509 +इ 1286 +ं 1125 +0 833 +ङ 695 +1 615 +घ 589 +। 564 +ठ 443 +ौ 441 +ृ 438 +ढ 393 +2 383 +a 379 +झ 335 +- 280 +9 263 +5 228 +ञ 220 +o 216 +e 208 +4 204 +n 203 +6 197 +3 197 +s 190 +8 175 +r 173 +ओ 170 +i 163 +ः 153 +l 143 +‍ 131 +c 130 +7 127 +t 126 +h 123 +p 116 +औ 111 +m 104 +u 102 +, 94 +. 93 +b 75 +g 75 +d 71 +k 62 +‌ 59 +: 58 +” 51 +y 47 +f 44 +j 42 +v 38 +/ 35 +w 30 +x 25 +ऊ 24 +ऐ 23 +' 16 +z 15 +; 14 +% 12 +ऋ 11 +$ 10 +? 6 +¥ 6 +° 5 ++ 5 +õ 4 +९ 4 +q 2 +ã 2 +£ 2 +! 2 +– 2 +१ 2 +४ 2 diff --git a/mms-1b-all/nso/lexicon.txt b/mms-1b-all/nso/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..8ca4a6f372c6058debf92d35ba5eb3e011edaede --- /dev/null +++ b/mms-1b-all/nso/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/nso/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/nso/tokens.txt b/mms-1b-all/nso/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5d6bdb0de92f9d8e6232ee6c159388355bb1a6f --- /dev/null +++ b/mms-1b-all/nso/tokens.txt @@ -0,0 +1,71 @@ +| 50427 +a 33807 +e 27296 +o 22710 +l 14693 +g 14009 +t 11827 +i 10152 +n 9139 +k 7707 +b 7336 +m 7064 +š 6234 +s 5924 +d 5370 +h 4617 +r 4601 +w 4478 +y 3925 +p 3323 +u 2659 +f 2264 +j 1066 +c 828 +0 491 +1 370 +2 226 +v 224 +- 214 +9 154 +5 137 +4 126 +3 117 +8 109 +z 108 +. 106 +6 102 +7 70 +x 66 +, 49 +q 33 +' 32 +: 29 +/ 22 +$ 11 +í 8 +% 6 +— 6 ++ 4 +" 4 +; 4 +á 4 +ç 3 +ü 3 +& 3 +¥ 3 +ł 2 +£ 2 +[ 2 +] 2 +ú 2 +– 2 +é 2 +õ 2 +² 1 +! 1 +ã 1 +° 1 +” 1 +ó 1 +̇ 1 diff --git a/mms-1b-all/nya/lexicon.txt b/mms-1b-all/nya/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..1b6766c635913aa1f0fd9e84e456dec55d181b37 --- /dev/null +++ b/mms-1b-all/nya/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/nya/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/nya/tokens.txt b/mms-1b-all/nya/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..f99465b6fe3017630a5c9701fa1c888a3957858f --- /dev/null +++ b/mms-1b-all/nya/tokens.txt @@ -0,0 +1,70 @@ +a 114096 +| 108701 +i 66705 +n 60370 +o 36811 +k 37135 +m 39802 +e 41483 +u 48098 +t 25964 +l 20981 +h 21489 +d 23405 +w 23706 +z 16521 +s 16851 +p 16632 +r 13564 +c 10515 +y 13621 +g 8624 +b 7808 +f 4319 +v 1440 +j 1515 +' 10 +0 214 +1 67 +2 32 +9 8 +5 16 +4 24 +6 7 +3 8 +8 2 +. 111 +7 2 +x 89 +- 86 +, 69 +q 5 +: 39 +/ 28 +” 24 +% 18 +$ 12 +‘ 10 +’ 10 +ü 10 +; 9 +á 8 +í 8 +õ 4 +& 4 +¥ 3 +é 3 +◦ 3 ++ 3 +ç 2 +[ 2 +] 2 +ã 2 +ú 1 +ŵ 1 +! 1 +² 1 +ó 1 +⁰ 1 +ö 1 +ʼ 2142 diff --git a/mms-1b-all/oci/lexicon.txt b/mms-1b-all/oci/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..232a7ad45ab0ef485fe542392c87045e625c20e4 --- /dev/null +++ b/mms-1b-all/oci/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/oci/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/oci/tokens.txt b/mms-1b-all/oci/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..f37dfd1b7a0632887992724e642ee476af02c1a5 --- /dev/null +++ b/mms-1b-all/oci/tokens.txt @@ -0,0 +1,77 @@ +| 79259 +a 52767 +e 40819 +s 35846 +n 29168 +r 26510 +i 25516 +t 23544 +o 21624 +l 21561 +d 18226 +u 15160 +c 14607 +p 12095 +m 11947 +è 6145 +g 5624 +b 4876 +v 4750 +f 4180 +q 4040 +' 3199 +ò 3007 +h 2720 +’ 1389 +j 1340 +z 1037 +ç 822 +0 805 +x 686 +á 685 +à 650 +1 570 +. 537 +k 531 +é 433 +í 426 +2 382 +y 336 +w 305 +9 269 +5 222 +4 206 +3 199 +- 189 +6 183 +8 175 +ï 157 +7 122 +: 105 +ó 79 +, 66 +ü 64 +; 52 +/ 50 +" 40 +– 24 +? 22 +! 21 +— 19 +‘ 18 +ú 17 +” 13 +% 10 +$ 8 +õ 6 +¥ 6 +& 5 +² 4 +° 4 +[ 3 +] 3 +ã 3 ++ 3 +ë 2 +ł 2 +ö 1 diff --git a/mms-1b-all/orm/lexicon.txt b/mms-1b-all/orm/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..01a95e932604385c775103d67cac047627e65c43 --- /dev/null +++ b/mms-1b-all/orm/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/orm/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/orm/tokens.txt b/mms-1b-all/orm/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ed2e546c1c6d85c9fc9ed9fd406d3bc77e2bc4d --- /dev/null +++ b/mms-1b-all/orm/tokens.txt @@ -0,0 +1,61 @@ +a 355073 +| 258771 +i 181344 +n 116700 +e 98743 +u 90879 +t 80045 +o 63501 +r 58935 +s 66603 +k 48157 +d 51589 +m 53987 +h 56628 +b 39599 +l 30423 +f 31623 +g 27454 +y 31949 +w 17502 +j 21081 +q 12736 +c 11273 +' 9780 +p 1891 +x 2095 +0 357 +’ 277 +1 273 +z 537 +v 203 +- 588 +2 188 +9 98 +5 90 +8 89 +4 84 +6 78 +3 75 +. 70 +, 66 +7 52 +: 20 +” 15 +/ 12 +‘ 11 +— 5 +$ 5 +% 4 +" 4 +í 3 += 2 +£ 2 +! 2 +õ 2 +; 2 +° 1 +[ 1 +] 1 +ó 1 +& 1 diff --git a/mms-1b-all/ory/lexicon.txt b/mms-1b-all/ory/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..b7c4fed75111c562c149abfedc40fa95d7752ef0 --- /dev/null +++ b/mms-1b-all/ory/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/ory/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ory/tokens.txt b/mms-1b-all/ory/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..b93fb52491ffdefe190fab9874cbef906fc37e80 --- /dev/null +++ b/mms-1b-all/ory/tokens.txt @@ -0,0 +1,130 @@ +| 283653 +୍ 126396 +ା 129233 +ର 106810 +ି 106576 +କ 88651 +େ 103802 +ବ 48617 +ନ 60108 +ତ 62618 +ସ 43858 +ପ 44722 +ୁ 67211 +ମ 66753 +ହ 42911 +ୟ 19056 +ଲ 30142 +ଏ 11483 +ଥ 15590 +ଦ 23196 +ୋ 18009 +ଣ 15615 +ଇ 15173 +ଟ 8846 +ଗ 16011 +ଯ 19855 +ଅ 11712 +ୀ 16177 +ଜ 11999 +ଶ 20633 +ଆ 13913 +ଷ 10217 +ଡ 5375 +ଧ 8221 +ଭ 26070 +ଳ 7435 +ଙ 22605 +ଚ 4450 +ଉ 9227 +ଂ 1895 +ଛ 9831 +ଁ 15235 +ଖ 8781 +ୂ 5078 +ଫ 1530 +ୱ 9558 +଼ 4343 +ଠ 4746 +ୃ 2952 +ଓ 5761 +0 174 +1 54 +। 212 +ଘ 1688 +ଞ 1358 +ୌ 1124 +‌ 123 +2 42 +ଵ 111 +- 96 +9 2 +ୈ 338 +5 5 +ଃ 328 +4 26 +6 10 +3 4 +8 58 +ଢ 658 +. 50 +a 44 +ଝ 572 +7 2 +m 36 +n 33 +c 31 +i 30 +e 30 +' 3172 +p 25 +t 24 +/ 20 +s 20 +o 19 +d 18 +, 17 +r 16 +: 16 +b 14 +’ 14 +l 14 +h 13 +u 12 +‍ 12 +ଋ 46 +w 11 +ଔ 4 +ଐ 14 +k 7 +g 7 +% 7 +ଈ 152 +f 6 +" 5 +v 5 +z 4 +y 4 +x 4 +[ 3 +] 3 +; 3 +¥ 3 +‘ 3 +” 2 +୦ 2 ++ 2 +õ 2 +* 1 +ୗ 1 +j 1 +ୄ 1 +° 1 +୭ 1 +£ 1 +! 1 +q 1 +$ 1 +– 2 +ଊ 6 +­ 7 diff --git a/mms-1b-all/pan/lexicon.txt b/mms-1b-all/pan/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..d5f809220e8075aacf02dbc5f5d324ace98b42dc --- /dev/null +++ b/mms-1b-all/pan/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/pan/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/pan/tokens.txt b/mms-1b-all/pan/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e913eb092b00436ba39ca9e6588cf496ca9295a --- /dev/null +++ b/mms-1b-all/pan/tokens.txt @@ -0,0 +1,91 @@ +| 154172 +ਾ 46073 +ਰ 26365 +ੀ 20684 +ਸ 24415 +ਿ 24903 +ੇ 36906 +ਕ 19950 +ਨ 24502 +ਦ 21904 +ਹ 30746 +ਤ 23857 +ਂ 25208 +ਲ 13296 +ਵ 13504 +ੱ 15304 +ੋ 14760 +ਜ 12309 +ਆ 13580 +ਮ 12217 +ਪ 11944 +ੰ 10809 +਼ 5521 +ੈ 6668 +ਅ 9671 +ਚ 6379 +ੁ 11323 +ਗ 7655 +ਬ 5153 +ੂ 10221 +ਟ 1207 +ਇ 5193 +। 2098 +ਣ 6169 +ਉ 12448 +ਈ 3246 +੍ 4340 +ਖ 5110 +ਡ 2626 +ਧ 1946 +ਫ 1377 +ਭ 3147 +ਥ 1814 +ੜ 3094 +ੌ 930 +ਯ 2164 +0 392 +ਏ 1448 +ਘ 610 +- 483 +1 303 +' 1 +ਝ 951 +ਛ 740 +ਐ 64 +‘ 195 +ਓ 471 +2 172 +ਠ 962 +9 129 +ਊ 112 +5 122 +4 102 +ਢ 276 +8 95 +. 92 +6 3 +3 86 +7 58 +, 58 +” 42 +: 36 +" 29 +; 28 +ਔ 133 +/ 21 +? 21 +! 10 +​ 10 +% 6 +– 5 +’ 5 +— 4 +° 3 +¥ 3 +x 2 +i 1 +l 79 +c 1 +f 1 +¤ 1 diff --git a/mms-1b-all/pol/lexicon.txt b/mms-1b-all/pol/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..856b6eb0f9e39c76c99e867b612feb244798fbd3 --- /dev/null +++ b/mms-1b-all/pol/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/pol/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/pol/tokens.txt b/mms-1b-all/pol/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd32cf9f7e7fba803148dddd1b6dc3fc8ff6bd13 --- /dev/null +++ b/mms-1b-all/pol/tokens.txt @@ -0,0 +1,70 @@ +| 113205 +a 49288 +i 55340 +o 42692 +e 46554 +n 28287 +z 36659 +r 21518 +w 26905 +s 26114 +t 21225 +c 24192 +y 23560 +d 21464 +k 13574 +p 16427 +m 18428 +u 12496 +j 12793 +l 10733 +g 9340 +ł 13906 +b 10776 +ą 5971 +h 6812 +ę 8201 +ó 5913 +ż 5873 +ś 5832 +ć 2934 +f 551 +0 2 +ń 697 +1 7 +2 3 +ź 552 +9 4 +5 5 +v 3 +3 1 +4 1 +6 3 +- 1 +8 3 +. 106 +x 6 +7 3 +: 43 +/ 37 +” 31 +' 13 +q 1 +, 26 +– 15 +% 14 +ü 10 +õ 6 +ú 4 +² 4 +é 3 +ã 3 +á 2 ++ 2 +° 2 +! 2 +ç 2 +; 2 +í 2 +& 1 +— 1237 diff --git a/mms-1b-all/por/lexicon.txt b/mms-1b-all/por/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..c2fd3b754b25d7293749ad5b62d674374495bc63 --- /dev/null +++ b/mms-1b-all/por/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/por/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/por/tokens.txt b/mms-1b-all/por/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..b532cf3089b06e200b5c8facff1ce39bdf0fe705 --- /dev/null +++ b/mms-1b-all/por/tokens.txt @@ -0,0 +1,78 @@ +| 858559 +a 411758 +e 501169 +o 428472 +s 359090 +r 236611 +i 193120 +n 166496 +d 193606 +t 149441 +m 188486 +u 185076 +c 114614 +p 102426 +l 87151 +v 66069 +g 38628 +f 38181 +b 28524 +h 41621 +q 57987 +ã 35860 +ç 13732 +é 24141 +á 16041 +í 9051 +z 15210 +j 21938 +x 4596 +ê 13300 +ó 8534 +0 1 +õ 2186 +1 2 +- 8077 +k 219 +ú 1596 +2 2 +â 467 +à 2508 +y 198 +w 190 +9 184 +5 1 +. 161 +4 5 +3 1 +6 1 +8 126 +ô 1270 +7 96 +/ 39 +, 31 +: 27 +' 157 +% 20 +" 11 +º 11 +° 7 +; 7 +$ 6 +” 4 +​ 4 +& 3 ++ 2 +ª 2 +² 2 +ë 2 +‘ 2 +’ 2 +[ 2 +] 2 +! 1 +ü 13 +ò 27 +́ 2 +ž 1 +— 4005 diff --git a/mms-1b-all/pus/lexicon.txt b/mms-1b-all/pus/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..a7289f51084cafdddc22d141110934b36bcd764a --- /dev/null +++ b/mms-1b-all/pus/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/pus/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/pus/tokens.txt b/mms-1b-all/pus/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e51ab45f54addc728d2a16b110878359cd25866 --- /dev/null +++ b/mms-1b-all/pus/tokens.txt @@ -0,0 +1,135 @@ +| 65862 +و 25962 +ا 21157 +ه 16778 +ی 15288 +ر 14113 +د 14047 +ن 12037 +ل 11776 +ک 9520 +ې 8576 +ت 8239 +م 7675 +ي 7178 +پ 6798 +س 6154 +ب 4154 +خ 3533 +ش 3032 +ړ 2888 +چ 2821 +ګ 1817 +ز 1782 +ټ 1663 +غ 1545 +ف 1500 +ځ 1489 +ج 1450 +ډ 1443 +ع 1387 +څ 1324 +ق 1118 +ح 1071 +ښ 941 +ږ 882 +a 740 +e 558 +ۍ 549 +ص 544 +، 540 +ط 533 +n 500 +o 476 +i 460 +r 459 +s 438 +ژ 397 +t 360 +0 347 +l 315 +1 300 +ئ 290 +ض 280 +ث 243 +u 230 +ظ 229 +c 225 +h 219 +d 208 +آ 187 +2 176 +g 168 +m 166 +. 144 +p 142 +9 135 +ڼ 132 +k 125 +۰ 123 +5 123 +b 117 +y 110 +4 97 +v 88 +8 84 +ذ 83 +3 77 +6 70 +f 67 +۱ 67 +: 58 +7 57 +w 57 +j 55 +۲ 49 +- 45 +z 44 +۔ 41 +ً 41 +۳ 36 +x 33 +۶ 32 +۴ 32 +۵ 29 +/ 25 +۹ 22 +ؤ 18 +۸ 17 +q 16 +, 15 +" 13 +َ 10 +۷ 10 +ك 9 +– 7 +' 7 +_ 6 +ى 6 +أ 5 +! 5 +” 5 +‌ 5 +ّ 4 +ھ 4 +$ 4 +— 4 +ـ 4 +؛ 3 +٪ 3 ++ 3 +° 3 +ء 3 +; 2 +é 2 +% 2 +گ 2 +ے 2 +؟ 2 +` 2 +í 1 +ُ 1 +¥ 1 +& 1 +ٌ 1 +‍ 1 diff --git a/mms-1b-all/ron/lexicon.txt b/mms-1b-all/ron/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..5378895274532351210560e27cd82729f9efd76f --- /dev/null +++ b/mms-1b-all/ron/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ron/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ron/tokens.txt b/mms-1b-all/ron/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd6eed8c229e343a423062327f94b43b873d0f58 --- /dev/null +++ b/mms-1b-all/ron/tokens.txt @@ -0,0 +1,79 @@ +| 267297 +e 128553 +a 107385 +i 130732 +r 71670 +t 69123 +n 74396 +u 84514 +l 51250 +c 64106 +o 40670 +s 53689 +d 38621 +p 34915 +ă 50412 +m 35399 +f 13640 +v 21436 +ț 1017 +î 16862 +g 7709 +b 8426 +ș 11099 +z 9550 +â 10522 +h 5042 +- 10689 +j 2613 +0 670 +x 107 +k 17 +1 1 +. 348 +2 319 +y 2 +w 10 +9 224 +5 196 +4 186 +« 181 +» 180 +3 168 +8 149 +6 147 +7 93 +: 71 +q 10 +/ 39 +, 34 +% 19 +' 97 +; 12 +á 2 +ü 1 +” 10 +! 7 +– 207 +í 4 +õ 4 +ú 3 +¥ 3 +é 1 +² 3 +ç 3 +° 3 +? 2 +— 2 +[ 2 +] 2 ++ 2 +ã 2 +̇ 2 +ł 2 +$ 1 +ö 1 +ţ 15010 +ş 10752 +đ 1 +ć 1 diff --git a/mms-1b-all/rus/lexicon.txt b/mms-1b-all/rus/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..0579dc9b867580b7f3b0aa1a749f3bdbb648e6bb --- /dev/null +++ b/mms-1b-all/rus/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/rus/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/rus/tokens.txt b/mms-1b-all/rus/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9b15df8617514d6226c469ffbe5ba6dd0f57049 --- /dev/null +++ b/mms-1b-all/rus/tokens.txt @@ -0,0 +1,91 @@ +| 236024 +о 123583 +е 98480 +и 90704 +а 81569 +н 65744 +т 75524 +с 62801 +р 44974 +в 56706 +л 49971 +к 29814 +м 38577 +д 37289 +п 29251 +у 31688 +ы 24122 +я 22000 +г 22558 +ь 19628 +з 17108 +б 21652 +ч 15731 +й 11010 +х 11705 +ж 11590 +ю 7662 +ш 9876 +ц 3270 +щ 4653 +э 3380 +ф 819 +0 54 +- 2612 +ё 3042 +1 27 +a 202 +2 26 +e 2 +o 3 +i 2 +n 195 +r 2 +9 173 +t 171 +5 159 +s 158 +3 134 +4 4 +6 127 +l 2 +8 122 +ъ 88 +c 8 +h 2 +p 1 +7 85 +d 85 +g 76 +. 76 +u 74 +m 47 +k 1 +b 1 +v 43 +: 43 +— 876 +/ 42 +y 35 +w 33 +x 1 +f 3 +, 29 +j 12 +" 10 +z 1 +№ 7 +% 7 +' 21 +q 21 +– 1838 +õ 4 +; 4 ++ 3 +[ 2 +] 2 +& 2 +° 1 +! 1 +́ 1 +‐ 2 diff --git a/mms-1b-all/slk/lexicon.txt b/mms-1b-all/slk/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..86073bcf4a441a62017cd6b3632b1a32792e4d39 --- /dev/null +++ b/mms-1b-all/slk/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/slk/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/slk/tokens.txt b/mms-1b-all/slk/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..89dfaf25d738d819b7249aa34ca205798032b33f --- /dev/null +++ b/mms-1b-all/slk/tokens.txt @@ -0,0 +1,73 @@ +| 10934 +o 6421 +a 6712 +e 5438 +n 5637 +i 4651 +t 3380 +r 3816 +s 3067 +v 3024 +k 3278 +l 2706 +d 2335 +p 2042 +m 1878 +u 1784 +c 1756 +h 1512 +j 938 +z 1251 +á 1853 +b 1447 +y 1042 +í 653 +ý 1410 +ž 544 +č 721 +ú 531 +é 533 +š 496 +ť 531 +ľ 363 +g 402 +f 369 +ô 74 +0 387 +ó 129 +ď 77 +1 292 +ä 68 +ň 123 +2 194 +x 69 +. 151 +5 134 +w 39 +9 126 +8 113 +4 108 +6 104 +3 1 +7 75 +- 28 +ĺ 21 +/ 25 +: 22 +ŕ 9 +, 20 +q 2 +% 7 +' 6 +ü 5 +– 3 +õ 4 +ã 3 +; 3 +$ 2 +̇ 2 +ç 2 +& 1 +² 1 ++ 1 +° 1 diff --git a/mms-1b-all/slv/lexicon.txt b/mms-1b-all/slv/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..69d4300dd26c839e91cec1db02ed4dbd7e860cb8 --- /dev/null +++ b/mms-1b-all/slv/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/slv/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/slv/tokens.txt b/mms-1b-all/slv/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ea04f39b5d49cc5b8bfc8c20bd84ff28fa7442b --- /dev/null +++ b/mms-1b-all/slv/tokens.txt @@ -0,0 +1,68 @@ +| 8087 +a 3791 +e 4287 +o 3372 +i 3055 +n 2195 +r 1935 +t 1686 +s 1687 +j 1933 +l 1730 +v 1365 +k 1416 +d 1290 +p 1103 +m 1232 +z 764 +u 694 +b 782 +g 541 +h 361 +č 629 +c 233 +š 417 +ž 292 +f 25 +0 618 +1 449 +2 293 +9 188 +y 1 +5 163 +4 149 +w 5 +8 144 +3 136 +6 132 +. 130 +- 4 +7 87 +– 2 +/ 39 +, 30 +q 3 +x 1 +» 21 +« 21 +: 17 +' 1 +% 10 +ü 9 +á 7 +ú 4 +õ 4 +; 3 +ç 3 +é 3 ++ 3 +ć 3 +ö 2 +° 2 +² 2 +í 2 +& 1 +[ 1 +] 1 +ó 1 +" 1 diff --git a/mms-1b-all/sna/lexicon.txt b/mms-1b-all/sna/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..49dda76d1e8dc7da2689168a01f997c4b968af87 --- /dev/null +++ b/mms-1b-all/sna/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/sna/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/sna/tokens.txt b/mms-1b-all/sna/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecde40d1ac09c074870e964e71727202d2aead36 --- /dev/null +++ b/mms-1b-all/sna/tokens.txt @@ -0,0 +1,61 @@ +a 115378 +| 90332 +i 59438 +n 44522 +e 35199 +u 48213 +k 40058 +r 30002 +o 31963 +m 26557 +v 27120 +z 19374 +h 17886 +t 18567 +s 18058 +d 17777 +w 18325 +y 12055 +g 10788 +p 11033 +c 7765 +b 7188 +f 2681 +l 3 +j 2052 +0 580 +1 26 +2 1 +9 197 +5 145 +4 7 +3 132 +8 129 +6 119 +- 72 +. 96 +7 82 +x 74 +, 45 +' 20 +q 6 +: 30 +/ 25 +$ 10 +? 6 +; 5 +% 5 +ü 5 +° 4 +" 4 +! 4 +& 4 +ç 3 +á 3 +[ 2 +] 2 +_ 2 +ã 2 +ú 1 +é 1 +ʼ 51 diff --git a/mms-1b-all/snd/lexicon.txt b/mms-1b-all/snd/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..d9e05d60d3274fa6896b97152d5ed8cf92a034e2 --- /dev/null +++ b/mms-1b-all/snd/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/snd/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/snd/tokens.txt b/mms-1b-all/snd/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..4bce6eeb26c745aa730672e836dd2f0487bedd7d --- /dev/null +++ b/mms-1b-all/snd/tokens.txt @@ -0,0 +1,124 @@ +| 82920 +ي 42226 +ا 31208 +ن 26621 +و 23735 +ر 18266 +ه 16845 +ڪ 11430 +ج 11222 +م 9934 +ل 9910 +ت 8976 +س 8738 +د 5310 +پ 4846 +ب 4698 +ئ 4648 +ٽ 4261 +گ 3760 +ڻ 3706 +آ 3617 +ک 3539 +ٿ 3040 +ف 2686 +۾ 2421 +ش 2402 +ق 2171 +ع 2072 +ڏ 2011 +ز 1903 +ڊ 1873 +۽ 1858 +ح 1612 +چ 1527 +ص 1350 +خ 1333 +ء 1264 +ہ 1258 +ِ 1258 +ڙ 1228 +ط 1228 +ٻ 1087 +0 857 +ڳ 853 +ڌ 788 +1 643 +ڀ 608 +ٺ 556 +، 537 +ث 458 +ظ 456 +ڇ 416 +2 409 +ض 375 +ذ 358 +ڃ 333 +غ 290 +9 279 +ُ 258 +5 251 +ڍ 224 +4 224 +ڄ 224 +3 212 +6 203 +8 189 +- 173 +ھ 166 +. 155 +a 146 +7 140 +c 113 +s 105 +p 88 +i 87 +o 78 +e 77 +ڦ 76 +َ 70 +t 69 +u 68 +n 68 +ٰ 58 +d 54 +m 53 +r 53 +g 51 +: 51 +b 49 +l 40 +v 36 +h 33 +ڱ 33 +/ 31 +f 30 +, 29 +ً 26 +ی 20 +– 18 +— 17 +k 16 +” 16 +w 16 +y 13 +% 11 +‘ 9 +’ 9 +x 9 +؛ 8 +z 7 +j 6 +q 6 +" 6 +¥ 6 +õ 6 ++ 5 +° 5 +ڈ 3 +' 3 +² 2 +؟ 2 +! 2 +ـ 2 +& 2 diff --git a/mms-1b-all/som/lexicon.txt b/mms-1b-all/som/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..f45eac7eba2a59b472a5a1414b1b2fffaac2e7ef --- /dev/null +++ b/mms-1b-all/som/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/som/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/som/tokens.txt b/mms-1b-all/som/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..59a6714fe73fbc3dae630b9bc602142fbc16ad75 --- /dev/null +++ b/mms-1b-all/som/tokens.txt @@ -0,0 +1,69 @@ +a 336614 +| 246831 +i 131403 +o 91506 +d 79824 +n 84039 +y 68058 +e 48989 +u 70156 +s 46797 +h 47054 +k 45497 +r 34453 +l 42187 +b 28005 +g 31106 +m 30088 +t 21301 +w 31487 +x 23309 +c 11642 +q 10343 +f 4927 +j 6794 +0 790 +1 591 +p 584 +2 357 +- 325 +v 265 +9 254 +' 1420 +5 205 +4 191 +3 183 +. 173 +6 166 +8 162 +, 157 +z 154 +7 123 +: 52 +/ 46 +’ 30 +” 23 +$ 18 +ü 13 +" 11 +; 11 +% 8 +& 6 +¥ 6 +‘ 5 ++ 5 +ú 4 +° 4 +á 4 +​ 4 +! 3 +£ 3 +_ 3 +— 3 +[ 3 +] 3 +ç 3 +² 2 +ö 2 +í 2 +é 2 diff --git a/mms-1b-all/spa/lexicon.txt b/mms-1b-all/spa/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..a19dacf3e98d5bf5069306458c7980848a7a9297 --- /dev/null +++ b/mms-1b-all/spa/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/spa/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/spa/tokens.txt b/mms-1b-all/spa/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e67538038b448cdf043479c3f3bbdeccb53d0e7 --- /dev/null +++ b/mms-1b-all/spa/tokens.txt @@ -0,0 +1,239 @@ +| 1554514 +e 918851 +a 718793 +o 665140 +s 596211 +n 449635 +r 431493 +i 349530 +l 357824 +d 349211 +t 272412 +c 232527 +u 297328 +m 178471 +p 172658 +g 73409 +b 95751 +v 64378 +f 36063 +q 95232 +h 74062 +y 83495 +ó 39198 +í 47404 +á 36509 +j 50806 +z 20790 +é 34214 +x 3301 +0 474 +ñ 14621 +1 106 +k 65 +ú 13918 +2 88 +» 235 +« 233 +9 18 +w 7681 +5 44 +4 72 +3 20 +6 22 +. 153 +8 10 +7 18 +: 81 +- 4650 +ü 292 +% 25 +; 23 +/ 22 +¿ 22 +? 22 +, 20 +¡ 11 +! 11 +' 77 +$ 10 +° 7 +õ 7 ++ 5 +ç 3 +[ 3 +] 3 +ö 134 +& 2 +º 2 +ł 22 +ō 93 +ã 92 +— 2954 +ä 47 +ë 44 +– 2605 +́ 38 +ū 33 +â 30 +ø 28 +š 21 +ā 21 +ê 19 +ô 18 +ć 17 +ə 15 +ş 14 +å 13 +č 13 +ß 13 +ð 10 +ʻ 10 +ï 10 +ò 9 +æ 9 +л 8 +и 8 +е 8 +ː 8 +ś 8 +ý 7 +р 7 +ž 7 +` 7 +ī 6 +а 6 +ń 6 +î 6 +ė 6 +̈ 6 +œ 5 +đ 5 +þ 5 +′ 5 +ă 5 +ő 4 +ę 4 +ְ 4 +ź 4 +ř 4 +ì 4 +в 4 +б 3 +т 3 +н 3 +к 3 +о 3 +ě 3 +ו 3 +à 3 +û 3 +ș 3 +ਸ 3 +ਾ 3 +ү 2 +ш 2 +ʿ 2 +כ 2 +נ 2 +ל 2 +ש 2 +ḫ 2 +ё 2 +̇ 2 +♭ 2 +ù 2 +г 2 +с 2 +ь 2 +ч 2 +д 2 +ゴ 2 +ı 2 +‐ 2 +ț 2 + 2 +ŏ 1 +毵 1 +ө 1 +п 1 +ы 1 +⃗ 1 +飢 1 +飧 1 +飲 1 +ת 1 +ִ 1 +ס 1 +מ 1 +ֵ 1 +ּ 1 +ָ 1 +ֹ 1 +ם 1 +鮨 1 +鮓 1 +下 1 +妻 1 +市 1 +仙 1 +ϙ 1 +の 1 +ṃ 1 +й 1 +ら 1 +ラ 1 +良 1 +ミ 1 +箱 1 +消 1 +し 1 +ム 1 +戌 1 +山 1 +口 1 +真 1 +生 1 +̪ 1 +ē 1 +ذ 1 +ه 1 +ب 1 +ي 1 +ة 1 +寛 1 +裕 1 +浩 1 +ṁ 1 +ʽ 1 +申 1 +ב 1 +ר 1 +ק 1 +ṇ 1 +日 1 +本 1 +י 1 +ה 1 +肋 1 +肌 1 +背 1 +ਆ 1 +ਿ 1 +ੰ 1 +ਘ 1 +ਮ 1 +ਤ 1 +ਨ 1 +周 1 +ğ 1 +ю 1 +я 1 +网 1 +罒 1 +罓 1 +㓁 1 +貧 1 +貨 1 +販 1 +­ 1253 diff --git a/mms-1b-all/srp-script_latin/lexicon.txt b/mms-1b-all/srp-script_latin/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..13007b0f80426fd25e5c52bd97eb489d906af40f --- /dev/null +++ b/mms-1b-all/srp-script_latin/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/srp/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/srp-script_latin/tokens.txt b/mms-1b-all/srp-script_latin/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..b969d636f3dee6dcf4eda9154dbee0bccb9612fe --- /dev/null +++ b/mms-1b-all/srp-script_latin/tokens.txt @@ -0,0 +1,92 @@ +| 57933 +a 31823 +i 25880 +o 25305 +e 25116 +n 17752 +r 13539 +s 12998 +t 12904 +u 12208 +j 12041 +k 10027 +d 9742 +v 9428 +l 9326 +m 8752 +p 7887 +z 5047 +g 4640 +b 4042 +а 3570 +е 2906 +и 2873 +о 2776 +č 2641 +š 2355 +c 2263 +н 1895 +h 1777 +ž 1652 +р 1529 +т 1386 +ć 1381 +у 1345 +с 1307 +д 1147 +к 1094 +в 1016 +л 999 +ј 981 +м 947 +п 919 +f 879 +đ 682 +0 658 +з 512 +1 507 +г 487 +б 454 +. 443 +2 311 +ч 308 +ш 257 +ц 252 +њ 236 +9 212 +љ 194 +ж 192 +5 177 +- 168 +х 164 +4 160 +6 151 +3 149 +8 141 +ћ 125 +ф 116 +7 104 +ђ 56 +/ 47 +: 45 +, 28 +џ 17 +” 10 +w 9 +% 8 +x 7 +​ 6 +; 5 +° 4 +y 4 +õ 4 +' 3 +[ 3 +] 3 +! 2 ++ 2 +² 2 +q 2 +— 2 +– 1 +× 1 diff --git a/mms-1b-all/swe/lexicon.txt b/mms-1b-all/swe/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..29c96bcdbfdc96f262d1f089a15919a80a588310 --- /dev/null +++ b/mms-1b-all/swe/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/swe/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/swe/tokens.txt b/mms-1b-all/swe/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d54e2515009ad8d535fea92548038150fced73b --- /dev/null +++ b/mms-1b-all/swe/tokens.txt @@ -0,0 +1,70 @@ +| 395466 +e 156850 +a 154199 +r 136745 +t 124160 +n 133787 +s 104284 +i 82526 +l 84040 +d 95754 +o 79707 +m 63798 +k 41452 +g 55732 +v 38540 +f 35981 +ä 41670 +p 20049 +u 34015 +h 58853 +å 30067 +ö 28928 +b 18406 +c 25907 +y 6400 +j 19171 +" 736 +0 610 +1 406 +- 134 +. 359 +x 447 +2 2 +w 112 +9 168 +5 155 +4 142 +3 142 +z 12 +6 8 +8 115 +7 2 +: 62 +, 60 +q 16 +é 58 +/ 26 +' 34 +” 6 +; 6 +í 5 +% 5 +! 5 +á 5 ++ 5 +õ 4 +& 4 +– 212 +ú 3 +° 3 +ç 2 +ó 2 +̇ 2 +[ 2 +] 2 +ô 1 +$ 1 +² 1 +? 1 +ã 1 diff --git a/mms-1b-all/swh/lexicon.txt b/mms-1b-all/swh/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..a39a821de025d83ad2d2fd3d15df28707b92855f --- /dev/null +++ b/mms-1b-all/swh/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/swh/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/swh/tokens.txt b/mms-1b-all/swh/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..c42b6d2d82479a3826dbfa7be3b70c03449737df --- /dev/null +++ b/mms-1b-all/swh/tokens.txt @@ -0,0 +1,75 @@ +a 117901 +| 101268 +i 59813 +u 39538 +n 37923 +k 38348 +m 30633 +e 29978 +w 30965 +o 22729 +h 15426 +t 18614 +l 18135 +s 13515 +y 17929 +r 5910 +z 6124 +b 10705 +d 7405 +g 7136 +p 6287 +j 5831 +f 4735 +c 2098 +v 3135 +0 68 +1 10 +. 463 +2 5 +9 210 +5 6 +4 12 +3 1 +6 1 +8 2 +- 47 +7 1 +, 87 +x 41 +: 62 +” 50 +" 46 +; 43 +’ 40 +' 9 +/ 30 +q 7 +? 23 +! 17 +‘ 13 +– 13 +á 10 +ü 9 +õ 6 +— 6 +% 5 +° 3 ++ 3 +ç 3 +[ 2 +] 2 +& 2 +º 2 +ú 2 +í 1 +é 1 +â 28 +ï 2 +ø 2 +μ 2 +ã 2 +å 1 +ū 1 +ó 1 +ʼ 81 diff --git a/mms-1b-all/tam/lexicon.txt b/mms-1b-all/tam/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..d177444235dc9f63cdfbc69d7e84f0f705b49f3d --- /dev/null +++ b/mms-1b-all/tam/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/tam/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/tam/tokens.txt b/mms-1b-all/tam/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..9014520422b22a2c854fcf2b3e4b94655d69f4c3 --- /dev/null +++ b/mms-1b-all/tam/tokens.txt @@ -0,0 +1,116 @@ +் 587195 +| 386709 +க 291883 +ு 254061 +ி 193785 +த 229346 +ப 136257 +ம 134139 +ட 102052 +ர 145662 +ா 134435 +ல 90284 +வ 131759 +ன 140989 +ற 80366 +ள 96319 +ை 86899 +ய 90635 +ச 66432 +ந 63639 +அ 47612 +ெ 25554 +ண 27067 +ே 57343 +இ 31582 +ோ 30040 +ங 28026 +ொ 22324 +எ 30295 +ழ 15852 +உ 21006 +ீ 21188 +ூ 11132 +ஆ 11166 +ஒ 7741 +ஸ 4654 +0 126 +" 547 +1 36 +ஜ 2421 +. 339 +ஏ 3624 +2 32 +- 598 +ஹ 2 +ஷ 1307 +ஃ 595 +9 2 +5 2 +4 26 +ஞ 1321 +3 4 +6 10 +8 2 +ஓ 676 +ஐ 432 +7 2 +a 2 +o 866 +, 84 +m 381 +e 1190 +s 737 +ஈ 200 +t 688 +r 951 +i 856 +n 922 +c 412 +ஊ 827 +u 368 +p 244 +: 43 +h 489 +/ 32 +l 674 +k 196 +' 160 +b 319 +ௌ 38 +g 299 +f 148 +w 166 +d 402 +v 115 +$ 13 +x 32 +; 11 +y 235 +​ 10 +% 7 +! 5 +j 60 ++ 4 +q 12 +á 1 +z 56 +° 3 +? 2 +& 2 +’ 2 +² 2 +õ 2 +— 49 +[ 1 +] 1 +” 1 +– 4 +` 7 +ஔ 1 +š 1 +◯ 1 +ௗ 1 +ô 1 +é 1 +ഥ 1 diff --git a/mms-1b-all/tel/lexicon.txt b/mms-1b-all/tel/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..6d72f70fa30ea6b8937d7864b7af94f72f6a8097 --- /dev/null +++ b/mms-1b-all/tel/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/tel/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/tel/tokens.txt b/mms-1b-all/tel/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..30920bae1b5255063b46ad13349ff9efa324f58f --- /dev/null +++ b/mms-1b-all/tel/tokens.txt @@ -0,0 +1,121 @@ +| 487014 +్ 225489 +ి 258969 +ా 212375 +ు 264998 +ర 147371 +న 233362 +ం 142724 +ల 118641 +క 136532 +త 115076 +ప 101359 +వ 113780 +స 72216 +య 66449 +ద 89848 +మ 90511 +ట 48816 +చ 75725 +డ 81434 +ే 84197 +ో 56902 +గ 52999 +ీ 44798 +అ 43415 +ె 30513 +బ 16992 +జ 16635 +ై 12242 +ఉ 17026 +ూ 30232 +ధ 13560 +ొ 18138 +ష 13804 +శ 18243 +ణ 12454 +ఇ 9017 +ఆ 21133 +భ 11851 +హ 9866 +ఎ 8084 +ఫ 1361 +థ 3579 +ఒ 5499 +" 520 +0 20 +ళ 25246 +1 7 +ఈ 4982 +ృ 2994 +a 277 +ఏ 3409 +2 4 +ఖ 1943 +ౌ 2030 +. 219 +s 197 +e 194 +i 180 +౦ 173 +r 157 +o 156 +9 150 +n 148 +3 138 +u 137 +t 135 +5 1 +4 4 +l 118 +8 117 +- 17 +m 113 +6 7 +c 106 +ఓ 707 +h 100 +ఐ 328 +7 1 +ఘ 1267 +g 75 +‌ 73 +d 68 +p 66 +b 63 +f 61 +ఛ 297 +k 48 +, 46 +y 44 +w 31 +/ 31 +: 28 +ఊ 396 +ఞ 1994 +v 26 +ఠ 435 +' 192 +z 15 +j 13 +ఢ 132 +​ 10 +$ 9 +ఔ 94 +; 7 +x 7 +% 7 +q 5 +õ 4 +° 4 ++ 4 +ç 3 +ఋ 56 +ః 283 +! 1 +² 1 +? 1 +– 757 +ఱ 416 +ఝ 10 +ౄ 1 diff --git a/mms-1b-all/tgk/lexicon.txt b/mms-1b-all/tgk/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..78471cfc845c958d3632bdcf69eaec9708eba853 --- /dev/null +++ b/mms-1b-all/tgk/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/tgk/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/tgk/tokens.txt b/mms-1b-all/tgk/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..25d8c24f836bb3168b5a42f46f8cc268744112d1 --- /dev/null +++ b/mms-1b-all/tgk/tokens.txt @@ -0,0 +1,102 @@ +| 128868 +а 94399 +и 46644 +о 56677 +р 36270 +н 38255 +д 42162 +м 28579 +т 20809 +б 21845 +у 24659 +с 16449 +ҳ 16378 +к 17262 +е 16530 +в 15339 +л 8183 +ш 12292 +з 11675 +г 7267 +ф 5718 +қ 3682 +х 8207 +я 3021 +ҷ 3336 +ӣ 3116 +п 3395 +й 2569 +ё 3251 +ӯ 6174 +ъ 2104 +ч 2733 +0 546 +ғ 894 +1 423 +э 764 +- 274 +2 263 +a 225 +e 192 +9 184 +ю 320 +4 154 +5 154 +n 153 +s 151 +o 146 +t 144 +r 141 +6 137 +3 136 +i 127 +8 121 +ж 37 +l 94 +p 88 +c 88 +» 87 +7 87 +« 81 +. 74 +d 67 +m 65 +g 60 +u 58 +h 46 +: 43 +/ 42 +b 36 +w 34 +v 33 +k 31 +, 29 +f 27 +" 23 +y 21 +​ 20 +x 18 +‑ 18 +j 10 +' 9 +ь 742 +% 7 +z 6 +– 5 +° 5 +q 5 +щ 4 +№ 4 +õ 4 +; 4 +ц 3 ++ 3 +\ 2 +² 2 +! 2 +­ 2 +? 1 +[ 1 +] 1 +— 291 +‐ 63 diff --git a/mms-1b-all/tgl/lexicon.txt b/mms-1b-all/tgl/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..efb52da91a86459e779d3bb541c49fffd1e41d90 --- /dev/null +++ b/mms-1b-all/tgl/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/tgl/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/tgl/tokens.txt b/mms-1b-all/tgl/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..53da22afeefbbdd4e89217ce701cf6d51b5af8c0 --- /dev/null +++ b/mms-1b-all/tgl/tokens.txt @@ -0,0 +1,78 @@ +a 557687 +| 454525 +n 314476 +g 203053 +i 204946 +s 114918 +t 102658 +o 96337 +m 83903 +l 83126 +p 68962 +k 81679 +u 70660 +y 76259 +r 30483 +b 38646 +e 16404 +h 39230 +d 39150 +w 18221 +- 5671 +c 3086 +0 210 +1 98 +f 463 +v 252 +2 76 +' 5574 +9 9 +j 5561 +. 106 +5 18 +6 4 +z 337 +4 49 +8 5 +3 9 +7 4 +x 42 +, 44 +q 148 +: 26 +/ 21 +â 76 +ã 14 +€ 13 +á 170 +” 6 +$ 5 +% 5 +­ 3 +; 3 +& 3 +" 3 +˜ 3 +™ 3 +¥ 3 +! 2 +í 18 +£ 2 +œ 2 +¼ 2 +§ 2 ++ 1 +° 1 +• 1 +µ 1 +¶ 1 +º 1 +¡ 1 +ʼ 1203 +– 50 +— 46 +ô 16 +ó 8 +ú 8 +ñ 3 +î 2 diff --git a/mms-1b-all/tha/lexicon.txt b/mms-1b-all/tha/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/mms-1b-all/tha/lexicon.txt @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/mms-1b-all/tha/tokens.txt b/mms-1b-all/tha/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f1184eaa17ff86ab24e9a38e0f13a6e80b887cc --- /dev/null +++ b/mms-1b-all/tha/tokens.txt @@ -0,0 +1,129 @@ +า 220826 +น 159062 +่ 142369 +ร 141792 +ก 96368 +อ 118087 +ง 117827 +เ 125761 +้ 125386 +ม 77789 +ี 56821 +ั 80141 +ว 85140 +ย 69844 +ล 61427 +ท 74434 +ด 49870 +ส 42644 +ต 50653 +ิ 46641 +ห 49476 +บ 40508 +ะ 83143 +ป 39004 +ค 55908 +แ 43110 +จ 57104 +ใ 31463 +ข 48389 +ื 30036 +ไ 34746 +พ 73370 +์ 26219 +ช 19395 +ู 31688 +็ 30708 +ุ 18755 +ำ 2227 +โ 13302 +ึ 15877 +ถ 12742 +ซ 8251 +ศ 4909 +ผ 13224 +ณ 4847 +ภ 3124 +ษ 4842 +ญ 10266 +a 634 +0 150 +ธ 6969 +ฟ 3414 +e 455 +1 54 +ๆ 439 +o 434 +ฐ 1430 +r 397 +n 387 +2 38 +s 327 +i 326 +t 280 +l 262 +. 253 +ฉ 2207 +5 207 +ฝ 1752 +ฮ 704 +c 199 +ฤ 1184 +u 182 +4 30 +h 172 +3 170 +9 162 +d 157 +p 153 +m 150 +6 2 +ฎ 222 +g 137 +8 136 +k 111 +ฏ 943 +7 97 +b 92 +f 87 +ฒ 2 +, 80 +y 76 +w 65 +- 20 +๊ 74 +v 57 +ฬ 8 +j 39 +ฆ 681 +/ 33 +๋ 387 +x 26 +' 96 +z 26 +ฑ 56 +q 14 +ฯ 12 +% 10 +á 8 +ํ 15203 +ü 5 +° 4 +õ 4 +ú 4 +í 4 +​ 4 +[ 3 +] 3 +: 3 +é 2 +” 2 +" 2 +ó 2 ++ 2 +& 2 +– 2 +ฌ 4 +ç 1 +ł 1 +ๅ 8 diff --git a/mms-1b-all/tur/lexicon.txt b/mms-1b-all/tur/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..73d96db72acb64f7f08ac6266bceaaafa4932bb7 --- /dev/null +++ b/mms-1b-all/tur/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/tur/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/tur/tokens.txt b/mms-1b-all/tur/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6c78494b479ada7ae74a1aaefeb64eafe68f18f --- /dev/null +++ b/mms-1b-all/tur/tokens.txt @@ -0,0 +1,72 @@ +| 451802 +a 319357 +e 270133 +i 252347 +n 229689 +r 189105 +l 180905 +ı 132368 +k 114922 +d 132249 +t 89265 +y 96210 +s 93979 +m 91697 +u 96669 +o 59551 +b 70399 +ü 57461 +ş 41550 +g 34850 +z 51636 +v 29931 +ç 27442 +h 40035 +ğ 31860 +c 24767 +p 20848 +ö 29453 +f 9568 +' 23782 +0 235 +1 79 +j 731 +̇ 9233 +2 65 +. 251 +9 162 +w 14 +5 152 +4 28 +3 2 +6 6 +8 123 +7 81 +- 42 +x 4 +/ 32 +, 27 +: 22 +q 2 +; 13 +â 3404 +% 11 +’ 11 +" 6 +í 6 +¥ 4 +” 4 +– 253 +ó 2 +ã 2 +é 2 +° 2 +ú 2 +— 2 +! 2 +? 1 ++ 1 +ł 1 +î 1090 +û 171 +á 1 diff --git a/mms-1b-all/ukr/lexicon.txt b/mms-1b-all/ukr/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..40bf7820b4459745d458cb651d9281cd8e67af7e --- /dev/null +++ b/mms-1b-all/ukr/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/ukr/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/ukr/tokens.txt b/mms-1b-all/ukr/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f8e5bf3e07cddfe32ab9496019809a5b3b8f21f --- /dev/null +++ b/mms-1b-all/ukr/tokens.txt @@ -0,0 +1,94 @@ +| 111394 +о 52473 +а 39370 +н 28662 +і 28335 +и 33691 +т 25161 +в 31908 +р 17967 +е 22609 +с 22652 +к 12342 +л 16803 +д 17459 +у 15465 +м 15307 +п 13429 +з 9998 +я 11283 +ь 7591 +б 12111 +г 9654 +ч 5761 +й 8321 +х 7119 +ю 4159 +ж 5582 +ц 2842 +ш 3878 +ї 2957 +є 2951 +щ 4293 +ф 287 +0 635 +- 3 +1 520 +' 611 +2 304 +9 233 +5 190 +4 168 +6 165 +3 164 +8 162 +7 114 +. 78 +a 2 +n 1 +r 67 +e 2 +i 246 +c 1 +o 46 +p 46 +: 42 +/ 39 +t 37 +ґ 37 +h 35 +l 1 +g 30 +, 29 +b 27 +— 1782 +s 24 +u 1 +k 22 +v 20 +d 20 +m 1 +y 13 +– 99 +f 9 +% 8 +№ 8 +w 8 +x 1 +​ 6 +; 5 +j 4 +" 4 ++ 3 +z 3 +õ 2 +q 2 +[ 1 +] 1 +ú 1 +° 1 +& 1 +! 1 +² 1 +́ 5 +‐ 46 diff --git a/mms-1b-all/umb/lexicon.txt b/mms-1b-all/umb/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..7cf57ac8cae0df8a11a980b4940b06b04d214dd2 --- /dev/null +++ b/mms-1b-all/umb/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/umb/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/umb/tokens.txt b/mms-1b-all/umb/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a56faef78d60720267797edee68adff2799c025 --- /dev/null +++ b/mms-1b-all/umb/tokens.txt @@ -0,0 +1,71 @@ +| 23958 +a 22708 +o 16657 +i 12178 +l 11685 +e 10837 +k 9705 +n 8138 +u 6843 +v 6754 +w 5819 +t 5543 +y 5498 +m 4539 +s 3947 +c 3280 +d 2690 +p 2628 +g 2381 +b 1512 +' 1473 +h 1003 +r 943 +j 817 +f 787 +ã 652 +ĩ 353 +- 311 +õ 293 +ñ 287 +ẽ 233 +0 209 +1 164 +’ 126 +2 98 +z 72 +9 61 +é 60 +á 60 +3 60 +5 56 +4 56 +q 53 +6 52 +â 45 +8 44 +x 40 +. 36 +7 34 +ó 31 +í 29 +ç 23 +ũ 19 +: 17 +/ 15 +, 12 +ú 12 +º 11 +” 10 +ê 8 +ῖ 6 +% 5 +; 3 +– 3 +` 1 +! 1 +& 1 +² 1 +$ 1 +ô 1 +? 1 diff --git a/mms-1b-all/urd-script_arabic/lexicon.txt b/mms-1b-all/urd-script_arabic/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..dbda65c1b0fa6e5aeae9c26a75c57e68422d76fb --- /dev/null +++ b/mms-1b-all/urd-script_arabic/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/urd/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/urd-script_arabic/tokens.txt b/mms-1b-all/urd-script_arabic/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d74888233af85c0bb6c189ed3783f16721e3fd6 --- /dev/null +++ b/mms-1b-all/urd-script_arabic/tokens.txt @@ -0,0 +1,123 @@ +| 540143 +ا 209414 +ی 157099 +ک 122035 +ر 104294 +و 126912 +ے 108823 +ن 85263 +ہ 114956 +م 73549 +ت 68443 +س 71300 +ل 49107 +ب 45991 +ں 53425 +د 47311 +پ 30113 +ج 28337 +گ 25502 +ھ 36577 +ئ 18833 +ٹ 5023 +ق 11598 +ع 14980 +ف 9934 +ز 10005 +ش 12575 +ح 12006 +چ 13130 +ط 6204 +خ 14728 +ص 5509 +ڈ 1273 +آ 10713 +ڑ 5117 +0 135 +ض 2508 +ث 1053 +1 24 +ظ 1510 +غ 2508 +ذ 1059 +ُ 259 +۔ 255 +، 247 +2 29 +9 8 +5 12 +ِ 141 +4 23 +3 7 +ؤ 2354 +6 3 +ء 143 +8 4 +ّ 97 +7 7 +ً 90 +- 42 +َ 42 +ٰ 1508 +. 30 +: 26 +/ 19 +' 43 +a 18 +” 17 +ژ 52 +" 15 +, 14 +u 10 +s 10 +n 9 +i 8 +‏ 8 +y 7 +o 6 +ۃ 8 +ۂ 13 +d 5 +l 5 +؛ 4 +% 4 +c 4 +p 1 +e 4 +b 1 +إ 3 +[ 3 +] 3 +! 3 +ٍ 3 +° 3 +m 3 +f 3 +w 3 +ﷺ 2 +g 2 +​ 2 +$ 2 +x 2 +j 2 +h 2 +ٖ 2 +õ 2 +k 2 +— 12 +ي 23 +٪ 1 +ه 1 +r 1 +ٴ 1 +ٔ 4 +ؔ 797 +أ 34 +۰ 8 +– 5 +۴ 3 +۷ 3 +۲ 2 +۵ 2 +۳ 1 +۱ 1 diff --git a/mms-1b-all/uzb-script_latin/lexicon.txt b/mms-1b-all/uzb-script_latin/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..8545c0b5e1c69c65d521783212cca3237be4a24b --- /dev/null +++ b/mms-1b-all/uzb-script_latin/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/uzb/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/uzb-script_latin/tokens.txt b/mms-1b-all/uzb-script_latin/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5d5fcf835ac7e25389127bdb79306eca0ce4fb4 --- /dev/null +++ b/mms-1b-all/uzb-script_latin/tokens.txt @@ -0,0 +1,109 @@ +a 241314 +| 248492 +i 212551 +n 112301 +o 112012 +l 91207 +r 86718 +s 75537 +h 68368 +t 64205 +d 67992 +g 57898 +u 52876 +y 48527 +m 61200 +k 44891 +b 56524 +e 39297 +q 39421 +ʻ 5424 +v 16508 +z 27003 +c 17290 +p 8923 +f 7033 +j 7737 +x 9186 +- 4567 +0 663 +' 26037 +ʼ 1365 +1 520 +2 324 +9 238 +5 195 +4 179 +3 167 +6 163 +8 149 +7 110 +. 63 +/ 54 +" 46 +w 27 +: 34 +— 1320 +, 31 +– 1156 +% 10 +; 7 +õ 10 +¥ 6 +² 4 ++ 3 +[ 2 +] 2 +х 2 +! 2 + 2 +° 1 +& 1 +` 4861 +а 107 +и 31 +́ 27 +р 26 +н 22 +м 21 +т 20 +о 19 +у 18 +\ 14 +с 13 +г 10 +қ 10 +к 10 +б 8 +ҳ 7 +ў 6 +л 6 +д 6 +ч 5 +е 5 +ʙ 5 +з 5 +й 4 +ó 4 +ғ 4 +‒ 4 +ģ 4 +ò 4 +в 3 +ь 3 +ӯ 3 +ш 2 +✔ 2 +️ 2 +э 2 +ј 2 +ƣ 1 +μ 1 +ş 1 +п 1 +ё 1 +ы 1 +♂ 1 +ë 1 +ı 1 +ƶ 1 diff --git a/mms-1b-all/vie/lexicon.txt b/mms-1b-all/vie/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..04e162ca13689486341fe0e0d9d4a9659470b5f7 --- /dev/null +++ b/mms-1b-all/vie/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/vie/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/vie/tokens.txt b/mms-1b-all/vie/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..aebf64bbee57675a5f561b2848450a408e4230b9 --- /dev/null +++ b/mms-1b-all/vie/tokens.txt @@ -0,0 +1,133 @@ +| 818202 +n 333020 +h 258973 +t 171625 +c 179580 +i 189947 +g 170076 +a 105841 +u 57742 +đ 86757 +o 53636 +m 71614 +à 66958 +r 48633 +v 50256 +l 57094 +ư 57975 +b 30429 +y 46715 +á 34676 +k 32243 +s 36823 +p 28313 +d 15938 +ế 26134 +ộ 13262 +ạ 18326 +e 19164 +ó 23442 +ô 40603 +ả 16031 +ố 13427 +ê 29048 +ấ 19380 +ể 7899 +ờ 36841 +ớ 15768 +ệ 8932 +ủ 10803 +ợ 12165 +ề 15803 +â 12790 +q 6947 +ị 9048 +ầ 12577 +ơ 12507 +ậ 12694 +ã 16425 +ữ 11464 +ự 8428 +ì 23396 +ở 8507 +x 12161 +ă 5569 +í 7325 +ứ 20048 +ọ 12089 +ặ 4891 +ụ 4019 +ú 26399 +0 91 +ắ 5346 +ổ 2678 +ù 5377 +ừ 6200 +1 39 +ồ 6878 +ò 7045 +ử 2628 +ằ 5747 +ẽ 6141 +ỏ 4010 +2 18 +ũ 4652 +ý 1620 +ỉ 2253 +f 2 +w 255 +9 248 +ễ 1309 +é 4265 +ĩ 909 +ẫ 1411 +5 8 +3 4 +4 14 +- 52839 +ẩ 705 +j 1524 +6 4 +ỗ 2869 +8 169 +. 157 +z 1 +ẻ 4999 +7 1 +ỳ 518 +ỷ 542 +è 1566 +ỹ 71 +ỡ 1463 +ẹ 1291 +/ 64 +ẳ 3125 +õ 762 +, 42 +: 35 +ẵ 288 +' 133 +% 17 +” 16 +< 12 +> 12 +ü 10 +; 9 +ỵ 61 +’ 7 +$ 6 +‘ 5 +° 4 ++ 4 +" 4 +¥ 3 +ç 2 +[ 2 +] 2 +² 2 +& 2 +! 2 +— 213 +ö 1 +– 136 +‐ 1407 diff --git a/mms-1b-all/wol/lexicon.txt b/mms-1b-all/wol/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..683410c2fb69f0e56ebda19e90596686e3bf0b0c --- /dev/null +++ b/mms-1b-all/wol/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/wol/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/wol/tokens.txt b/mms-1b-all/wol/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..84adbae41646daed16cf71ecfde33e45d7fef6eb --- /dev/null +++ b/mms-1b-all/wol/tokens.txt @@ -0,0 +1,83 @@ +| 52514 +a 28245 +i 18364 +n 15864 +e 15346 +u 12730 +o 12603 +l 9211 +y 8257 +t 8085 +k 8002 +b 7709 +r 7689 +m 7395 +d 6696 +g 6304 +s 5357 +c 5022 +w 4399 +j 3258 +x 3016 +p 2894 +ñ 2833 +ë 2665 +f 2628 +é 2615 +à 1247 +h 688 +- 335 +v 324 +0 272 +ó 214 +1 189 +q 147 +2 108 +z 101 +ŋ 99 +5 78 +3 76 +6 66 +4 65 +9 59 +8 56 +. 47 +' 38 +, 35 +è 33 +7 26 +: 25 +’ 23 +/ 20 +ê 19 +” 16 +ç 15 +ü 13 +« 11 +» 11 +% 10 +ï 10 +; 10 +â 9 +‘ 8 +ô 8 +$ 8 +õ 6 +ã 5 +á 4 +í 4 +< 4 +> 4 +" 3 +² 2 +? 2 +& 2 +ú 2 +– 2 ++ 2 +° 2 +î 2 +— 1 +[ 1 +] 1 +ù 1 diff --git a/mms-1b-all/xho/lexicon.txt b/mms-1b-all/xho/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..f3d1c973b2c0adc9fe952147c546dca0f83ee7b0 --- /dev/null +++ b/mms-1b-all/xho/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/xho/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/xho/tokens.txt b/mms-1b-all/xho/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e03cd0ca3025a8843bcb78f8a8b7cea74241fa9 --- /dev/null +++ b/mms-1b-all/xho/tokens.txt @@ -0,0 +1,72 @@ +a 53161 +| 52385 +e 40950 +i 37556 +n 33747 +u 26626 +k 24339 +o 24306 +l 22475 +h 16444 +s 13866 +b 12911 +y 12829 +w 11891 +m 11727 +z 11242 +t 10586 +g 9964 +d 5926 +p 4594 +r 3867 +c 3367 +- 2896 +q 2888 +f 2550 +x 2035 +v 1609 +j 1487 +0 862 +1 652 +2 403 +9 281 +5 242 +4 228 +. 217 +3 210 +6 198 +8 191 +7 131 +, 128 +' 76 +: 55 +/ 44 +$ 20 +; 18 +” 17 +á 13 +% 10 +" 9 +ü 9 +í 8 +õ 6 +¥ 6 +— 6 +& 5 +° 5 ++ 5 +é 4 +ú 4 +ã 3 +ç 3 +– 3 +[ 3 +] 3 +£ 3 +² 2 +‘ 2 +̇ 2 +ó 2 +’ 2 +ö 2 +! 2 diff --git a/mms-1b-all/yor/lexicon.txt b/mms-1b-all/yor/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..44ee75929c5fdf35381679ed658a35df5cf973c9 --- /dev/null +++ b/mms-1b-all/yor/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed2/yor/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/yor/tokens.txt b/mms-1b-all/yor/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..4316582cdabd41bd179b2f410e85349aa4d4ea7c --- /dev/null +++ b/mms-1b-all/yor/tokens.txt @@ -0,0 +1,90 @@ +| 164509 +n 52754 +i 28076 +a 20622 +ọ 32373 +t 21710 +í 36708 +l 16071 +o 12236 +à 21828 +r 20377 +b 19135 +e 7872 +ẹ 21784 +k 14927 +w 16858 +s 13612 +ì 17751 +á 14592 +̀ 15921 +g 11980 +́ 16281 +p 8189 +d 6634 +m 10964 +j 7670 +u 7951 +ó 10101 +ú 13379 +y 10283 +ò 7564 +f 7216 +é 6711 +è 6416 +ṣ 6804 +ù 5051 +h 2291 +c 758 +ń 2992 +0 305 +- 863 +1 251 +. 226 +v 216 +2 169 +ǹ 674 +9 154 +z 97 +8 89 +4 87 +5 87 +6 84 +, 78 +3 71 +x 56 +7 51 +” 49 +ę 35 +' 11 +: 29 +q 28 +´ 25 +; 25 +/ 18 +ḿ 15 +’ 10 +ë 10 +ü 9 +? 9 +— 12 +% 6 +̣ 6 +‘ 5 +õ 4 +\ 3 +ç 3 +$ 3 +} 2 +° 2 +[ 2 +] 2 +£ 2 ++ 2 +² 2 +! 2 +ï 2 +– 2 += 1 +ṛ 1 +ạ 1 diff --git a/mms-1b-all/yue-script_traditional/lexicon.txt b/mms-1b-all/yue-script_traditional/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..4af18322e32f3dd19579c80e26e4a306ad11e049 --- /dev/null +++ b/mms-1b-all/yue-script_traditional/lexicon.txt @@ -0,0 +1 @@ +None \ No newline at end of file diff --git a/mms-1b-all/yue-script_traditional/tokens.txt b/mms-1b-all/yue-script_traditional/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e17abd746c8a6e999953b534f310c781b4ccf64 --- /dev/null +++ b/mms-1b-all/yue-script_traditional/tokens.txt @@ -0,0 +1,2295 @@ +的 2581 +在 776 +一 661 +是 640 +有 588 +人 472 +為 415 +國 394 +0 377 +會 377 +1 362 +和 356 +地 352 +大 350 +時 338 +他 338 +這 335 +以 326 +了 324 +不 323 +能 316 +中 305 +可 293 +上 282 +、 279 +到 274 +多 268 +年 266 +們 256 +行 248 +成 242 +動 239 +個 238 +之 235 +於 235 +來 235 +要 229 +生 216 +或 210 +其 209 +法 209 +也 205 +用 200 +方 199 +2 198 +後 191 +常 189 +發 187 +物 186 +家 186 +與 186 +斯 184 +「 184 +些 183 +部 183 +過 183 +都 182 +」 179 +此 173 +而 173 +並 168 +出 168 +所 168 +但 165 +最 165 +自 164 +分 163 +學 160 +種 159 +因 156 +現 153 +公 152 +得 151 +表 151 +對 150 +亞 149 +球 149 +前 147 +名 146 +加 145 +間 145 +將 145 +9 143 +下 141 +事 141 +就 140 +比 139 +特 138 +更 137 +該 136 +西 135 +子 135 +作 135 +使 135 +經 134 +及 133 +當 131 +利 129 +度 129 +理 129 +體 128 +被 127 +月 125 +重 124 +同 124 +車 124 +開 123 +通 122 +者 122 +日 121 +定 121 +我 121 +里 121 +美 120 +從 120 +電 118 +你 118 +受 117 +數 117 +許 116 +著 115 +式 114 +文 114 +爾 113 +5 113 +4 113 +場 111 +世 110 +進 110 +主 109 +水 109 +如 108 +本 108 +說 108 +提 107 +a 107 +德 107 +力 107 +然 106 +克 106 +由 104 +達 104 +它 103 +區 102 +入 102 +英 102 +旅 101 +小 101 +語 101 +稱 100 +天 100 +示 100 +8 100 +外 99 +新 99 +機 99 +相 99 +已 98 +e 98 +拉 97 +3 97 +長 97 +至 97 +隊 96 +海 96 +第 96 +位 95 +賽 95 +面 95 +類 95 +造 94 +活 92 +手 91 +o 91 +道 91 +員 90 +馬 89 +原 89 +非 89 +蘭 88 +程 88 +意 87 +關 87 +果 87 +民 87 +運 86 +t 86 +正 86 +化 86 +期 86 +空 85 +變 85 +全 84 +明 83 +起 82 +洲 82 +內 82 +n 82 +6 82 +i 81 +接 81 +性 81 +次 80 +無 79 +影 79 +應 79 +羅 77 +目 77 +心 77 +選 76 +點 76 +議 76 +很 76 +量 76 +r 75 +確 75 +基 75 +巴 75 +且 75 +等 74 +風 74 +身 73 +沒 73 +需 72 +那 72 +太 72 +型 72 +高 72 +二 72 +路 71 +s 71 +持 71 +代 71 +必 71 +樣 71 +7 70 +界 70 +任 70 +軍 69 +帶 69 +少 69 +計 69 +組 68 +認 68 +知 68 +三 68 +立 68 +合 68 +聯 67 +: 67 +教 67 +只 67 +據 67 +直 67 +今 66 +約 66 +南 65 +速 65 +每 65 +查 65 +科 64 +報 64 +產 64 +保 64 +您 64 +業 64 +奧 63 +建 63 +結 63 +幾 63 +讓 63 +工 63 +維 62 +北 62 +流 62 +供 62 +指 62 +遊 62 +十 61 +院 61 +d 61 +獲 60 +較 60 +l 60 +網 59 +史 59 +看 59 +格 59 +近 59 +回 58 +金 58 +城 58 +· 58 +解 58 +政 58 +言 58 +單 58 +布 57 +傳 57 +戰 57 +島 57 +統 57 +件 57 +氣 57 +號 56 +尼 56 +實 56 +司 56 +山 55 +決 55 +票 55 +市 54 +阿 54 +光 54 +兩 54 +觀 54 +須 54 +始 54 +取 53 +去 53 +擊 53 +站 53 +客 53 +星 52 +打 52 +平 52 +雪 52 +住 51 +州 51 +處 51 +才 51 +導 51 +何 51 +未 51 +座 51 +像 51 +記 51 +頭 50 +試 50 +己 50 +管 50 +給 50 +歐 49 +務 49 +往 49 +c 49 +放 49 +季 49 +圖 49 +質 49 +題 49 +安 48 +‧ 48 +某 48 +士 48 +論 48 +口 48 +紀 47 +東 47 +改 47 +印 47 +義 47 +舉 47 +助 47 +還 46 +際 46 +堡 46 +則 46 +神 46 +交 46 +做 46 +案 46 +u 46 +邊 46 +形 46 +卡 46 +飛 45 +歷 45 +資 45 +狀 45 +好 45 +服 45 +準 45 +裡 44 +各 44 +病 44 +足 44 +節 44 +航 44 +先 44 +線 43 +份 43 +p 43 +h 43 +品 43 +m 43 +制 43 +片 43 +冰 43 +感 43 +根 43 +字 42 +極 42 +熱 42 +情 42 +別 42 +週 42 +傷 42 +續 42 +林 41 +費 41 +見 41 +察 41 +四 41 +響 41 +修 41 +控 41 +落 41 +連 40 +存 40 +再 40 +強 40 +反 40 +仍 40 +調 40 +告 40 +土 40 +價 39 +居 39 +g 39 +越 39 +包 39 +具 39 +預 39 +群 39 +府 39 +致 39 +證 39 +問 39 +波 38 +王 38 +萬 38 +食 38 +曾 38 +況 38 +系 38 +消 38 +警 38 +威 38 +鏡 38 +野 37 +伊 37 +支 37 +害 37 +隨 37 +領 36 +即 36 +想 36 +源 36 +船 36 +易 36 +視 36 +信 36 +規 36 +例 35 +裝 35 +備 35 +設 35 +輕 35 +甚 35 +局 35 +首 35 +班 35 +求 35 +向 35 +超 35 +爭 35 +團 35 +聲 35 +滑 35 +失 34 +花 34 +老 34 +遭 34 +態 34 +洛 34 +離 34 +訊 34 +希 33 +聖 33 +測 33 +古 33 +官 33 +適 33 +攻 33 +權 33 +顯 33 +細 33 +千 33 +器 33 +醫 33 +話 32 +演 32 +河 32 +低 32 +移 32 +請 32 +眾 32 +, 32 +參 31 +蘇 31 +術 31 +華 31 +油 31 +完 31 +故 31 +雖 31 +共 31 +遠 31 +屬 31 +級 31 +洋 31 +火 31 +音 31 +總 31 +難 31 +創 31 +納 31 +坦 31 +命 31 +五 31 +門 31 +龍 31 +望 30 +項 30 +死 30 +展 30 +險 30 +織 30 +協 30 +標 30 +樂 30 +抗 30 +養 30 +酒 30 +境 30 +普 29 +雷 29 +元 29 +律 29 +乎 29 +紐 29 +快 29 +研 29 +愛 29 +獨 29 +畫 29 +石 28 +便 28 +師 28 +步 28 +登 28 +推 28 +她 28 +育 28 +習 28 +夫 28 +收 28 +腦 28 +究 28 +夏 28 +排 28 +治 28 +微 28 +環 28 +; 27 +另 27 +商 27 +降 27 +健 27 +瑞 27 +塔 27 +辦 27 +陸 27 +園 27 +潛 27 +母 27 +社 26 +衛 26 +技 26 +般 26 +跟 26 +功 26 +斷 26 +廣 26 +麼 26 +周 26 +核 26 +構 26 +薩 26 +留 26 +《 26 +》 26 +庭 25 +早 25 +- 25 +否 25 +駛 25 +滿 25 +引 25 +含 25 +銀 25 +若 25 +容 25 +森 25 +待 25 +料 24 +宣 24 +陽 24 +照 24 +富 24 +白 24 +房 24 +景 24 +域 24 +製 24 +牠 24 +毒 24 +嚴 24 +牙 24 +找 24 +輪 24 +射 24 +令 24 +附 24 +伯 24 +投 24 +營 24 +驗 24 +條 24 +整 23 +哥 23 +駕 23 +止 23 +轉 23 +危 23 +初 23 +委 23 +澳 23 +息 23 +夠 23 +走 22 +防 22 +盟 22 +束 22 +限 22 +真 22 +透 22 +米 22 +購 22 +晚 22 +黑 22 +送 22 +序 22 +興 22 +鐘 22 +孩 22 +批 22 +置 22 +護 21 +店 21 +除 21 +廟 21 +藝 21 +y 21 +聞 21 +檢 21 +恐 21 +識 21 +短 21 +增 21 +露 21 +卻 21 +爆 21 +似 21 +效 21 +蓋 21 +避 21 +依 21 +拿 21 +塞 21 +括 21 +專 21 +b 21 +思 21 +差 21 +百 21 +樹 21 +餐 21 +碼 20 +女 20 +象 20 +簡 20 +底 20 +典 20 +缺 20 +什 20 +錄 20 +朝 20 +免 20 +遺 20 +率 20 +邦 20 +捕 20 +訴 20 +圍 20 +損 20 +埃 20 +充 20 +互 20 +又 20 +簽 20 +弓 20 +沙 19 +異 19 +終 19 +寫 19 +諾 19 +久 19 +胞 19 +清 19 +監 19 +摩 19 +列 19 +承 19 +艙 19 +優 19 +炸 19 +密 19 +艇 19 +把 19 +補 19 +攝 19 +模 19 +親 19 +飲 19 +. 19 +返 18 +奇 18 +念 18 +色 18 +角 18 +席 18 +濟 18 +階 18 +亡 18 +隻 18 +訂 18 +精 18 +俄 18 +颶 18 +農 18 +魯 18 +努 18 +游 18 +艦 18 +救 18 +切 18 +注 18 +木 18 +洞 18 +評 18 +敗 18 +館 17 +飯 17 +播 17 +佳 17 +雨 17 +兒 17 +帕 17 +? 17 +w 17 +買 17 +汽 17 +繼 17 +築 17 +廳 17 +康 17 +壓 17 +倫 17 +暴 17 +尋 17 +距 17 +段 17 +集 17 +溫 17 +額 17 +負 17 +耳 17 +戶 17 +艾 16 +競 16 +博 16 +亦 16 +染 16 +k 16 +七 16 +燈 16 +半 16 +醒 16 +符 16 +抵 16 +輸 16 +歲 16 +審 16 +勒 16 +編 16 +腳 16 +考 16 +宿 16 +冬 16 +付 16 +係 16 +算 16 +摺 16 +九 16 +索 16 +翻 16 +彈 16 +擁 15 +擇 15 +頓 15 +層 15 +右 15 +破 15 +壞 15 +猛 15 +儘 15 +深 15 +烈 15 +症 15 +啟 15 +沿 15 +蒙 15 +京 15 +六 15 +獵 15 +岩 15 +焦 15 +脅 15 +錢 15 +訓 15 +停 15 +幫 15 +憲 15 +守 14 +唯 14 +獎 14 +湖 14 +跑 14 +庫 14 +版 14 +複 14 +減 14 +判 14 +v 14 +% 14 +享 14 +攜 14 +書 14 +鬆 14 +灣 14 +換 14 +騎 14 +復 14 +震 14 +稅 14 +脈 14 +. 14 +荷 14 +嘯 14 +掉 14 +板 14 +隆 14 +略 14 +犯 14 +仰 14 +丹 14 +雜 14 +乘 14 +皇 14 +閉 14 +芬 14 +傑 14 +素 14 +億 14 +歡 14 +估 14 +哈 14 +寄 14 +譯 14 +恩 13 +男 13 +左 13 +跨 13 +私 13 +友 13 +患 13 +泰 13 +舊 13 +郵 13 +省 13 +梵 13 +殿 13 +僅 13 +台 13 +翰 13 +激 13 +蒂 13 +釋 13 +疾 13 +冠 13 +曼 13 +豹 13 +姆 13 +勝 13 +封 13 +匹 13 +施 13 +盧 13 +皆 13 +宗 13 +喜 13 +頂 13 +鹿 13 +詞 13 +積 13 +戲 13 +贏 13 +嫌 13 +准 13 +族 13 +爪 13 +違 13 +侵 13 +永 13 +坡 12 +麥 12 +彼 12 +劃 12 +媒 12 +迫 12 +臘 12 +觸 12 +童 12 +幅 12 +泡 12 +覺 12 +征 12 +盡 12 +探 12 +兵 12 +貨 12 +脫 12 +針 12 +延 12 +萊 12 +雲 12 +捲 12 +訪 12 +範 12 +雙 12 +尺 12 +室 12 +趣 12 +衝 12 +眼 12 +暫 12 +練 12 +劇 12 +述 12 +尤 12 +候 12 +味 12 +紙 12 +張 12 +惡 12 +貴 12 +托 12 +牛 12 +升 12 +陵 12 +魚 12 +拒 12 +弗 12 +幕 12 +丁 12 +筆 12 +禮 11 +退 11 +汗 11 +毛 11 +吉 11 +擔 11 +鄰 11 +液 11 +徒 11 +窗 11 +授 11 +/ 11 +遇 11 +宜 11 +霍 11 +武 11 +尚 11 +瓦 11 +良 11 +禁 11 +貓 11 +鬧 11 +順 11 +嶼 11 +殺 11 +拖 11 +採 11 +遷 11 +菌 11 +迪 11 +聽 11 +款 11 +吸 11 +緊 11 +揮 11 +讀 11 +八 11 +血 11 +骨 11 +瓜 11 +植 11 +慶 11 +責 11 +突 11 +絲 10 +援 10 +街 10 +徵 10 +絕 10 +弱 10 +願 10 +蟲 10 +薄 10 +校 10 +遍 10 +診 10 +困 10 +覆 10 +: 10 +岸 10 +申 10 +帝 10 +吃 10 +休 10 +夜 10 +裂 10 +端 10 +繞 10 +歌 10 +搭 10 +漢 10 +礙 10 +急 10 +陡 10 +賞 10 +搜 10 +予 10 +攀 10 +慮 10 +婚 10 +融 10 +冷 10 +迴 10 +泛 10 +俱 10 +莫 10 +咖 10 +毀 10 +罪 10 +災 10 +疑 10 +臺 10 +淨 10 +尿 10 +載 10 +值 9 +橋 9 +覽 9 +稍 9 +噴 9 +財 9 +漫 9 +靈 9 +途 9 +址 9 +福 9 +厚 9 +槍 9 +姿 9 +獄 9 +臨 9 +港 9 +柏 9 +執 9 +隱 9 +洩 9 +痛 9 +涉 9 +飾 9 +橫 9 +鮮 9 +撞 9 +背 9 +佔 9 +畢 9 +殖 9 +喬 9 +衡 9 +賴 9 +穿 9 +氏 9 +葉 9 +! 9 +善 9 +紅 9 +佛 9 +鳥 9 +倍 9 +署 9 +派 9 +濃 9 +寬 9 +草 9 +雅 9 +輛 9 +緣 9 +督 9 +伴 8 +駐 8 +瑪 8 +川 8 +谷 8 +架 8 +逐 8 +志 8 +嘗 8 +阻 8 +刻 8 +插 8 +追 8 +巨 8 +熟 8 +黨 8 +談 8 +賓 8 +莉 8 +夢 8 +課 8 +肯 8 +央 8 +皮 8 +怕 8 +靠 8 +亮 8 +藉 8 +浮 8 +鎮 8 +勢 8 +敦 8 +促 8 +徙 8 +鄉 8 +顆 8 +呈 8 +硬 8 +泳 8 +智 8 +丘 8 +坐 8 +虛 8 +祝 8 +側 8 +惠 8 +啡 8 +驚 8 +暗 8 +粒 8 +烏 8 +─ 8 +刺 8 +藏 8 +箱 8 +盛 8 +軌 8 +秒 8 +琴 8 +賣 7 +葛 7 +錯 7 +烹 7 +峽 7 +懼 7 +糟 7 +玻 7 +絡 7 +佩 7 +末 7 +憶 7 +幣 7 +瑟 7 +菲 7 +柔 7 +辛 7 +頻 7 +譽 7 +睡 7 +獅 7 +貝 7 +售 7 +靜 7 +剩 7 +囚 7 +描 7 +餘 7 +績 7 +颱 7 +句 7 +革 7 +吧 7 +輯 7 +療 7 +床 7 +卸 7 +舒 7 +貿 7 +軟 7 +葬 7 +寶 7 +鏢 7 +舞 7 +牧 7 +顧 7 +配 7 +奪 7 +亂 7 +・ 7 +墓 7 +戴 7 +跡 7 +跳 7 +疫 7 +操 7 +糖 7 +禍 7 +桶 7 +耶 7 +怪 7 +香 6 +拜 6 +撤 6 +婦 6 +煮 6 +屋 6 +揭 6 +韓 6 +菜 6 +散 6 +﹑ 6 +穩 6 +頒 6 +祕 6 +宇 6 +牌 6 +徑 6 +橄 6 +欖 6 +偏 6 +企 6 +憂 6 +勞 6 +頁 6 +汀 6 +恢 6 +拘 6 +殊 6 +熊 6 +稀 6 +握 6 +圈 6 +拍 6 +寓 6 +概 6 +午 6 +踢 6 +峭 6 +壁 6 +溜 6 +紛 6 +謝 6 +霄 6 +崎 6 +麻 6 +馴 6 +晨 6 +旋 6 +衣 6 +旗 6 +鞋 6 +碎 6 +堅 6 +虐 6 +呼 6 +浪 6 +凱 6 +逮 6 +臟 6 +癌 6 +樓 6 +擬 6 +撥 6 +唱 6 +父 6 +顛 6 +峰 6 +豪 6 +挪 6 +妻 6 +尖 6 +毫 6 +截 6 +槽 6 +廷 6 +塊 6 +青 6 +迅 6 +冒 6 +租 6 +佐 6 +豐 6 +厄 6 +套 6 +茲 6 +寺 6 +胡 6 +馳 6 +伽 6 +慢 6 +鐵 6 +堂 6 +疼 6 +旁 5 +灘 5 +柬 5 +埔 5 +寨 5 +敏 5 +碰 5 +劍 5 +職 5 +乏 5 +圭 5 +宙 5 +膨 5 +脹 5 +刪 5 +鬥 5 +飽 5 +串 5 +挑 5 +削 5 +碩 5 +屆 5 +齊 5 +戈 5 +棕 5 +縫 5 +儲 5 +倖 5 +f 5 +湯 5 +x 5 +岡 5 +猶 5 +妨 5 +巧 5 +剛 5 +鋼 5 +壘 5 +眠 5 +磅 5 +彙 5 +獻 5 +悅 5 +煩 5 +匈 5 +悉 5 +振 5 +赫 5 +耐 5 +堵 5 +蔥 5 +鬱 5 +徹 5 +假 5 +填 5 +遲 5 +遣 5 +艘 5 +肺 5 +丟 5 +宅 5 +榜 5 +廂 5 +策 5 +旦 5 +梅 5 +均 5 +爵 5 +討 5 +漏 5 +肝 5 +釀 5 +秀 5 +契 5 +羊 5 +謹 5 +禽 5 +敵 5 +慎 5 +凡 5 +奶 5 +玩 5 +胎 5 +迎 5 +拔 5 +賭 5 +益 5 +臉 5 +虎 5 +刑 5 +瓶 5 +歸 5 +凌 5 +釐 5 +裁 5 +彎 5 +涵 5 +窯 5 +戛 5 +漸 4 +桑 4 +誤 4 +乾 4 +盜 4 +恆 4 +漿 4 +糕 4 +叢 4 +璃 4 +搬 4 +j 4 +兌 4 +荒 4 +潤 4 +藤 4 +袋 4 +榮 4 +隙 4 +猩 4 +寸 4 +曲 4 +貸 4 +蘋 4 +疊 4 +訟 4 +宰 4 +累 4 +燒 4 +販 4 +陷 4 +摔 4 +纜 4 +黃 4 +棒 4 +闊 4 +混 4 +培 4 +彩 4 +姻 4 +亨 4 +龐 4 +貼 4 +磁 4 +妮 4 +崩 4 +膀 4 +搞 4 +錦 4 +齒 4 +悟 4 +障 4 +董 4 +滋 4 +材 4 +瞭 4 +尾 4 +炎 4 +棄 4 +昂 4 +匯 4 +椅 4 +折 4 +茶 4 +幸 4 +逃 4 +莎 4 +胺 4 +酸 4 +閥 4 +役 4 +輻 4 +仿 4 +肩 4 +甲 4 +慣 4 +懷 4 +藻 4 +后 4 +沃 4 +廠 4 +俚 4 +掃 4 +叫 4 +– 4 +閒 4 +桿 4 +挺 4 +擅 4 +勤 4 +躍 4 +閃 4 +誌 4 +騙 4 +招 4 +墮 4 +縮 4 +擠 4 +苗 4 +黎 4 +殼 4 +儀 4 +奔 4 +厭 4 +瀑 4 +綠 4 +乃 4 +祖 4 +鯊 4 +駝 4 +麋 4 +珍 4 +扯 4 +杜 4 +尊 4 +聚 4 +冊 4 +洪 4 +貫 4 +答 4 +朗 4 +掌 4 +礎 4 +竊 4 +韋 4 +札 4 +塌 3 +涅 3 +檯 3 +岳 3 +坑 3 +溪 3 +碟 3 +曝 3 +醜 3 +塗 3 +鴉 3 +齡 3 +牆 3 +擋 3 +裸 3 +銜 3 +誕 3 +撰 3 +魅 3 +敲 3 +鍵 3 +拼 3 +副 3 +春 3 +宴 3 +詢 3 +擺 3 +雄 3 +裏 3 +掩 3 +李 3 +墜 3 +堆 3 +固 3 +筒 3 +券 3 +棟 3 +屠 3 +鎖 3 +耗 3 +鐙 3 +措 3 +扶 3 +牽 3 +躋 3 +擎 3 +哪 3 +貢 3 +純 3 +秘 3 +嚇 3 +屈 3 +飼 3 +趟 3 +佈 3 +辯 3 +井 3 +繁 3 +洗 3 +疲 3 +仇 3 +斜 3 +蚊 3 +榻 3 +鵝 3 +卵 3 +銷 3 +陪 3 +借 3 +昆 3 +隧 3 +詩 3 +咬 3 +枝 3 +廉 3 +划 3 +蹤 3 +池 3 +乳 3 +裕 3 +詐 3 +撫 3 +錫 3 +范 3 +浩 3 +揚 3 +涯 3 +巡 3 +窮 3 +燃 3 +擲 3 +吾 3 +堪 3 +鼻 3 +泥 3 +枚 3 +旨 3 +慘 3 +允 3 +幹 3 +黏 3 +爐 3 +羽 3 +泵 3 +勸 3 +忘 3 +伐 3 +q 3 +笑 3 +講 3 +珊 3 +麗 3 +詹 3 +悖 3 +螢 3 +聰 3 +謊 3 +嶇 3 +懸 3 +鼠 3 +昭 3 +彰 3 +繪 3 +粉 3 +楚 3 +貪 3 +慧 3 +誠 3 +粗 3 +腹 3 +盎 3 +寧 3 +圓 3 +嘆 3 +塑 3 +采 3 +亭 3 +妥 3 +寒 3 +昨 3 +閱 3 +滾 3 +藥 3 +傾 3 +擾 3 +膽 3 +鞍 3 +憤 3 +怒 3 +匿 3 +暱 3 +邀 3 +墳 3 +奮 3 +澤 3 +姓 3 +遜 3 +銳 3 +杉 3 +磯 3 +謀 3 +縱 3 +扎 3 +罹 3 +卓 3 +穴 3 +抑 3 +鉛 3 +循 3 +媲 3 +遂 3 +旺 3 +歧 3 +矛 3 +盾 3 +占 2 +倒 2 +氫 2 +氧 2 +泊 2 +杭 2 +漆 2 +朋 2 +夾 2 +雇 2 +餵 2 +飪 2 +村 2 +í 2 +ó 2 +伏 2 +俘 2 +虜 2 +寇 2 +嘔 2 +吐 2 +盤 2 +蕪 2 +剝 2 +穫 2 +仁 2 +狼 2 +鷹 2 +松 2 +摘 2 +纏 2 +搥 2 +映 2 +拳 2 +〈 2 +〉 2 +膜 2 +腫 2 +瘤 2 +睛 2 +鈔 2 +彌 2 +鍋 2 +陶 2 +煙 2 +俏 2 +猜 2 +鍛 2 +鍊 2 +諸 2 +踐 2 +寂 2 +鋪 2 +貌 2 +斤 2 +替 2 +崇 2 +狂 2 +訣 2 +裔 2 +齋 2 +茨 2 +暖 2 +覲 2 +夕 2 +蟻 2 +誘 2 +倦 2 +倚 2 +牢 2 +罰 2 +柱 2 +梠 2 +裙 2 +翅 2 +倡 2 +陰 2 +氛 2 +薦 2 +靴 2 +仗 2 +紓 2 +籲 2 +憾 2 +魏 2 +奢 2 +寢 2 +剪 2 +劣 2 +蹈 2 +株 2 +蜃 2 +酬 2 +鎊 2 +邁 2 +跌 2 +崗 2 +孤 2 +丈 2 +侶 2 +譴 2 +哇 2 +臥 2 +濘 2 +艱 2 +澀 2 +盈 2 +郁 2 +甘 2 +菊 2 +髮 2 +睹 2 +怖 2 +孔 2 +滅 2 +茱 2 +揣 2 +洽 2 +暈 2 +託 2 +娜 2 +襲 2 +腰 2 +塵 2 +暑 2 +犬 2 +箭 2 +盔 2 +淆 2 +煤 2 +炭 2 +碳 2 +厲 2 +薪 2 +諺 2 +郊 2 +矩 2 +潮 2 +綿 2 +勃 2 +逗 2 +謂 2 +溢 2 +刊 2 +抱 2 +奉 2 +哩 2 +椰 2 +悠 2 +爬 2 +偉 2 +鳴 2 +殘 2 +割 2 +沉 2 +摧 2 +呆 2 +婪 2 +趨 2 +怎 2 +濤 2 +鼓 2 +勵 2 +汙 2 +溼 2 +梯 2 +夥 2 +剖 2 ++ 2 +唐 2 +邸 2 +吼 2 +銘 2 +轄 2 +擦 2 +芯 2 +墨 2 +喪 2 +棺 2 +癱 2 +瘓 2 +晝 2 +辭 2 +隔 2 +榨 2 +汁 2 +昔 2 +拋 2 +梭 2 +獺 2 +壯 2 +輔 2 +溝 2 +默 2 +偵 2 +虔 2 +咎 2 +籍 2 +纖 2 +熨 2 +斗 2 +襪 2 +吹 2 +瓣 2 +緩 2 +幟 2 +橇 2 +轎 2 +珀 2 +渴 2 +& 2 +奈 2 +偷 2 +怨 2 +哲 2 +顏 2 +吳 2 +窟 2 +櫚 2 +喝 2 +醉 2 +亥 2 +玆 2 +緹 2 +蓄 2 +欲 2 +撐 2 +鶇 2 +陀 2 +螺 2 +蒸 2 +棍 2 +扔 2 +逝 2 +刀 2 +" 2 +õ 2 +辨 2 +哭 2 +泣 2 +賺 2 +柯 2 +君 2 +氮 2 +邏 2 +陌 2 +廊 2 +濫 2 +嘲 2 +帳 2 +獸 2 +餚 2 +驅 2 +嚐 2 +忽 2 +仲 2 +碉 2 +狩 2 +遵 2 +盃 2 +坎 2 +彗 2 +蛋 2 +趁 2 +氰 2 +抽 2 +污 2 +遮 2 +沼 2 +析 2 +廁 2 +礁 2 +綢 1 +櫃 1 +涸 1 +劫 1 +雕 1 +姊 1 +妹 1 +疏 1 +莊 1 +諮 1 +賠 1 +鑰 1 +匙 1 +伸 1 +猞 1 +猁 1 +雞 1 +頗 1 +傲 1 +喉 1 +嚨 1 +扮 1 +綜 1 +軋 1 +彭 1 +貶 1 +綽 1 +螞 1 +巢 1 +蔡 1 +蹄 1 +囊 1 +刷 1 +擴 1 +犒 1 +欣 1 +聆 1 +瘧 1 +伍 1 +欺 1 +慌 1 +儕 1 +栓 1 +諳 1 +挖 1 +翔 1 +卉 1 +綱 1 +塚 1 +屏 1 +鈾 1 +遙 1 +俗 1 +漂 1 +紮 1 +漠 1 +崔 1 +淹 1 +堤 1 +纂 1 +窩 1 +歎 1 +壇 1 +酋 1 +鞭 1 +苦 1 +裹 1 +巾 1 +豔 1 +戒 1 +懶 1 +鸚 1 +鵡 1 +潦 1 +俯 1 +瞰 1 +崖 1 +狗 1 +哺 1 +骸 1 +淚 1 +喀 1 +醇 1 +聳 1 +偶 1 +肢 1 +疽 1 +鉗 1 +拯 1 +砸 1 +豁 1 +癒 1 +扣 1 +『 1 +』 1 +孫 1 +誰 1 +遏 1 +垃 1 +圾 1 +賀 1 +幼 1 +嬰 1 +顱 1 +宮 1 +催 1 +肚 1 +詛 1 +咒 1 +瀉 1 +贈 1 +馮 1 +介 1 +噱 1 +抄 1 +臂 1 +顎 1 +臣 1 +函 1 +蕾 1 +​ 1 +緻 1 +伺 1 +廢 1 +娛 1 +晤 1 +潔 1 +紋 1 +迷 1 +臭 1 +腸 1 +惜 1 +衰 1 +曆 1 +慈 1 +祭 1 +頃 1 +弧 1 +挫 1 +憑 1 +搖 1 +懲 1 +兔 1 +羚 1 +勘 1 +隕 1 +/ 1 +侷 1 +框 1 +逛 1 +按 1 +濾 1 +崙 1 +罕 1 +撒 1 +埋 1 +嶄 1 +併 1 +膝 1 +縣 1 +晉 1 +沫 1 +嶺 1 +劑 1 +赤 1 +藍 1 +迦 1 +涼 1 +奏 1 +峻 1 +甦 1 +惱 1 +旱 1 +衫 1 +銅 1 +讚 1 +朱 1 +侖 1 +灼 1 +曉 1 +阪 1 +滕 1 +奠 1 +妙 1 +猴 1 +蛇 1 +齧 1 +烘 1 +烤 1 +蝕 1 +惹 1 +晶 1 +零 1 +陣 1 +迄 1 +逆 1 +渺 1 +茫 1 +誇 1 +盪 1 +棘 1 +坍 1 +蔽 1 +兆 1 +勇 1 +盒 1 +巽 1 +喙 1 +壤 1 +註 1 +鎢 1 +竟 1 +袖 1 +郡 1 +汐 1 +棚 1 +賊 1 +浴 1 +澡 1 +禦 1 +掠 1 +既 1 +舍 1 +攔 1 +耕 1 +繳 1 +債 1 +詳 1 +z 1 +敞 1 +賈 1 +豬 1 +暹 1 +跋 1 +鴨 1 +幻 1 +耀 1 +矯 1 +抹 1 +— 1 +抬 1 +魁 1 +籤 1 +櫻 1 +瀆 1 +鋃 1 +鐺 1 +梨 1 +桌 1 +畜 1 +田 1 +卷 1 +竄 1 +仔 1 +狄 1 +繫 1 +鑽 1 +躲 1 +捷 1 +輩 1 +猿 1 +撼 1 +渡 1 +鄧 1 +訝 1 diff --git a/mms-1b-all/zlm/lexicon.txt b/mms-1b-all/zlm/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..8a3f1d974616f6eff6f154422d80b42cd67fda7d --- /dev/null +++ b/mms-1b-all/zlm/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/zlm/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/zlm/tokens.txt b/mms-1b-all/zlm/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0e65231fd7c33be64eed9fadf26f6fc368603cb --- /dev/null +++ b/mms-1b-all/zlm/tokens.txt @@ -0,0 +1,70 @@ +a 492783 +| 387001 +n 203950 +e 182851 +i 159197 +r 104997 +u 137787 +t 108457 +k 135775 +m 112055 +d 96832 +s 85647 +g 82032 +l 79280 +p 60692 +b 59636 +h 73541 +o 23502 +y 46823 +j 17803 +c 6124 +w 6769 +f 2415 +- 12086 +0 8 +v 481 +1 1 +z 1117 +2 1 +9 190 +5 2 +4 6 +8 131 +. 129 +3 1 +6 6 +7 85 +, 70 +x 69 +/ 36 +q 4 +” 31 +: 29 +' 1324 +< 12 +> 12 +á 9 +% 8 +; 7 +$ 6 +¥ 6 +& 6 +õ 6 +ü 5 +í 5 +° 4 +[ 4 +] 4 +ç 3 +ú 3 +ł 2 ++ 2 +ã 2 +! 2 +² 2 +é 2 +ó 1 +— 1 +£ 1 +– 164 diff --git a/mms-1b-all/zul/lexicon.txt b/mms-1b-all/zul/lexicon.txt new file mode 120000 index 0000000000000000000000000000000000000000..8df5df96c9c86109f0fcf77ecf92c1aa7d815eb9 --- /dev/null +++ b/mms-1b-all/zul/lexicon.txt @@ -0,0 +1 @@ +/large_experiments/mms/data/scratch/cc200xl//processed/zul/lexicon_fleurs_limit250k_fixed.txt \ No newline at end of file diff --git a/mms-1b-all/zul/tokens.txt b/mms-1b-all/zul/tokens.txt new file mode 100644 index 0000000000000000000000000000000000000000..9454b246933611a1d3b93cfe70fcc7e7b497847d --- /dev/null +++ b/mms-1b-all/zul/tokens.txt @@ -0,0 +1,71 @@ +a 45597 +| 42925 +i 34325 +e 34005 +n 29720 +u 24294 +k 20866 +l 20052 +o 19897 +h 18455 +s 12858 +m 11662 +z 11038 +b 9621 +g 9570 +w 9413 +t 8547 +y 7744 +d 5321 +p 4158 +- 2712 +c 2169 +r 2164 +f 2039 +q 1656 +v 1617 +j 1286 +0 683 +x 553 +1 519 +2 337 +9 229 +5 182 +4 179 +3 163 +8 149 +6 148 +. 133 +7 98 +, 73 +: 44 +' 41 +/ 39 +$ 22 +á 11 +% 11 +; 10 +ü 9 +— 8 +" 7 +– 7 +” 6 +ç 6 +í 5 +& 4 +õ 4 +‘ 4 ++ 4 +ú 3 +[ 3 +] 3 +ã 3 +° 3 +£ 2 +! 2 +ó 2 +é 2 +̇ 2 +² 2 +ł 1 +ö 1