tiedeman committed on
Commit
c3645de
1 Parent(s): 1d10192

Initial commit

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.spm filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,475 @@
---
library_name: transformers
language:
- bru
- cmo
- de
- en
- es
- fr
- kha
- km
- kxm
- mnw
- ngt
- pt
- vi
- wbm

tags:
- translation
- opus-mt-tc-bible

license: apache-2.0
model-index:
- name: opus-mt-tc-bible-big-deu_eng_fra_por_spa-mkh
  results:
  - task:
      name: Translation deu-vie
      type: translation
      args: deu-vie
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: deu-vie
    metrics:
    - name: BLEU
      type: bleu
      value: 33.9
    - name: chr-F
      type: chrf
      value: 0.53535
  - task:
      name: Translation eng-vie
      type: translation
      args: eng-vie
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: eng-vie
    metrics:
    - name: BLEU
      type: bleu
      value: 42.6
    - name: chr-F
      type: chrf
      value: 0.60021
  - task:
      name: Translation fra-vie
      type: translation
      args: fra-vie
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: fra-vie
    metrics:
    - name: BLEU
      type: bleu
      value: 34.6
    - name: chr-F
      type: chrf
      value: 0.54168
  - task:
      name: Translation por-vie
      type: translation
      args: por-vie
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: por-vie
    metrics:
    - name: BLEU
      type: bleu
      value: 35.9
    - name: chr-F
      type: chrf
      value: 0.55046
  - task:
      name: Translation spa-vie
      type: translation
      args: spa-vie
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: spa-vie
    metrics:
    - name: BLEU
      type: bleu
      value: 28.1
    - name: chr-F
      type: chrf
      value: 0.50262
  - task:
      name: Translation deu-vie
      type: translation
      args: deu-vie
    dataset:
      name: flores101-devtest
      type: flores_101
      args: deu vie devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 33.9
    - name: chr-F
      type: chrf
      value: 0.53623
  - task:
      name: Translation eng-vie
      type: translation
      args: eng-vie
    dataset:
      name: flores101-devtest
      type: flores_101
      args: eng vie devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 42.7
    - name: chr-F
      type: chrf
      value: 0.59986
  - task:
      name: Translation por-vie
      type: translation
      args: por-vie
    dataset:
      name: flores101-devtest
      type: flores_101
      args: por vie devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 35.7
    - name: chr-F
      type: chrf
      value: 0.54819
  - task:
      name: Translation deu-vie
      type: translation
      args: deu-vie
    dataset:
      name: ntrex128
      type: ntrex128
      args: deu-vie
    metrics:
    - name: BLEU
      type: bleu
      value: 31.2
    - name: chr-F
      type: chrf
      value: 0.51996
  - task:
      name: Translation eng-vie
      type: translation
      args: eng-vie
    dataset:
      name: ntrex128
      type: ntrex128
      args: eng-vie
    metrics:
    - name: BLEU
      type: bleu
      value: 42.7
    - name: chr-F
      type: chrf
      value: 0.60050
  - task:
      name: Translation fra-vie
      type: translation
      args: fra-vie
    dataset:
      name: ntrex128
      type: ntrex128
      args: fra-vie
    metrics:
    - name: BLEU
      type: bleu
      value: 31.7
    - name: chr-F
      type: chrf
      value: 0.51988
  - task:
      name: Translation por-vie
      type: translation
      args: por-vie
    dataset:
      name: ntrex128
      type: ntrex128
      args: por-vie
    metrics:
    - name: BLEU
      type: bleu
      value: 33.3
    - name: chr-F
      type: chrf
      value: 0.52931
  - task:
      name: Translation spa-vie
      type: translation
      args: spa-vie
    dataset:
      name: ntrex128
      type: ntrex128
      args: spa-vie
    metrics:
    - name: BLEU
      type: bleu
      value: 33.1
    - name: chr-F
      type: chrf
      value: 0.53347
  - task:
      name: Translation deu-vie
      type: translation
      args: deu-vie
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: deu-vie
    metrics:
    - name: BLEU
      type: bleu
      value: 25.3
    - name: chr-F
      type: chrf
      value: 0.45222
  - task:
      name: Translation eng-vie
      type: translation
      args: eng-vie
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: eng-vie
    metrics:
    - name: BLEU
      type: bleu
      value: 39.0
    - name: chr-F
      type: chrf
      value: 0.56413
  - task:
      name: Translation fra-vie
      type: translation
      args: fra-vie
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: fra-vie
    metrics:
    - name: BLEU
      type: bleu
      value: 35.6
    - name: chr-F
      type: chrf
      value: 0.53078
  - task:
      name: Translation multi-multi
      type: translation
      args: multi-multi
    dataset:
      name: tatoeba-test-v2020-07-28-v2023-09-26
      type: tatoeba_mt
      args: multi-multi
    metrics:
    - name: BLEU
      type: bleu
      value: 24.9
    - name: chr-F
      type: chrf
      value: 0.43068
  - task:
      name: Translation spa-vie
      type: translation
      args: spa-vie
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: spa-vie
    metrics:
    - name: BLEU
      type: bleu
      value: 34.0
    - name: chr-F
      type: chrf
      value: 0.51783
---
# opus-mt-tc-bible-big-deu_eng_fra_por_spa-mkh

## Table of Contents
- [Model Details](#model-details)
- [Uses](#uses)
- [Risks, Limitations and Biases](#risks-limitations-and-biases)
- [How to Get Started With the Model](#how-to-get-started-with-the-model)
- [Training](#training)
- [Evaluation](#evaluation)
- [Citation Information](#citation-information)
- [Acknowledgements](#acknowledgements)

## Model Details

Neural machine translation model for translating from German, English, French, Portuguese and Spanish (deu+eng+fra+por+spa) to Mon-Khmer languages (mkh).

This model is part of the [OPUS-MT project](https://github.com/Helsinki-NLP/Opus-MT), an effort to make neural machine translation models widely available and accessible for many languages in the world. All models were originally trained with [Marian NMT](https://marian-nmt.github.io/), an efficient NMT implementation written in pure C++, and have been converted to PyTorch using the transformers library by Hugging Face. Training data is taken from [OPUS](https://opus.nlpl.eu/) and training pipelines use the procedures of [OPUS-MT-train](https://github.com/Helsinki-NLP/Opus-MT-train).

**Model Description:**
- **Developed by:** Language Technology Research Group at the University of Helsinki
- **Model Type:** Translation (transformer-big)
- **Release:** 2024-05-30
- **License:** Apache-2.0
- **Language(s):**
  - Source Language(s): deu eng fra por spa
  - Target Language(s): bru cmo kha khm kxm mnw ngt vie wbm
  - Valid Target Language Labels: >>aem<< >>alk<< >>aml<< >>bbh<< >>bdq<< >>bgk<< >>bgl<< >>blr<< >>brb<< >>bru<< >>brv<< >>btq<< >>caq<< >>cbn<< >>cma<< >>cmo<< >>cog<< >>crv<< >>crw<< >>cua<< >>cwg<< >>dnu<< >>hal<< >>hld<< >>hnu<< >>hre<< >>huo<< >>jah<< >>jeh<< >>jhi<< >>kdt<< >>kha<< >>khf<< >>khm<< >>kjg<< >>kjm<< >>knq<< >>kns<< >>kpm<< >>krr<< >>krv<< >>kta<< >>ktv<< >>kuf<< >>kxm<< >>kxy<< >>lbn<< >>lbo<< >>lcp<< >>lnh<< >>lwl<< >>lyg<< >>mef<< >>mhe<< >>mlf<< >>mml<< >>mng<< >>mnn<< >>mnq<< >>mnw<< >>moo<< >>mqt<< >>mra<< >>mtq<< >>mzt<< >>ncb<< >>ncq<< >>nev<< >>ngt<< >>ngt_Latn<< >>nik<< >>nuo<< >>nyl<< >>omx<< >>oog<< >>oyb<< >>pac<< >>pbv<< >>pcb<< >>pce<< >>phg<< >>pkt<< >>pll<< >>ply<< >>pnx<< >>prk<< >>prt<< >>puo<< >>rbb<< >>ren<< >>ril<< >>rka<< >>rmx<< >>sbo<< >>scb<< >>scq<< >>sct<< >>sea<< >>sed<< >>sii<< >>smu<< >>spu<< >>sqq<< >>ssm<< >>sss<< >>stg<< >>sti<< >>stt<< >>stu<< >>syo<< >>sza<< >>szc<< >>tdf<< >>tdr<< >>tea<< >>tef<< >>thm<< >>tkz<< >>tlq<< >>tmo<< >>tnz<< >>tou<< >>tpu<< >>tth<< >>tto<< >>tyh<< >>uuu<< >>vie<< >>vwa<< >>wbm<< >>xao<< >>xkk<< >>xnh<< >>xxx<< >>yin<< >>zng<<
- **Original Model**: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/deu+eng+fra+por+spa-mkh/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip)
- **Resources for more information:**
  - [OPUS-MT dashboard](https://opus.nlpl.eu/dashboard/index.php?pkg=opusmt&test=all&scoreslang=all&chart=standard&model=Tatoeba-MT-models/deu%2Beng%2Bfra%2Bpor%2Bspa-mkh/opusTCv20230926max50%2Bbt%2Bjhubc_transformer-big_2024-05-30)
  - [OPUS-MT-train GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
  - [More information about MarianNMT models in the transformers library](https://huggingface.co/docs/transformers/model_doc/marian)
  - [Tatoeba Translation Challenge](https://github.com/Helsinki-NLP/Tatoeba-Challenge/)
  - [HPLT bilingual data v1 (as part of the Tatoeba Translation Challenge dataset)](https://hplt-project.org/datasets/v1)
  - [A massively parallel Bible corpus](https://aclanthology.org/L14-1215/)

This is a multilingual translation model with multiple target languages. A sentence-initial language token is required in the form of `>>id<<` (id = valid target language ID), e.g. `>>bru<<`.

## Uses

This model can be used for translation and text-to-text generation.

## Risks, Limitations and Biases

**CONTENT WARNING: Readers should be aware that the model is trained on various public data sets that may contain content that is disturbing, offensive, and can propagate historical and current stereotypes.**

Significant research has explored bias and fairness issues with language models (see, e.g., [Sheng et al. (2021)](https://aclanthology.org/2021.acl-long.330.pdf) and [Bender et al. (2021)](https://dl.acm.org/doi/pdf/10.1145/3442188.3445922)).

## How to Get Started With the Model

A short code example:

```python
from transformers import MarianMTModel, MarianTokenizer

src_text = [
    ">>bru<< Replace this with text in an accepted source language.",
    ">>wbm<< This is the second sentence."
]

model_name = "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mkh"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))

for t in translated:
    print(tokenizer.decode(t, skip_special_tokens=True))
```

You can also use OPUS-MT models with the transformers `pipeline` API, for example:

```python
from transformers import pipeline
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mkh")
print(pipe(">>bru<< Replace this with text in an accepted source language."))
```
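
Because the target side is multilingual, every input must start with one of the valid target language labels listed above. As a quick sanity check, the label inventory can be read straight from the tokenizer's vocabulary; this is a minimal sketch (not part of the original card), assuming the same Hub model id as above:

```python
# Minimal sketch: list the >>id<< target-language labels known to the tokenizer.
from transformers import MarianTokenizer

tokenizer = MarianTokenizer.from_pretrained(
    "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mkh"
)
lang_labels = sorted(
    token for token in tokenizer.get_vocab()
    if token.startswith(">>") and token.endswith("<<")
)
print(len(lang_labels), lang_labels[:5])
```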

## Training

- **Data**: opusTCv20230926max50+bt+jhubc ([source](https://github.com/Helsinki-NLP/Tatoeba-Challenge))
- **Pre-processing**: SentencePiece (spm32k,spm32k)
- **Model Type:** transformer-big
- **Original MarianNMT Model**: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/deu+eng+fra+por+spa-mkh/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip)
- **Training Scripts**: [GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)

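The SentencePiece models referenced above ship with this repository as `source.spm` and `target.spm`. A small sketch, assuming the `sentencepiece` package and a local copy of `source.spm`, of how input text is segmented into the subword pieces the Marian encoder consumes:

```python
# Hedged sketch: segment a sentence with the repository's source SentencePiece model.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="source.spm")  # file from this repo
pieces = sp.encode("This is the second sentence.", out_type=str)
print(pieces)  # subword pieces; the actual split depends on the learned 32k model
```
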
## Evaluation

* [Model scores at the OPUS-MT dashboard](https://opus.nlpl.eu/dashboard/index.php?pkg=opusmt&test=all&scoreslang=all&chart=standard&model=Tatoeba-MT-models/deu%2Beng%2Bfra%2Bpor%2Bspa-mkh/opusTCv20230926max50%2Bbt%2Bjhubc_transformer-big_2024-05-30)
* test set translations: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/deu+eng+fra+por+spa-mkh/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.test.txt)
* test set scores: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/deu+eng+fra+por+spa-mkh/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.eval.txt)
* benchmark results: [benchmark_results.txt](benchmark_results.txt)
* benchmark output: [benchmark_translations.zip](benchmark_translations.zip)

| langpair | testset | chr-F | BLEU | #sent | #words |
|----------|---------|-------|------|-------|--------|
| deu-vie | tatoeba-test-v2021-08-07 | 0.45222 | 25.3 | 400 | 3768 |
| eng-kha | tatoeba-test-v2021-08-07 | 9.076 | 0.4 | 1314 | 9269 |
| eng-vie | tatoeba-test-v2021-08-07 | 0.56413 | 39.0 | 2500 | 24427 |
| fra-vie | tatoeba-test-v2021-08-07 | 0.53078 | 35.6 | 1299 | 13219 |
| spa-vie | tatoeba-test-v2021-08-07 | 0.51783 | 34.0 | 594 | 4740 |
| deu-vie | flores101-devtest | 0.53623 | 33.9 | 1012 | 33331 |
| eng-khm | flores101-devtest | 0.42022 | 1.4 | 1012 | 7006 |
| eng-vie | flores101-devtest | 0.59986 | 42.7 | 1012 | 33331 |
| por-vie | flores101-devtest | 0.54819 | 35.7 | 1012 | 33331 |
| deu-vie | flores200-devtest | 0.53535 | 33.9 | 1012 | 33331 |
| eng-khm | flores200-devtest | 0.41987 | 1.3 | 1012 | 7006 |
| eng-vie | flores200-devtest | 0.60021 | 42.6 | 1012 | 33331 |
| fra-khm | flores200-devtest | 0.40241 | 2.3 | 1012 | 7006 |
| fra-vie | flores200-devtest | 0.54168 | 34.6 | 1012 | 33331 |
| por-khm | flores200-devtest | 0.41582 | 2.3 | 1012 | 7006 |
| por-vie | flores200-devtest | 0.55046 | 35.9 | 1012 | 33331 |
| spa-vie | flores200-devtest | 0.50262 | 28.1 | 1012 | 33331 |
| deu-khm | ntrex128 | 0.44917 | 3.2 | 1997 | 15866 |
| deu-vie | ntrex128 | 0.51996 | 31.2 | 1997 | 64655 |
| eng-khm | ntrex128 | 0.50215 | 1.6 | 1997 | 15866 |
| eng-vie | ntrex128 | 0.60050 | 42.7 | 1997 | 64655 |
| fra-khm | ntrex128 | 0.44024 | 2.3 | 1997 | 15866 |
| fra-vie | ntrex128 | 0.51988 | 31.7 | 1997 | 64655 |
| por-khm | ntrex128 | 0.46752 | 2.4 | 1997 | 15866 |
| por-vie | ntrex128 | 0.52931 | 33.3 | 1997 | 64655 |
| spa-khm | ntrex128 | 0.46166 | 2.5 | 1997 | 15866 |
| spa-vie | ntrex128 | 0.53347 | 33.1 | 1997 | 64655 |
| eng-khm | tico19-test | 0.54267 | 3.4 | 2100 | 15810 |
| fra-khm | tico19-test | 0.45333 | 4.8 | 2100 | 15810 |
| por-khm | tico19-test | 0.52339 | 6.8 | 2100 | 15810 |
| spa-khm | tico19-test | 0.51848 | 6.8 | 2100 | 15810 |

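BLEU values above are on the usual 0-100 sacrebleu scale, while chr-F appears to be reported on a 0-1 scale (sacrebleu's chrF divided by 100). A hedged sketch, assuming the `sacrebleu` package and placeholder hypothesis/reference lists, of how such scores can be recomputed from the test set translations linked above:

```python
# Hedged sketch: recompute BLEU and chr-F for a hypothesis/reference pair.
import sacrebleu

hypotheses = ["placeholder system translation"]       # e.g. lines from the .test.txt file
references = [["placeholder reference translation"]]  # one reference stream

bleu = sacrebleu.corpus_bleu(hypotheses, references)
chrf = sacrebleu.corpus_chrf(hypotheses, references)
print(f"BLEU = {bleu.score:.1f}, chr-F = {chrf.score / 100:.5f}")
```
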
## Citation Information

* Publications: [Democratizing neural machine translation with OPUS-MT](https://doi.org/10.1007/s10579-023-09704-w) and [OPUS-MT – Building open translation services for the World](https://aclanthology.org/2020.eamt-1.61/) and [The Tatoeba Translation Challenge – Realistic Data Sets for Low Resource and Multilingual MT](https://aclanthology.org/2020.wmt-1.139/) (Please cite if you use this model.)

```bibtex
@article{tiedemann2023democratizing,
  title={Democratizing neural machine translation with {OPUS-MT}},
  author={Tiedemann, J{\"o}rg and Aulamo, Mikko and Bakshandaeva, Daria and Boggia, Michele and Gr{\"o}nroos, Stig-Arne and Nieminen, Tommi and Raganato, Alessandro and Scherrer, Yves and Vazquez, Raul and Virpioja, Sami},
  journal={Language Resources and Evaluation},
  number={58},
  pages={713--755},
  year={2023},
  publisher={Springer Nature},
  issn={1574-0218},
  doi={10.1007/s10579-023-09704-w}
}

@inproceedings{tiedemann-thottingal-2020-opus,
  title = "{OPUS}-{MT} {--} Building open translation services for the World",
  author = {Tiedemann, J{\"o}rg and Thottingal, Santhosh},
  booktitle = "Proceedings of the 22nd Annual Conference of the European Association for Machine Translation",
  month = nov,
  year = "2020",
  address = "Lisboa, Portugal",
  publisher = "European Association for Machine Translation",
  url = "https://aclanthology.org/2020.eamt-1.61",
  pages = "479--480",
}

@inproceedings{tiedemann-2020-tatoeba,
  title = "The Tatoeba Translation Challenge {--} Realistic Data Sets for Low Resource and Multilingual {MT}",
  author = {Tiedemann, J{\"o}rg},
  booktitle = "Proceedings of the Fifth Conference on Machine Translation",
  month = nov,
  year = "2020",
  address = "Online",
  publisher = "Association for Computational Linguistics",
  url = "https://aclanthology.org/2020.wmt-1.139",
  pages = "1174--1182",
}
```

## Acknowledgements

The work is supported by the [HPLT project](https://hplt-project.org/), funded by the European Union’s Horizon Europe research and innovation programme under grant agreement No 101070350. We are also grateful for the generous computational resources and IT infrastructure provided by [CSC -- IT Center for Science](https://www.csc.fi/), Finland, and the [EuroHPC supercomputer LUMI](https://www.lumi-supercomputer.eu/).

## Model conversion info

* transformers version: 4.45.1
* OPUS-MT git hash: 0882077
* port time: Tue Oct 8 10:19:52 EEST 2024
* port machine: LM0-400-22516.local
benchmark_results.txt ADDED
@@ -0,0 +1,45 @@
multi-multi tatoeba-test-v2020-07-28-v2023-09-26 0.43068 24.9 8636 61987
deu-khm flores101-devtest 0.38874 2.5 1012 7006
deu-vie flores101-devtest 0.53623 33.9 1012 33331
eng-khm flores101-devtest 0.42022 1.4 1012 7006
eng-vie flores101-devtest 0.59986 42.7 1012 33331
por-vie flores101-devtest 0.54819 35.7 1012 33331
spa-khm flores101-devtest 0.37253 1.5 1012 7006
deu-khm flores200-devtest 0.38872 2.5 1012 7006
deu-vie flores200-devtest 0.53535 33.9 1012 33331
eng-khm flores200-devtest 0.41987 1.3 1012 7006
eng-vie flores200-devtest 0.60021 42.6 1012 33331
fra-khm flores200-devtest 0.40241 2.3 1012 7006
fra-vie flores200-devtest 0.54168 34.6 1012 33331
por-khm flores200-devtest 0.41582 2.3 1012 7006
por-vie flores200-devtest 0.55046 35.9 1012 33331
spa-khm flores200-devtest 0.36975 1.5 1012 7006
spa-vie flores200-devtest 0.50262 28.1 1012 33331
eng-khm newstest2020 0.35200 0.9 2320 15454
deu-khm ntrex128 0.44917 3.2 1997 15866
deu-vie ntrex128 0.51996 31.2 1997 64655
eng-khm ntrex128 0.50215 1.6 1997 15866
eng-vie ntrex128 0.60050 42.7 1997 64655
fra-khm ntrex128 0.44024 2.3 1997 15866
fra-vie ntrex128 0.51988 31.7 1997 64655
por-khm ntrex128 0.46752 2.4 1997 15866
por-vie ntrex128 0.52931 33.3 1997 64655
spa-khm ntrex128 0.46166 2.5 1997 15866
spa-vie ntrex128 0.53347 33.1 1997 64655
eng-khm tatoeba-test-v2020-07-28 0.33749 0.2 752 1737
spa-khm tatoeba-test-v2020-07-28 0.36872 0.2 1472 3391
deu-vie tatoeba-test-v2021-03-30 0.45438 25.3 401 3775
eng-khm tatoeba-test-v2021-03-30 0.33751 0.2 754 1741
spa-khm tatoeba-test-v2021-03-30 0.36872 0.2 1472 3391
spa-vie tatoeba-test-v2021-03-30 0.51477 33.9 604 4824
deu-vie tatoeba-test-v2021-08-07 0.45222 25.3 400 3768
eng-kha tatoeba-test-v2021-08-07 9.076 0.4 1314 9269
eng-khm tatoeba-test-v2021-08-07 0.33349 0.2 726 1692
eng-vie tatoeba-test-v2021-08-07 0.56413 39.0 2500 24427
fra-vie tatoeba-test-v2021-08-07 0.53078 35.6 1299 13219
spa-khm tatoeba-test-v2021-08-07 0.36552 0.3 1448 3343
spa-vie tatoeba-test-v2021-08-07 0.51783 34.0 594 4740
eng-khm tico19-test 0.54267 3.4 2100 15810
fra-khm tico19-test 0.45333 4.8 2100 15810
por-khm tico19-test 0.52339 6.8 2100 15810
spa-khm tico19-test 0.51848 6.8 2100 15810
benchmark_translations.zip ADDED
File without changes
config.json ADDED
@@ -0,0 +1,41 @@
{
  "_name_or_path": "pytorch-models/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mkh",
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 57128,
  "decoder_vocab_size": 57129,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 1104,
  "forced_eos_token_id": null,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_length": null,
  "max_position_embeddings": 1024,
  "model_type": "marian",
  "normalize_embedding": false,
  "num_beams": null,
  "num_hidden_layers": 6,
  "pad_token_id": 57128,
  "scale_embedding": true,
  "share_encoder_decoder_embeddings": true,
  "static_position_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.45.1",
  "use_cache": true,
  "vocab_size": 57129
}
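
The configuration above encodes the transformer-big geometry: 1024-dimensional embeddings, 16 attention heads, 6 encoder and 6 decoder layers, and a shared 57,129-token vocabulary. A hedged sketch, not part of the original files, of reading the same values through `AutoConfig` instead of parsing the JSON by hand, assuming the Hub model id:

```python
# Hedged sketch: inspect the architecture hyperparameters via transformers.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mkh"
)
print(config.model_type)                                # marian
print(config.d_model, config.encoder_attention_heads)   # 1024 16
print(config.encoder_layers, config.decoder_layers)     # 6 6
print(config.vocab_size)                                # 57129
```
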
generation_config.json ADDED
@@ -0,0 +1,16 @@
{
  "_from_model_config": true,
  "bad_words_ids": [
    [
      57128
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 57128,
  "eos_token_id": 1104,
  "forced_eos_token_id": 1104,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 57128,
  "transformers_version": "4.45.1"
}
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:beed128e73a7bf58d78092f4448fea328563d3f7f3a2bfafd356d0e9c1d67fa4
size 939688020
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6e4ba7532a6a1c9c479d9f2a792eb781294c7b7d22f05b44305fc1b9ac980b82
size 939739269
source.spm ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:67336bd918550d52b7bd0940036ef81a5de8a57e47cb17a7ed616fa84e978628
size 803212
special_tokens_map.json ADDED
@@ -0,0 +1 @@
{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
target.spm ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:67b72e3c21d85a89b92db1032a9a9f6c548dfaa976bf9ba2388085527b63dffa
size 879868
tokenizer_config.json ADDED
@@ -0,0 +1 @@
{"source_lang": "deu+eng+fra+por+spa", "target_lang": "mkh", "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "separate_vocabs": false, "special_tokens_map_file": null, "name_or_path": "marian-models/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30/deu+eng+fra+por+spa-mkh", "tokenizer_class": "MarianTokenizer"}
vocab.json ADDED
The diff for this file is too large to render.