Update Space (evaluate main: 2253a6e1)
- README.md +75 -5
- app.py +6 -0
- nist_mt.py +132 -0
- requirements.txt +2 -0
- tests.py +34 -0
README.md
CHANGED
@@ -1,12 +1,82 @@
 ---
-title:
+title: NIST_MT
-emoji:
+emoji: 🤗
-colorFrom:
+colorFrom: purple
 colorTo: red
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.0.2
 app_file: app.py
 pinned: false
+tags:
+- evaluate
+- metric
+- machine-translation
+description:
+  DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU score.
 ---
 
-
+# Metric Card for NIST's MT metric
+
+
+## Metric Description
+DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
+score. The official script used by NIST to compute the BLEU and NIST scores is
+mteval-14.pl. The main differences are:
+
+- BLEU uses the geometric mean of the n-gram precisions; NIST uses the arithmetic mean.
+- NIST has a different brevity penalty.
+- The NIST score from mteval-14.pl has a self-contained tokenizer (in the Hugging Face implementation we rely on NLTK's
+implementation of the NIST-specific tokenizer).
+
+## Intended Uses
+NIST was developed for machine translation evaluation.
+
+## How to Use
+
+```python
+import evaluate
+nist_mt = evaluate.load("nist_mt")
+hypothesis1 = "It is a guide to action which ensures that the military always obeys the commands of the party"
+reference1 = "It is a guide to action that ensures that the military will forever heed Party commands"
+reference2 = "It is the guiding principle which guarantees the military forces always being under the command of the Party"
+reference3 = "It is the practical guide for the army always to heed the directions of the party"
+nist_mt.compute(predictions=[hypothesis1], references=[[reference1, reference2, reference3]])
+# {'nist_mt': 3.3709935957649324}
+```
+
+### Inputs
+- **predictions** (`list` of `str`): predictions to score. Raw sentences can be passed directly; tokenization is
+handled internally by NLTK's NIST-specific tokenizer (see: https://github.com/nltk/nltk/blob/90fa546ea600194f2799ee51eaf1b729c128711e/nltk/tokenize/nist.py#L139)
+- **references** (`list` of `str` or `list` of `list` of `str`): potentially multiple references for each prediction
+- **n** (`int`, defaults to `5`): highest n-gram order
+- **lowercase** (`bool`, defaults to `False`): whether to lowercase the data (only applicable if `western_lang` is `True`)
+- **western_lang** (`bool`, defaults to `True`): whether the current language is a Western language, which enables some
+specific tokenization rules with respect to, e.g., punctuation
+
+### Output Values
+- **nist_mt** (`float`): NIST score
+
+Output Example:
+```python
+{'nist_mt': 3.3709935957649324}
+```
+
+
+## Citation
+```bibtex
+@inproceedings{10.5555/1289189.1289273,
+author = {Doddington, George},
+title = {Automatic Evaluation of Machine Translation Quality Using N-Gram Co-Occurrence Statistics},
+year = {2002},
+publisher = {Morgan Kaufmann Publishers Inc.},
+address = {San Francisco, CA, USA},
+booktitle = {Proceedings of the Second International Conference on Human Language Technology Research},
+pages = {138–145},
+numpages = {8},
+location = {San Diego, California},
+series = {HLT '02}
+}
+```
+
+## Further References
+
+This Hugging Face implementation uses [the NLTK implementation](https://github.com/nltk/nltk/blob/develop/nltk/translate/nist_score.py).
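
For corpus-level scoring, `references` needs one more level of nesting than `predictions`: one list of candidate references per prediction. A minimal sketch of those shapes (the second sentence pair is illustrative only, not part of the metric card):

```python
import evaluate

nist_mt = evaluate.load("nist_mt")

# One prediction string per segment in the corpus.
predictions = [
    "It is a guide to action which ensures that the military always obeys the commands of the party",
    "he read the book because he was interested in world history",
]
# One list of reference strings per prediction; the lists may have different lengths.
references = [
    [
        "It is a guide to action that ensures that the military will forever heed Party commands",
        "It is the practical guide for the army always to heed the directions of the party",
    ],
    ["he was interested in world history because he read the book"],
]
results = nist_mt.compute(predictions=predictions, references=references)
print(results["nist_mt"])  # a single corpus-level float
```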
app.py
ADDED
@@ -0,0 +1,6 @@
+import evaluate
+from evaluate.utils import launch_gradio_widget
+
+
+module = evaluate.load("nist_mt")
+launch_gradio_widget(module)
nist_mt.py
ADDED
@@ -0,0 +1,132 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""NLTK's NIST implementation on both the sentence and corpus level"""
+from typing import Dict, Optional
+
+import datasets
+import nltk
+from datasets import Sequence, Value
+
+
+try:
+    nltk.data.find("perluniprops")
+except LookupError:
+    nltk.download("perluniprops", quiet=True)  # NISTTokenizer requirement
+
+from nltk.tokenize.nist import NISTTokenizer
+from nltk.translate.nist_score import corpus_nist, sentence_nist
+
+import evaluate
+
+
+_CITATION = """\
+@inproceedings{10.5555/1289189.1289273,
+author = {Doddington, George},
+title = {Automatic Evaluation of Machine Translation Quality Using N-Gram Co-Occurrence Statistics},
+year = {2002},
+publisher = {Morgan Kaufmann Publishers Inc.},
+address = {San Francisco, CA, USA},
+booktitle = {Proceedings of the Second International Conference on Human Language Technology Research},
+pages = {138–145},
+numpages = {8},
+location = {San Diego, California},
+series = {HLT '02}
+}
+"""
+
+_DESCRIPTION = """\
+DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
+score. The official script used by NIST to compute BLEU and NIST score is
+mteval-14.pl. The main differences are:
+
+- BLEU uses geometric mean of the ngram precisions, NIST uses arithmetic mean.
+- NIST has a different brevity penalty
+- NIST score from mteval-14.pl has a self-contained tokenizer (in the Hugging Face implementation we rely on NLTK's
+implementation of the NIST-specific tokenizer)
+"""
+
+
+_KWARGS_DESCRIPTION = """
+Computes NIST score of translated segments against one or more references.
+Args:
+    predictions: predictions to score (list of str)
+    references: potentially multiple references for each prediction (list of str or list of list of str)
+    n: highest n-gram order
+    lowercase: whether to lowercase the data (only applicable if 'western_lang' is True)
+    western_lang: whether the current language is a Western language, which will enable some specific tokenization
+        rules with respect to, e.g., punctuation
+
+Returns:
+    'nist_mt': nist_mt score
+Examples:
+    >>> nist_mt = evaluate.load("nist_mt")
+    >>> hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party"
+    >>> reference1 = "It is a guide to action that ensures that the military will forever heed Party commands"
+    >>> reference2 = "It is the guiding principle which guarantees the military forces always being under the command of the Party"
+    >>> reference3 = "It is the practical guide for the army always to heed the directions of the party"
+    >>> nist_mt.compute(predictions=[hypothesis], references=[[reference1, reference2, reference3]])
+    {'nist_mt': 3.3709935957649324}
+    >>> nist_mt.compute(predictions=[hypothesis], references=[reference1])
+    {'nist_mt': 2.4477124183006533}
+"""
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class NistMt(evaluate.Metric):
+    """A wrapper around NLTK's NIST implementation."""
+
+    def _info(self):
+        return evaluate.MetricInfo(
+            module_type="metric",
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=[
+                datasets.Features(
+                    {
+                        "predictions": Value("string", id="prediction"),
+                        "references": Sequence(Value("string", id="reference"), id="references"),
+                    }
+                ),
+                datasets.Features(
+                    {
+                        "predictions": Value("string", id="prediction"),
+                        "references": Value("string", id="reference"),
+                    }
+                ),
+            ],
+            homepage="https://www.nltk.org/api/nltk.translate.nist_score.html",
+            codebase_urls=["https://github.com/nltk/nltk/blob/develop/nltk/translate/nist_score.py"],
+            reference_urls=["https://en.wikipedia.org/wiki/NIST_(metric)"],
+        )
+
+    def _compute(self, predictions, references, n: int = 5, lowercase=False, western_lang=True):
+        tokenizer = NISTTokenizer()
+
+        # Account for single reference cases: references always need to have one more dimension than predictions
+        if isinstance(references[0], str):
+            references = [[ref] for ref in references]
+
+        predictions = [
+            tokenizer.tokenize(pred, return_str=False, lowercase=lowercase, western_lang=western_lang)
+            for pred in predictions
+        ]
+        references = [
+            [
+                tokenizer.tokenize(ref, return_str=False, lowercase=lowercase, western_lang=western_lang)
+                for ref in ref_sentences
+            ]
+            for ref_sentences in references
+        ]
+        return {"nist_mt": corpus_nist(list_of_references=references, hypotheses=predictions, n=n)}
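
`_compute` above is a thin shim over NLTK: it tokenizes every string with `NISTTokenizer`, normalizes single references to the nested shape, and delegates to `corpus_nist`. A standalone sketch of that same path, assuming `nltk` is installed:

```python
import nltk
from nltk.tokenize.nist import NISTTokenizer
from nltk.translate.nist_score import corpus_nist

nltk.download("perluniprops", quiet=True)  # resource required by NISTTokenizer

tokenizer = NISTTokenizer()
hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party"
reference = "It is a guide to action that ensures that the military will forever heed Party commands"

# Tokenize exactly as _compute does (Western-language rules, no lowercasing).
hyp_tokens = tokenizer.tokenize(hypothesis, return_str=False, lowercase=False, western_lang=True)
ref_tokens = tokenizer.tokenize(reference, return_str=False, lowercase=False, western_lang=True)

# list_of_references carries one extra nesting level: a list of references per hypothesis.
print(corpus_nist(list_of_references=[[ref_tokens]], hypotheses=[hyp_tokens], n=5))
```

This should agree with `nist_mt.compute(predictions=[hypothesis], references=[reference])` from the docstring above (2.4477124183006533).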
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+git+https://github.com/huggingface/evaluate@2253a6e12a4b4c2c05ef77b84ea6c0f1188ac926
+nltk
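
Note that the `evaluate` dependency is pinned to the same commit (`2253a6e1…`) referenced in the Space header, so a local `pip install -r requirements.txt` reproduces the environment the Space runs with.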
tests.py
ADDED
@@ -0,0 +1,34 @@
+from pytest import fixture
+
+from nist_mt import NistMt
+
+
+nist = NistMt()
+
+
+@fixture
+def hypothesis_sent():
+    return "It is a guide to action which ensures that the military always obeys the commands of the party"
+
+
+@fixture
+def reference_sent1():
+    return "It is a guide to action that ensures that the military will forever heed Party commands"
+
+
+@fixture
+def reference_sent2():
+    return (
+        "It is the guiding principle which guarantees the military forces always being under the command of the Party"
+    )
+
+
+@fixture
+def reference_sent3():
+    return "It is the practical guide for the army always to heed the directions of the party"
+
+
+def test_nist_sentence(hypothesis_sent, reference_sent1, reference_sent2, reference_sent3):
+    nist_score = nist.compute(
+        predictions=[hypothesis_sent], references=[[reference_sent1, reference_sent2, reference_sent3]]
+    )
+    assert abs(nist_score["nist_mt"] - 3.3709935957649324) < 1e-6
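
The test mirrors the three-reference doctest from `nist_mt.py` and can be run from the Space root with `python -m pytest tests.py`; the `1e-6` tolerance guards against floating-point differences across platforms.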