# Hosting-page residue: Spaces status "Running"; file size 3,344 bytes; revision d807f7c.
# Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py
# Copyright 2020 SacreBLEU Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from functools import lru_cache
class BaseTokenizer:
    """No-op tokenizer that serves as the common parent of the real tokenizers."""

    def __call__(self, line):
        """
        Apply the tokenizer to one segment; the base class changes nothing.

        :param line: the segment to tokenize
        :return: ``line`` exactly as it was passed in
        """
        return line

    def signature(self):
        """
        Identify this tokenizer by a short name.

        :return: the signature string, ``"none"`` for the base class
        """
        return "none"
class TokenizerRegexp(BaseTokenizer):
    """Regex-based tokenizer used as the common post-processing step.

    Applies a fixed sequence of substitutions that put spaces around
    punctuation, then splits the result on whitespace.
    """

    def signature(self):
        """
        Returns a signature for the tokenizer.

        :return: the signature string ``"re"``
        """
        return "re"

    def __init__(self):
        # (compiled pattern, replacement) pairs, applied in this order.
        self._re = [
            # language-dependent part (assuming Western languages)
            (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
            # tokenize period and comma unless preceded by a digit
            (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
            # tokenize period and comma unless followed by a digit
            (re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
            # tokenize dash when preceded by a digit
            (re.compile(r"([0-9])(-)"), r"\1 \2 "),
            # NOTE: runs of whitespace are not collapsed with a regex here;
            # the str.split() in _tokenize takes care of that.
        ]

    @lru_cache(maxsize=2**16)
    def _tokenize(self, line):
        """Run the substitutions and return the tokens as an immutable tuple.

        The result is memoized, so it must not be a mutable object: a cached
        list would be handed out again on every cache hit, and a caller that
        modified it would change what later calls receive.

        :param line: a segment to tokenize
        :return: a tuple of token strings
        """
        for pattern, replacement in self._re:
            line = pattern.sub(replacement, line)
        # split() drops leading/trailing whitespace and collapses inner runs
        return tuple(line.split())

    def __call__(self, line):
        """Common post-processing tokenizer for `13a` and `zh` tokenizers.

        Unlike the original tokenizer, which returned a single space-joined
        string, this returns the individual words.

        :param line: a segment to tokenize
        :return: a new list of tokens (safe for the caller to modify)
        """
        return list(self._tokenize(line))
class Tokenizer13a(BaseTokenizer):
    """Minimal tokenizer intended to be equivalent to mteval-v13a (used by WMT)."""

    def signature(self):
        """
        Returns a signature for the tokenizer.

        :return: the signature string ``"13a"``
        """
        return "13a"

    def __init__(self):
        # Regex-based step applied after the language-independent clean-up.
        self._post_tokenizer = TokenizerRegexp()

    @lru_cache(maxsize=2**16)
    def _tokenize(self, line):
        """Clean up ``line`` and tokenize it, returning an immutable tuple.

        The result is memoized, so it is stored as a tuple; a cached list
        would be shared between every caller that hits the cache.

        :param line: a segment to tokenize
        :return: a tuple of token strings
        """
        # language-independent part:
        line = line.replace("<skipped>", "")
        # re-join words hyphenated across a line break, then flatten newlines
        line = line.replace("-\n", "")
        line = line.replace("\n", " ")

        # Only scan for entities when an ampersand is present at all.
        if "&" in line:
            line = line.replace("&quot;", '"')
            line = line.replace("&amp;", "&")
            line = line.replace("&lt;", "<")
            line = line.replace("&gt;", ">")

        # The post-tokenizer is given the segment padded with one space on
        # each side.
        return tuple(self._post_tokenizer(f" {line} "))

    def __call__(self, line):
        """Tokenizes an input line using a relatively minimal tokenization
        that is however equivalent to mteval-v13a, used by WMT.

        :param line: a segment to tokenize
        :return: a new list of tokens (safe for the caller to modify)
        """
        return list(self._tokenize(line))
|