#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import regex
from itertools import chain
class MosesPunctNormalizer:
"""
This is a Python port of the Moses punctuation normalizer from
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl
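
    Illustrative example (a sketch using the default English settings; the
    expected output follows from the substitution lists defined in this class,
    not from an upstream test suite):

        >>> MosesPunctNormalizer(lang="en").normalize("Hello   «world»  …")
        'Hello "world" ...'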
"""
EXTRA_WHITESPACE = [ # lines 21 - 30
(r"\r", r""),
(r"\(", r" ("),
(r"\)", r") "),
(r" +", r" "),
(r"\) ([.!:?;,])", r")\g<1>"),
(r"\( ", r"("),
(r" \)", r")"),
(r"(\d) %", r"\g<1>%"),
(r" :", r":"),
(r" ;", r";"),
]
NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')] # lines 33 - 34
NORMALIZE_UNICODE = [ # lines 37 - 50
("„", r'"'),
("“", r'"'),
("”", r'"'),
("–", r"-"),
("—", r" - "),
(r" +", r" "),
("´", r"'"),
("([a-zA-Z])‘([a-zA-Z])", r"\g<1>'\g<2>"),
("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"),
("‘", r"'"),
("‚", r"'"),
("’", r"'"),
(r"''", r'"'),
("´´", r'"'),
("…", r"..."),
]
FRENCH_QUOTES = [ # lines 52 - 57
("\u00A0«\u00A0", r'"'),
("«\u00A0", r'"'),
("«", r'"'),
("\u00A0»\u00A0", r'"'),
("\u00A0»", r'"'),
("»", r'"'),
]
HANDLE_PSEUDO_SPACES = [ # lines 59 - 67
("\u00A0%", r"%"),
("nº\u00A0", "nº "),
("\u00A0:", r":"),
("\u00A0ºC", " ºC"),
("\u00A0cm", r" cm"),
("\u00A0\\?", "?"),
("\u00A0\\!", "!"),
("\u00A0;", r";"),
(",\u00A0", r", "),
(r" +", r" "),
]
EN_QUOTATION_FOLLOWED_BY_COMMA = [(r'"([,.]+)', r'\g<1>"')]
    # German/Spanish/French convention: the comma/period moves outside the closing quote.
    DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [
(r',"', r'",'),
(r'(\.+)"(\s*[^<])', r'"\g<1>\g<2>'), # don't fix period at end of sentence
]
    # For these languages, a non-breaking space between digits becomes a comma.
    DE_ES_CZ_CS_FR = [
("(\\d)\u00A0(\\d)", r"\g<1>,\g<2>"),
]
    # For all other languages, a non-breaking space between digits becomes a period.
    OTHER = [
("(\\d)\u00A0(\\d)", r"\g<1>.\g<2>"),
]
# Regex substitutions from replace-unicode-punctuation.perl
# https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    REPLACE_UNICODE_PUNCTUATION = [
        ("，", ","),
        (r"。\s*", ". "),
        ("、", ","),
        ("”", '"'),
        ("“", '"'),
        ("∶", ":"),
        ("：", ":"),
        ("？", "?"),
        ("《", '"'),
        ("》", '"'),
        ("）", ")"),
        ("！", "!"),
        ("（", "("),
        ("；", ";"),
        ("」", '"'),
        ("「", '"'),
        ("０", "0"),
        ("１", "1"),
        ("２", "2"),
        ("３", "3"),
        ("４", "4"),
        ("５", "5"),
        ("６", "6"),
        ("７", "7"),
        ("８", "8"),
        ("９", "9"),
        (r"．\s*", ". "),
        ("～", "~"),
        ("’", "'"),
        ("…", "..."),
        ("━", "-"),
        ("〈", "<"),
        ("〉", ">"),
        ("【", "["),
        ("】", "]"),
        ("％", "%"),
    ]
def __init__(
self,
lang="en",
penn=True,
norm_quote_commas=True,
norm_numbers=True,
pre_replace_unicode_punct=False,
post_remove_control_chars=False,
):
"""
:param language: The two-letter language code.
:type lang: str
:param penn: Normalize Penn Treebank style quotations.
:type penn: bool
:param norm_quote_commas: Normalize quotations and commas
:type norm_quote_commas: bool
:param norm_numbers: Normalize numbers
:type norm_numbers: bool
"""
self.substitutions = [
self.EXTRA_WHITESPACE,
self.NORMALIZE_UNICODE,
self.FRENCH_QUOTES,
self.HANDLE_PSEUDO_SPACES,
]
if penn: # Adds the penn substitutions after extra_whitespace regexes.
self.substitutions.insert(1, self.NORMALIZE_UNICODE_IF_NOT_PENN)
if norm_quote_commas:
if lang == "en":
self.substitutions.append(self.EN_QUOTATION_FOLLOWED_BY_COMMA)
elif lang in ["de", "es", "fr"]:
self.substitutions.append(self.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA)
if norm_numbers:
if lang in ["de", "es", "cz", "cs", "fr"]:
self.substitutions.append(self.DE_ES_CZ_CS_FR)
else:
self.substitutions.append(self.OTHER)
self.substitutions = list(chain(*self.substitutions))
self.pre_replace_unicode_punct = pre_replace_unicode_punct
self.post_remove_control_chars = post_remove_control_chars
def normalize(self, text):
"""
Returns a string with normalized punctuation.
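
        Illustrative example (a sketch, default English settings):

            >>> MosesPunctNormalizer(lang="en").normalize("Voilà\u00A0!")
            'Voilà!'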
"""
# Optionally, replace unicode puncts BEFORE normalization.
if self.pre_replace_unicode_punct:
text = self.replace_unicode_punct(text)
# Actual normalization.
for regexp, substitution in self.substitutions:
# print(regexp, substitution)
text = re.sub(regexp, substitution, str(text))
# print(text)
        # Optionally, remove control characters AFTER normalization.
if self.post_remove_control_chars:
text = self.remove_control_chars(text)
return text.strip()
def replace_unicode_punct(self, text):
for regexp, substitution in self.REPLACE_UNICODE_PUNCTUATION:
text = re.sub(regexp, substitution, str(text))
return text
def remove_control_chars(self, text):
return regex.sub(r"\p{C}", "", text)
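

# ---------------------------------------------------------------------------
# Illustrative usage only (a sketch, not part of the original Moses/sacremoses
# port): drives the normalizer on a few made-up sample strings that exercise
# the rules defined above.
if __name__ == "__main__":
    # Default English normalizer: collapses runs of spaces, maps guillemets
    # and curly double quotes to ASCII ", and rewrites … as "...".
    en_normalizer = MosesPunctNormalizer(lang="en")
    for sample in [
        "Hello   «world»  …",
        "It costs 5\u00A0% of the total",
    ]:
        print(repr(sample), "->", repr(en_normalizer.normalize(sample)))

    # With pre_replace_unicode_punct=True, fullwidth/CJK punctuation is first
    # mapped to ASCII by replace_unicode_punct(), then normalized as usual.
    cjk_normalizer = MosesPunctNormalizer(lang="en", pre_replace_unicode_punct=True)
    print(repr(cjk_normalizer.normalize("４５％ 할인！")))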