# Natural Language Toolkit: Tokenizer Utilities
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Steven Bird <[email protected]>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

from re import finditer
from xml.sax.saxutils import escape, unescape


def string_span_tokenize(s, sep):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each occurrence of *sep*.

        >>> from nltk.tokenize.util import string_span_tokenize
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(string_span_tokenize(s, " ")) # doctest: +NORMALIZE_WHITESPACE
        [(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37),
        (38, 44), (45, 48), (49, 55), (56, 58), (59, 73)]
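
        The spans can be sliced back out of the string to recover the tokens
        (note the empty token produced by the double space after "York."):

        >>> [s[start:end] for start, end in string_span_tokenize(s, " ")] # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '', 'Please',
        'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']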

    :param s: the string to be tokenized
    :type s: str
    :param sep: the token separator
    :type sep: str
    :rtype: iter(tuple(int, int))
    """
    if len(sep) == 0:
        raise ValueError("Token delimiter must not be empty")
    left = 0
    while True:
        try:
            right = s.index(sep, left)
            if right != 0:
                yield left, right
        except ValueError:
            # No more separators: emit the trailing token, if any, and stop.
            if left != len(s):
                yield left, len(s)
            break
        left = right + len(sep)


def regexp_span_tokenize(s, regexp):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each successive match of *regexp*.

        >>> from nltk.tokenize.util import regexp_span_tokenize
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(regexp_span_tokenize(s, r'\s')) # doctest: +NORMALIZE_WHITESPACE
        [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36),
        (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
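
        A multi-character separator pattern works too; splitting on runs of
        newlines yields the lines of the string:

        >>> list(regexp_span_tokenize(s, r'\n+'))
        [(0, 23), (24, 51), (52, 64), (66, 73)]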

    :param s: the string to be tokenized
    :type s: str
    :param regexp: regular expression that matches token separators (must not be empty)
    :type regexp: str
    :rtype: iter(tuple(int, int))
    """
    left = 0
    for m in finditer(regexp, s):
        right, next_left = m.span()
        # Skip empty tokens produced by adjacent separator matches.
        if right != left:
            yield left, right
        left = next_left
    yield left, len(s)


def spans_to_relative(spans):
    r"""
    Return a sequence of relative spans, given a sequence of spans.
    Each relative span is a pair ``(offset from the end of the previous
    span, length of the token)``.

        >>> from nltk.tokenize import WhitespaceTokenizer
        >>> from nltk.tokenize.util import spans_to_relative
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s))) # doctest: +NORMALIZE_WHITESPACE
        [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6),
        (1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)]

    :param spans: a sequence of (start, end) offsets of the tokens
    :type spans: iter(tuple(int, int))
    :rtype: iter(tuple(int, int))
    """
    prev = 0
    for left, right in spans:
        yield left - prev, right - left
        prev = right


class CJKChars:
    """
    An object that enumerates the code points of the CJK characters as listed on
    https://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane

    This is a Python port of the CJK code point enumerations of Moses tokenizer:
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309
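
    For example, U+6F22 (a CJK Unified Ideograph) falls within one of the
    ranges enumerated below:

        >>> any(start <= 0x6F22 <= end for start, end in CJKChars.ranges)
        True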
""" | |
    # Hangul Jamo (1100–11FF)
    Hangul_Jamo = (4352, 4607)  # (ord(u"\u1100"), ord(u"\u11ff"))

    # CJK Radicals Supplement (2E80–2EFF)
    # Kangxi Radicals (2F00–2FDF)
    # Ideographic Description Characters (2FF0–2FFF)
    # CJK Symbols and Punctuation (3000–303F)
    # Hiragana (3040–309F)
    # Katakana (30A0–30FF)
    # Bopomofo (3100–312F)
    # Hangul Compatibility Jamo (3130–318F)
    # Kanbun (3190–319F)
    # Bopomofo Extended (31A0–31BF)
    # CJK Strokes (31C0–31EF)
    # Katakana Phonetic Extensions (31F0–31FF)
    # Enclosed CJK Letters and Months (3200–32FF)
    # CJK Compatibility (3300–33FF)
    # CJK Unified Ideographs Extension A (3400–4DBF)
    # Yijing Hexagram Symbols (4DC0–4DFF)
    # CJK Unified Ideographs (4E00–9FFF)
    # Yi Syllables (A000–A48F)
    # Yi Radicals (A490–A4CF)
    CJK_Radicals = (11904, 42191)  # (ord(u"\u2e80"), ord(u"\ua4cf"))

    # Phags-pa (A840–A87F)
    Phags_Pa = (43072, 43135)  # (ord(u"\ua840"), ord(u"\ua87f"))

    # Hangul Syllables (AC00–D7AF)
    Hangul_Syllables = (44032, 55215)  # (ord(u"\uAC00"), ord(u"\uD7AF"))

    # CJK Compatibility Ideographs (F900–FAFF)
    CJK_Compatibility_Ideographs = (63744, 64255)  # (ord(u"\uF900"), ord(u"\uFAFF"))

    # CJK Compatibility Forms (FE30–FE4F)
    CJK_Compatibility_Forms = (65072, 65103)  # (ord(u"\uFE30"), ord(u"\uFE4F"))

    # Range U+FF65–FFDC encodes halfwidth forms of Katakana and Hangul characters
    Katakana_Hangul_Halfwidth = (65381, 65500)  # (ord(u"\uFF65"), ord(u"\uFFDC"))

    # Supplementary Ideographic Plane 20000–2FFFF
    Supplementary_Ideographic_Plane = (
        131072,
        196607,
    )  # (ord(u"\U00020000"), ord(u"\U0002FFFF"))

    ranges = [
        Hangul_Jamo,
        CJK_Radicals,
        Phags_Pa,
        Hangul_Syllables,
        CJK_Compatibility_Ideographs,
        CJK_Compatibility_Forms,
        Katakana_Hangul_Halfwidth,
        Supplementary_Ideographic_Plane,
    ]


def is_cjk(character):
    """
    Python port of Moses' code to check for CJK character.

        >>> CJKChars().ranges
        [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)]
        >>> is_cjk(u'\u33fe')
        True
        >>> is_cjk(u'\uFE5F')
        False
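
        Non-CJK characters can be stripped from a mixed string:

        >>> "".join(ch for ch in u"NLTK \u81ea\u7136\u8a00\u8a9e" if is_cjk(ch))
        '\u81ea\u7136\u8a00\u8a9e'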

    :param character: The character that needs to be checked.
    :type character: char
    :return: bool
    """
    # The hardcoded ranges mirror CJKChars.ranges above.
    return any(
        start <= ord(character) <= end
        for start, end in [
            (4352, 4607),
            (11904, 42191),
            (43072, 43135),
            (44032, 55215),
            (63744, 64255),
            (65072, 65103),
            (65381, 65500),
            (131072, 196607),
        ]
    )


def xml_escape(text):
    """
    This function transforms the input text into an "escaped" version suitable
    for well-formed XML formatting.

    Note that the default xml.sax.saxutils.escape() function doesn't escape
    some characters that Moses does, so we have to add them manually to the
    entities dictionary.

        >>> input_str = ''')| & < > ' " ] ['''
        >>> expected_output = ''')| &amp; &lt; &gt; ' " ] ['''
        >>> escape(input_str) == expected_output
        True
        >>> xml_escape(input_str)
        ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'

    :param text: The text that needs to be escaped.
    :type text: str
    :rtype: str
    """
    return escape(
        text,
        entities={
            r"'": r"&apos;",
            r'"': r"&quot;",
            r"|": r"&#124;",
            r"[": r"&#91;",
            r"]": r"&#93;",
        },
    )


def xml_unescape(text):
    """
    This function transforms the "escaped" version suitable for well-formed
    XML formatting back into a human-readable string.

    Note that the default xml.sax.saxutils.unescape() function doesn't unescape
    some characters that Moses does, so we have to add them manually to the
    entities dictionary.

        >>> from xml.sax.saxutils import unescape
        >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
        >>> expected = ''')| & < > \' " ] ['''
        >>> xml_unescape(s) == expected
        True
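
        For these entities, unescaping inverts escaping:

        >>> xml_unescape(xml_escape('[id] | "x"')) == '[id] | "x"'
        True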

    :param text: The text that needs to be unescaped.
    :type text: str
    :rtype: str
    """
    return unescape(
        text,
        entities={
            r"&apos;": r"'",
            r"&quot;": r'"',
            r"&#124;": r"|",
            r"&#91;": r"[",
            r"&#93;": r"]",
        },
    )


def align_tokens(tokens, sentence):
    """
    Find the offsets of the tokens in *sentence*, as a sequence of
    ``(start, end)`` tuples, given the tokens as well as the source string.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> from nltk.tokenize.util import align_tokens
        >>> s = str("The plane, bound for St Petersburg, crashed in Egypt's "
        ... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh "
        ... "on Saturday.")
        >>> tokens = TreebankWordTokenizer().tokenize(s)
        >>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23),
        ... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54),
        ... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89),
        ... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122),
        ... (123, 131), (131, 132)]
        >>> output = list(align_tokens(tokens, s))
        >>> len(tokens) == len(expected) == len(output)  # Check that the lengths match.
        True
        >>> expected == list(align_tokens(tokens, s))  # Check that the output is as expected.
        True
        >>> tokens == [s[start:end] for start, end in output]  # Check that the slices correspond to the tokens.
        True
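
        If a token cannot be found in the remainder of the string (e.g. because
        the tokenizer normalized it), a ValueError is raised:

        >>> align_tokens(["hello"], "goodbye")
        Traceback (most recent call last):
        ...
        ValueError: substring "hello" not found in "goodbye"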

    :param tokens: The list of strings that are the result of tokenization
    :type tokens: list(str)
    :param sentence: The original string
    :type sentence: str
    :rtype: list(tuple(int, int))
    """
    # Search for each token starting from where the previous one ended.
    point = 0
    offsets = []
    for token in tokens:
        try:
            start = sentence.index(token, point)
        except ValueError as e:
            raise ValueError(f'substring "{token}" not found in "{sentence}"') from e
        point = start + len(token)
        offsets.append((start, point))
    return offsets