# Natural Language Toolkit: Tokenizer Utilities
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Steven Bird <[email protected]>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

from re import finditer
from xml.sax.saxutils import escape, unescape


def string_span_tokenize(s, sep):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each occurrence of *sep*.

        >>> from nltk.tokenize.util import string_span_tokenize
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(string_span_tokenize(s, " ")) # doctest: +NORMALIZE_WHITESPACE
        [(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37),
        (38, 44), (45, 48), (49, 55), (56, 58), (59, 73)]
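
        The spans can be sliced back out of the string to recover the tokens
        (note the empty token produced by the double space after "York."):

        >>> [s[start:end] for start, end in string_span_tokenize(s, " ")] # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '', 'Please',
        'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']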

    :param s: the string to be tokenized
    :type s: str
    :param sep: the token separator
    :type sep: str
    :rtype: iter(tuple(int, int))
    """
    if len(sep) == 0:
        raise ValueError("Token delimiter must not be empty")
    left = 0
    while True:
        try:
            right = s.index(sep, left)
            if right != 0:
                yield left, right
        except ValueError:
            # No more separators: emit the trailing token, if any, and stop.
            if left != len(s):
                yield left, len(s)
            break
        left = right + len(sep)


def regexp_span_tokenize(s, regexp):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each successive match of *regexp*.

        >>> from nltk.tokenize.util import regexp_span_tokenize
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(regexp_span_tokenize(s, r'\s')) # doctest: +NORMALIZE_WHITESPACE
        [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36),
        (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
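
        A multi-character separator pattern works too; splitting on runs of
        newlines yields the lines of the string:

        >>> list(regexp_span_tokenize(s, r'\n+'))
        [(0, 23), (24, 51), (52, 64), (66, 73)]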

    :param s: the string to be tokenized
    :type s: str
    :param regexp: regular expression that matches token separators (must not be empty)
    :type regexp: str
    :rtype: iter(tuple(int, int))
    """
    left = 0
    for m in finditer(regexp, s):
        right, next_left = m.span()
        # Skip empty tokens produced by adjacent separator matches.
        if right != left:
            yield left, right
        left = next_left
    yield left, len(s)


def spans_to_relative(spans):
    r"""
    Return a sequence of relative spans, given a sequence of spans.
    Each relative span is a pair ``(offset from the end of the previous
    span, length of the token)``.

        >>> from nltk.tokenize import WhitespaceTokenizer
        >>> from nltk.tokenize.util import spans_to_relative
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s))) # doctest: +NORMALIZE_WHITESPACE
        [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6),
        (1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)]

    :param spans: a sequence of (start, end) offsets of the tokens
    :type spans: iter(tuple(int, int))
    :rtype: iter(tuple(int, int))
    """
    prev = 0
    for left, right in spans:
        yield left - prev, right - left
        prev = right


class CJKChars:
    """
    An object that enumerates the code points of the CJK characters as listed on
    https://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane

    This is a Python port of the CJK code point enumerations of Moses tokenizer:
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309
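
    For example, U+6F22 (a CJK Unified Ideograph) falls within one of the
    ranges enumerated below:

        >>> any(start <= 0x6F22 <= end for start, end in CJKChars.ranges)
        True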
""" | |
    # Hangul Jamo (1100–11FF)
    Hangul_Jamo = (4352, 4607)  # (ord(u"\u1100"), ord(u"\u11ff"))

    # CJK Radicals Supplement (2E80–2EFF)
    # Kangxi Radicals (2F00–2FDF)
    # Ideographic Description Characters (2FF0–2FFF)
    # CJK Symbols and Punctuation (3000–303F)
    # Hiragana (3040–309F)
    # Katakana (30A0–30FF)
    # Bopomofo (3100–312F)
    # Hangul Compatibility Jamo (3130–318F)
    # Kanbun (3190–319F)
    # Bopomofo Extended (31A0–31BF)
    # CJK Strokes (31C0–31EF)
    # Katakana Phonetic Extensions (31F0–31FF)
    # Enclosed CJK Letters and Months (3200–32FF)
    # CJK Compatibility (3300–33FF)
    # CJK Unified Ideographs Extension A (3400–4DBF)
    # Yijing Hexagram Symbols (4DC0–4DFF)
    # CJK Unified Ideographs (4E00–9FFF)
    # Yi Syllables (A000–A48F)
    # Yi Radicals (A490–A4CF)
    CJK_Radicals = (11904, 42191)  # (ord(u"\u2e80"), ord(u"\ua4cf"))

    # Phags-pa (A840–A87F)
    Phags_Pa = (43072, 43135)  # (ord(u"\ua840"), ord(u"\ua87f"))

    # Hangul Syllables (AC00–D7AF)
    Hangul_Syllables = (44032, 55215)  # (ord(u"\uAC00"), ord(u"\uD7AF"))

    # CJK Compatibility Ideographs (F900–FAFF)
    CJK_Compatibility_Ideographs = (63744, 64255)  # (ord(u"\uF900"), ord(u"\uFAFF"))

    # CJK Compatibility Forms (FE30–FE4F)
    CJK_Compatibility_Forms = (65072, 65103)  # (ord(u"\uFE30"), ord(u"\uFE4F"))

    # Range U+FF65–FFDC encodes halfwidth forms of Katakana and Hangul characters
    Katakana_Hangul_Halfwidth = (65381, 65500)  # (ord(u"\uFF65"), ord(u"\uFFDC"))

    # Supplementary Ideographic Plane 20000–2FFFF
    Supplementary_Ideographic_Plane = (
        131072,
        196607,
    )  # (ord(u"\U00020000"), ord(u"\U0002FFFF"))

    ranges = [
        Hangul_Jamo,
        CJK_Radicals,
        Phags_Pa,
        Hangul_Syllables,
        CJK_Compatibility_Ideographs,
        CJK_Compatibility_Forms,
        Katakana_Hangul_Halfwidth,
        Supplementary_Ideographic_Plane,
    ]


def is_cjk(character):
    """
    Python port of Moses' code to check for CJK character.

        >>> CJKChars().ranges
        [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)]
        >>> is_cjk(u'\u33fe')
        True
        >>> is_cjk(u'\uFE5F')
        False
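
        Non-CJK characters can be stripped from a mixed string:

        >>> "".join(ch for ch in u"NLTK \u81ea\u7136\u8a00\u8a9e" if is_cjk(ch))
        '\u81ea\u7136\u8a00\u8a9e'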

    :param character: The character that needs to be checked.
    :type character: char
    :return: bool
    """
    # The hardcoded ranges mirror CJKChars.ranges above.
    return any(
        start <= ord(character) <= end
        for start, end in [
            (4352, 4607),
            (11904, 42191),
            (43072, 43135),
            (44032, 55215),
            (63744, 64255),
            (65072, 65103),
            (65381, 65500),
            (131072, 196607),
        ]
    )


def xml_escape(text):
    """
    This function transforms the input text into an "escaped" version suitable
    for well-formed XML formatting.

    Note that the default xml.sax.saxutils.escape() function doesn't escape
    some characters that Moses does, so we have to add them manually to the
    entities dictionary.

        >>> input_str = ''')| & < > ' " ] ['''
        >>> expected_output = ''')| &amp; &lt; &gt; ' " ] ['''
        >>> escape(input_str) == expected_output
        True
        >>> xml_escape(input_str)
        ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'

    :param text: The text that needs to be escaped.
    :type text: str
    :rtype: str
    """
    return escape(
        text,
        entities={
            r"'": r"&apos;",
            r'"': r"&quot;",
            r"|": r"&#124;",
            r"[": r"&#91;",
            r"]": r"&#93;",
        },
    )


def xml_unescape(text):
    """
    This function transforms the "escaped" version suitable for well-formed
    XML formatting back into a human-readable string.

    Note that the default xml.sax.saxutils.unescape() function doesn't unescape
    some characters that Moses does, so we have to add them manually to the
    entities dictionary.

        >>> from xml.sax.saxutils import unescape
        >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
        >>> expected = ''')| & < > \' " ] ['''
        >>> xml_unescape(s) == expected
        True
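
        For these entities, unescaping inverts escaping:

        >>> xml_unescape(xml_escape('[id] | "x"')) == '[id] | "x"'
        True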

    :param text: The text that needs to be unescaped.
    :type text: str
    :rtype: str
    """
    return unescape(
        text,
        entities={
            r"&apos;": r"'",
            r"&quot;": r'"',
            r"&#124;": r"|",
            r"&#91;": r"[",
            r"&#93;": r"]",
        },
    )


def align_tokens(tokens, sentence):
    """
    Find the offsets of the tokens in *sentence*, as a sequence of
    ``(start, end)`` tuples, given the tokens as well as the source string.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> from nltk.tokenize.util import align_tokens
        >>> s = str("The plane, bound for St Petersburg, crashed in Egypt's "
        ... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh "
        ... "on Saturday.")
        >>> tokens = TreebankWordTokenizer().tokenize(s)
        >>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23),
        ... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54),
        ... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89),
        ... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122),
        ... (123, 131), (131, 132)]
        >>> output = list(align_tokens(tokens, s))
        >>> len(tokens) == len(expected) == len(output)  # Check that the lengths match.
        True
        >>> expected == list(align_tokens(tokens, s))  # Check that the output is as expected.
        True
        >>> tokens == [s[start:end] for start, end in output]  # Check that the slices correspond to the tokens.
        True
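
        If a token cannot be found in the remainder of the string (e.g. because
        the tokenizer normalized it), a ValueError is raised:

        >>> align_tokens(["hello"], "goodbye")
        Traceback (most recent call last):
        ...
        ValueError: substring "hello" not found in "goodbye"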

    :param tokens: The list of strings that are the result of tokenization
    :type tokens: list(str)
    :param sentence: The original string
    :type sentence: str
    :rtype: list(tuple(int, int))
    """
    # Search for each token starting from where the previous one ended.
    point = 0
    offsets = []
    for token in tokens:
        try:
            start = sentence.index(token, point)
        except ValueError as e:
            raise ValueError(f'substring "{token}" not found in "{sentence}"') from e
        point = start + len(token)
        offsets.append((start, point))
    return offsets