# Natural Language Toolkit: Simple Tokenizers
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
#         Steven Bird <[email protected]>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
r""" | |
Simple Tokenizers | |
These tokenizers divide strings into substrings using the string | |
``split()`` method. | |
When tokenizing using a particular delimiter string, use | |
the string ``split()`` method directly, as this is more efficient. | |
The simple tokenizers are *not* available as separate functions; | |
instead, you should just use the string ``split()`` method directly: | |
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." | |
>>> s.split() # doctest: +NORMALIZE_WHITESPACE | |
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', | |
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] | |
>>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE | |
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '', | |
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.'] | |
>>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE | |
['Good muffins cost $3.88', 'in New York. Please buy me', | |
'two of them.', '', 'Thanks.'] | |
The simple tokenizers are mainly useful because they follow the | |
standard ``TokenizerI`` interface, and so can be used with any code | |
that expects a tokenizer. For example, these tokenizers can be used | |
to specify the tokenization conventions when building a `CorpusReader`. | |
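
For example, a sketch of plugging one of these tokenizers into a corpus
reader (the corpus directory ``my_corpus/`` and its ``*.txt`` files are
hypothetical, so the construction is skipped when doctests run):

    >>> from nltk.corpus.reader import PlaintextCorpusReader
    >>> from nltk.tokenize import SpaceTokenizer
    >>> reader = PlaintextCorpusReader(
    ...     'my_corpus/', r'.*\.txt',
    ...     word_tokenizer=SpaceTokenizer())  # doctest: +SKIP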
""" | |

from nltk.tokenize.api import StringTokenizer, TokenizerI
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


class SpaceTokenizer(StringTokenizer):
    r"""Tokenize a string using the space character as a delimiter,
    which is the same as ``s.split(' ')``.

        >>> from nltk.tokenize import SpaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
        >>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    """

    _string = " "


class TabTokenizer(StringTokenizer):
    r"""Tokenize a string using the tab character as a delimiter,
    the same as ``s.split('\t')``.

        >>> from nltk.tokenize import TabTokenizer
        >>> TabTokenizer().tokenize('a\tb c\n\t d')
        ['a', 'b c\n', ' d']
    """

    _string = "\t"


class CharTokenizer(StringTokenizer):
    """Tokenize a string into individual characters. If this functionality
    is ever required directly, use ``for char in string``.
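
    A small usage sketch; note the import is from ``nltk.tokenize.simple``,
    since this class may not be re-exported from ``nltk.tokenize``:

        >>> from nltk.tokenize.simple import CharTokenizer
        >>> CharTokenizer().tokenize("abc")
        ['a', 'b', 'c']
        >>> list(CharTokenizer().span_tokenize("abc"))
        [(0, 1), (1, 2), (2, 3)]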
""" | |

    def tokenize(self, s):
        return list(s)

    def span_tokenize(self, s):
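        # ``enumerate(range(1, len(s) + 1))`` pairs each start offset
        # 0, 1, ... with the end offset one past it, i.e. one
        # (start, end) span per character.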
        yield from enumerate(range(1, len(s) + 1))


class LineTokenizer(TokenizerI):
    r"""Tokenize a string into its lines, optionally discarding blank lines.
    This is similar to ``s.split('\n')``.

        >>> from nltk.tokenize import LineTokenizer
        >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
        >>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York. Please buy me',
        'two of them.', '', 'Thanks.']
        >>> # same as [l for l in s.split('\n') if l.strip()]:
        >>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York. Please buy me',
        'two of them.', 'Thanks.']

    :param blanklines: Indicates how blank lines should be handled. Valid values are:

        - ``discard``: strip blank lines out of the token list before returning it.
          A line is considered blank if it contains only whitespace characters.
        - ``keep``: leave all blank lines in the token list.
        - ``discard-eof``: if the string ends with a newline, then do not generate
          a corresponding token ``''`` after that newline.
    """

    def __init__(self, blanklines="discard"):
        valid_blanklines = ("discard", "keep", "discard-eof")
        if blanklines not in valid_blanklines:
            raise ValueError(
                "Blank lines must be one of: %s" % " ".join(valid_blanklines)
            )

        self._blanklines = blanklines

    def tokenize(self, s):
        lines = s.splitlines()
        # If requested, strip off blank lines.
        if self._blanklines == "discard":
            lines = [l for l in lines if l.rstrip()]
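        # For ``discard-eof``, drop only a final line that contains
        # nothing but whitespace.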
        elif self._blanklines == "discard-eof":
            if lines and not lines[-1].strip():
                lines.pop()
        return lines

    def span_tokenize(self, s):
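        # With ``keep``, each newline is a separator; otherwise a run of
        # newlines and whitespace-only lines counts as one separator, so
        # blank lines yield no spans.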
        if self._blanklines == "keep":
            # string_span_tokenize searches for the separator literally,
            # so pass a real newline, not the raw two-character string r"\n".
            yield from string_span_tokenize(s, "\n")
        else:
            yield from regexp_span_tokenize(s, r"\n(\s+\n)*")


######################################################################
# { Tokenization Functions
######################################################################
# XXX: it is stated in module docs that there are no function versions


def line_tokenize(text, blanklines="discard"):
    r"""Tokenize *text* into lines using :class:`LineTokenizer`.
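
    A minimal example of the default ``discard`` behaviour:

    >>> line_tokenize("one\ntwo\n\nthree")
    ['one', 'two', 'three']
    """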
    return LineTokenizer(blanklines).tokenize(text)