# Natural Language Toolkit: Simple Tokenizers
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
# Steven Bird <[email protected]>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
r"""
Simple Tokenizers
These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.
The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> s.split() # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
>>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
>>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE
['Good muffins cost $3.88', 'in New York.  Please buy me',
'two of them.', '', 'Thanks.']
The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer. For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.
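For example, a ``PlaintextCorpusReader`` can be configured to split words
on single spaces (an illustrative sketch; ``/path/to/corpus`` is a
placeholder for a directory of plain-text files)::

    from nltk.corpus.reader import PlaintextCorpusReader
    from nltk.tokenize import SpaceTokenizer

    # '/path/to/corpus' is a placeholder for a real corpus directory.
    reader = PlaintextCorpusReader('/path/to/corpus', r'.*\.txt',
                                   word_tokenizer=SpaceTokenizer())
    words = reader.words()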
"""
from nltk.tokenize.api import StringTokenizer, TokenizerI
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize
class SpaceTokenizer(StringTokenizer):
r"""Tokenize a string using the space character as a delimiter,
which is the same as ``s.split(' ')``.
>>> from nltk.tokenize import SpaceTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
"""
_string = " "
class TabTokenizer(StringTokenizer):
r"""Tokenize a string use the tab character as a delimiter,
the same as ``s.split('\t')``.
>>> from nltk.tokenize import TabTokenizer
>>> TabTokenizer().tokenize('a\tb c\n\t d')
['a', 'b c\n', ' d']
"""
_string = "\t"
class CharTokenizer(StringTokenizer):
"""Tokenize a string into individual characters. If this functionality
is ever required directly, use ``for char in string``.
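
    For example:

        >>> from nltk.tokenize.simple import CharTokenizer
        >>> CharTokenizer().tokenize("abc")
        ['a', 'b', 'c']
        >>> list(CharTokenizer().span_tokenize("abc"))
        [(0, 1), (1, 2), (2, 3)]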
"""
def tokenize(self, s):
return list(s)
def span_tokenize(self, s):
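        # Each character i occupies the span (i, i + 1).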
yield from enumerate(range(1, len(s) + 1))
class LineTokenizer(TokenizerI):
r"""Tokenize a string into its lines, optionally discarding blank lines.
This is similar to ``s.split('\n')``.
>>> from nltk.tokenize import LineTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good muffins cost $3.88', 'in New York.  Please buy me',
'two of them.', '', 'Thanks.']
>>> # same as [l for l in s.split('\n') if l.strip()]:
>>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good muffins cost $3.88', 'in New York.  Please buy me',
'two of them.', 'Thanks.']
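
    For example, ``discard-eof`` removes only a blank line at the end of the
    string, keeping interior blank lines:

        >>> LineTokenizer(blanklines='discard-eof').tokenize("one\ntwo\n\nthree\n\n")
        ['one', 'two', '', 'three']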
:param blanklines: Indicates how blank lines should be handled. Valid values are:
- ``discard``: strip blank lines out of the token list before returning it.
A line is considered blank if it contains only whitespace characters.
- ``keep``: leave all blank lines in the token list.
- ``discard-eof``: if the string ends with a newline, then do not generate
a corresponding token ``''`` after that newline.
"""
def __init__(self, blanklines="discard"):
valid_blanklines = ("discard", "keep", "discard-eof")
if blanklines not in valid_blanklines:
raise ValueError(
"Blank lines must be one of: %s" % " ".join(valid_blanklines)
)
self._blanklines = blanklines
def tokenize(self, s):
lines = s.splitlines()
# If requested, strip off blank lines.
if self._blanklines == "discard":
lines = [l for l in lines if l.rstrip()]
elif self._blanklines == "discard-eof":
if lines and not lines[-1].strip():
lines.pop()
return lines
    # Note: 'discard-eof' is not implemented for span_tokenize; it is
    # treated the same as 'discard'.
def span_tokenize(self, s):
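        # Spans are separated by newlines.  When blank lines are discarded,
        # each newline together with any following blank lines counts as a
        # single separator, so blank lines produce no spans.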
if self._blanklines == "keep":
yield from string_span_tokenize(s, r"\n")
else:
yield from regexp_span_tokenize(s, r"\n(\s+\n)*")
######################################################################
# { Tokenization Functions
######################################################################
# XXX: the module docstring states that there are no separate function versions
def line_tokenize(text, blanklines="discard"):
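    r"""Tokenize *text* into lines using ``LineTokenizer`` with the given
    ``blanklines`` policy.

        >>> line_tokenize("Good muffins cost $3.88\nin New York.\n\nThanks.")
        ['Good muffins cost $3.88', 'in New York.', 'Thanks.']
    """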
return LineTokenizer(blanklines).tokenize(text)