# Natural Language Toolkit: Simple Tokenizers
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
#         Steven Bird <[email protected]>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

r"""

Simple Tokenizers



These tokenizers divide strings into substrings using the string

``split()`` method.

When tokenizing using a particular delimiter string, use

the string ``split()`` method directly, as this is more efficient.



The simple tokenizers are *not* available as separate functions;

instead, you should just use the string ``split()`` method directly:



    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."

    >>> s.split() # doctest: +NORMALIZE_WHITESPACE

    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',

    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']

    >>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE

    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',

    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']

    >>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE

    ['Good muffins cost $3.88', 'in New York.  Please buy me',

    'two of them.', '', 'Thanks.']



The simple tokenizers are mainly useful because they follow the

standard ``TokenizerI`` interface, and so can be used with any code

that expects a tokenizer.  For example, these tokenizers can be used

to specify the tokenization conventions when building a `CorpusReader`.

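A minimal sketch of that use (``'/path/to/corpus'`` is a placeholder
directory of plain-text files, so the reader construction is skipped here):

    >>> from nltk.corpus.reader import PlaintextCorpusReader
    >>> from nltk.tokenize import SpaceTokenizer
    >>> reader = PlaintextCorpusReader('/path/to/corpus', r'.*\.txt', word_tokenizer=SpaceTokenizer())  # doctest: +SKIP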


"""

from nltk.tokenize.api import StringTokenizer, TokenizerI
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


class SpaceTokenizer(StringTokenizer):
    r"""Tokenize a string using the space character as a delimiter,

    which is the same as ``s.split(' ')``.



        >>> from nltk.tokenize import SpaceTokenizer

        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."

        >>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE

        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',

        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']

    """

    _string = " "


class TabTokenizer(StringTokenizer):
    r"""Tokenize a string use the tab character as a delimiter,

    the same as ``s.split('\t')``.



        >>> from nltk.tokenize import TabTokenizer

        >>> TabTokenizer().tokenize('a\tb c\n\t d')

        ['a', 'b c\n', ' d']

    """

    _string = "\t"


class CharTokenizer(StringTokenizer):
    """Tokenize a string into individual characters.  If this functionality

    is ever required directly, use ``for char in string``.

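    For example, each character becomes one token, and ``span_tokenize``
    yields the matching one-character ``(start, end)`` offsets:

        >>> CharTokenizer().tokenize("abc")
        ['a', 'b', 'c']
        >>> list(CharTokenizer().span_tokenize("abc"))
        [(0, 1), (1, 2), (2, 3)]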
    """

    def tokenize(self, s):
        return list(s)

    def span_tokenize(self, s):
        # Character i occupies the span (i, i + 1).
        yield from enumerate(range(1, len(s) + 1))


class LineTokenizer(TokenizerI):
    r"""Tokenize a string into its lines, optionally discarding blank lines.

    This is similar to ``s.split('\n')``.



        >>> from nltk.tokenize import LineTokenizer

        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."

        >>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE

        ['Good muffins cost $3.88', 'in New York.  Please buy me',

        'two of them.', '', 'Thanks.']

        >>> # same as [l for l in s.split('\n') if l.strip()]:

        >>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE

        ['Good muffins cost $3.88', 'in New York.  Please buy me',

        'two of them.', 'Thanks.']



    :param blanklines: Indicates how blank lines should be handled.  Valid values are:



        - ``discard``: strip blank lines out of the token list before returning it.

           A line is considered blank if it contains only whitespace characters.

        - ``keep``: leave all blank lines in the token list.

        - ``discard-eof``: if the string ends with a newline, then do not generate

           a corresponding token ``''`` after that newline.

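    For example, ``discard-eof`` drops a trailing blank line while keeping
    interior blank lines:

        >>> LineTokenizer(blanklines='discard-eof').tokenize('One\n\nTwo\n\n')
        ['One', '', 'Two']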
    """

    def __init__(self, blanklines="discard"):
        valid_blanklines = ("discard", "keep", "discard-eof")
        if blanklines not in valid_blanklines:
            raise ValueError(
                "Blank lines must be one of: %s" % " ".join(valid_blanklines)
            )

        self._blanklines = blanklines

    def tokenize(self, s):
        lines = s.splitlines()
        # If requested, strip off blank lines.
        if self._blanklines == "discard":
            lines = [l for l in lines if l.rstrip()]
        elif self._blanklines == "discard-eof":
            if lines and not lines[-1].strip():
                lines.pop()
        return lines

    # Note: 'discard-eof' is not implemented for span_tokenize; blank lines
    # are handled as in 'discard'.
    def span_tokenize(self, s):
        if self._blanklines == "keep":
            # string_span_tokenize() splits on a literal separator, so pass an
            # actual newline character rather than the raw string r"\n".
            yield from string_span_tokenize(s, "\n")
        else:
            yield from regexp_span_tokenize(s, r"\n(\s+\n)*")


######################################################################
# { Tokenization Functions
######################################################################
# XXX: the module docstring states that there are no separate function versions


def line_tokenize(text, blanklines="discard"):
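    r"""Return the lines of *text* as a list, using ``LineTokenizer``.

    The *blanklines* argument is passed through to ``LineTokenizer``
    unchanged; by default, blank lines are discarded.

        >>> line_tokenize("Good muffins cost $3.88\nin New York.\n\nThanks.")
        ['Good muffins cost $3.88', 'in New York.', 'Thanks.']
    """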
    return LineTokenizer(blanklines).tokenize(text)