# Natural Language Toolkit: Simple Tokenizers
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
# Steven Bird <[email protected]>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
r"""
Simple Tokenizers
These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.
The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> s.split() # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
>>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
>>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE
['Good muffins cost $3.88', 'in New York.  Please buy me',
'two of them.', '', 'Thanks.']
The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer. For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.
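For example, a ``PlaintextCorpusReader`` can be configured to split words
on single spaces (an illustrative sketch; ``/path/to/corpus`` is a
placeholder for a directory of plain-text files)::

    from nltk.corpus.reader import PlaintextCorpusReader
    from nltk.tokenize import SpaceTokenizer

    # '/path/to/corpus' is a placeholder for a real corpus directory.
    reader = PlaintextCorpusReader('/path/to/corpus', r'.*\.txt',
                                   word_tokenizer=SpaceTokenizer())
    words = reader.words()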
"""
from nltk.tokenize.api import StringTokenizer, TokenizerI
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize
class SpaceTokenizer(StringTokenizer):
r"""Tokenize a string using the space character as a delimiter,
which is the same as ``s.split(' ')``.
>>> from nltk.tokenize import SpaceTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
"""
_string = " "
class TabTokenizer(StringTokenizer):
r"""Tokenize a string use the tab character as a delimiter,
the same as ``s.split('\t')``.
>>> from nltk.tokenize import TabTokenizer
>>> TabTokenizer().tokenize('a\tb c\n\t d')
['a', 'b c\n', ' d']
"""
_string = "\t"
class CharTokenizer(StringTokenizer):
"""Tokenize a string into individual characters. If this functionality
is ever required directly, use ``for char in string``.
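
    For example:

        >>> from nltk.tokenize.simple import CharTokenizer
        >>> CharTokenizer().tokenize("abc")
        ['a', 'b', 'c']
        >>> list(CharTokenizer().span_tokenize("abc"))
        [(0, 1), (1, 2), (2, 3)]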
"""
def tokenize(self, s):
return list(s)
def span_tokenize(self, s):
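        # Each character i occupies the span (i, i + 1).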
yield from enumerate(range(1, len(s) + 1))
class LineTokenizer(TokenizerI):
r"""Tokenize a string into its lines, optionally discarding blank lines.
This is similar to ``s.split('\n')``.
>>> from nltk.tokenize import LineTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good muffins cost $3.88', 'in New York.  Please buy me',
'two of them.', '', 'Thanks.']
>>> # same as [l for l in s.split('\n') if l.strip()]:
>>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good muffins cost $3.88', 'in New York.  Please buy me',
'two of them.', 'Thanks.']
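
    For example, ``discard-eof`` removes only a blank line at the end of the
    string, keeping interior blank lines:

        >>> LineTokenizer(blanklines='discard-eof').tokenize("one\ntwo\n\nthree\n\n")
        ['one', 'two', '', 'three']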
:param blanklines: Indicates how blank lines should be handled. Valid values are:
- ``discard``: strip blank lines out of the token list before returning it.
A line is considered blank if it contains only whitespace characters.
- ``keep``: leave all blank lines in the token list.
- ``discard-eof``: if the string ends with a newline, then do not generate
a corresponding token ``''`` after that newline.
"""
def __init__(self, blanklines="discard"):
valid_blanklines = ("discard", "keep", "discard-eof")
if blanklines not in valid_blanklines:
raise ValueError(
"Blank lines must be one of: %s" % " ".join(valid_blanklines)
)
self._blanklines = blanklines
def tokenize(self, s):
lines = s.splitlines()
# If requested, strip off blank lines.
if self._blanklines == "discard":
lines = [l for l in lines if l.rstrip()]
elif self._blanklines == "discard-eof":
if lines and not lines[-1].strip():
lines.pop()
return lines
    # Note: 'discard-eof' is not implemented for span_tokenize; it is
    # treated the same as 'discard'.
def span_tokenize(self, s):
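        # Spans are separated by newlines.  When blank lines are discarded,
        # each newline together with any following blank lines counts as a
        # single separator, so blank lines produce no spans.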
if self._blanklines == "keep":
yield from string_span_tokenize(s, r"\n")
else:
yield from regexp_span_tokenize(s, r"\n(\s+\n)*")
######################################################################
# { Tokenization Functions
######################################################################
# XXX: the module docstring states that there are no separate function versions
def line_tokenize(text, blanklines="discard"):
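    r"""Tokenize *text* into lines using ``LineTokenizer`` with the given
    ``blanklines`` policy.

        >>> line_tokenize("Good muffins cost $3.88\nin New York.\n\nThanks.")
        ['Good muffins cost $3.88', 'in New York.', 'Thanks.']
    """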
return LineTokenizer(blanklines).tokenize(text)