# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
#         Steven Bird <[email protected]>
#         Trevor Cohn <[email protected]>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
r"""
Regular-Expression Tokenizers
A ``RegexpTokenizer`` splits a string into substrings using a regular expression.
For example, the following tokenizer forms tokens out of alphabetic sequences,
money expressions, and any other non-whitespace sequences:
>>> from nltk.tokenize import RegexpTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
>>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
A ``RegexpTokenizer`` can use its regexp to match delimiters instead:
>>> tokenizer = RegexpTokenizer(r'\s+', gaps=True)
>>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
Note that empty tokens are not returned when the delimiter appears at
the start or end of the string.
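
For instance, delimiters at the start or end of the string yield no
empty tokens:

    >>> tokenizer.tokenize("  Good muffins  ")
    ['Good', 'muffins']
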

The material between the tokens is discarded. For example,
the following tokenizer selects just the capitalized words:

    >>> capword_tokenizer = RegexpTokenizer(r'[A-Z]\w+')
    >>> capword_tokenizer.tokenize(s)
    ['Good', 'New', 'York', 'Please', 'Thanks']

This module contains several subclasses of ``RegexpTokenizer``
that use pre-defined regular expressions.

    >>> from nltk.tokenize import BlanklineTokenizer
    >>> # Uses '\s*\n\s*\n\s*':
    >>> BlanklineTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.',
    'Thanks.']

All of the regular expression tokenizers are also available as functions:

    >>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
    >>> regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+') # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
    '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> blankline_tokenize(s)
    ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', 'Thanks.']

Caution: The function ``regexp_tokenize()`` takes the text as its
first argument, and the regular expression pattern as its second
argument. This differs from the conventions used by Python's
``re`` functions, where the pattern is always the first argument.
(This is for consistency with the other NLTK tokenizers.)
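
``regexp_tokenize()`` accepts the same keyword arguments as ``RegexpTokenizer``;
for example, passing ``gaps=True`` splits on the pattern instead of matching it:

    >>> regexp_tokenize(s, pattern=r'\s+', gaps=True) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']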
"""
import re
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import regexp_span_tokenize
class RegexpTokenizer(TokenizerI):
r"""
A tokenizer that splits a string using a regular expression, which
    matches either the tokens or the separators between tokens.

        >>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

    :type pattern: str
:param pattern: The pattern used to build this tokenizer.
        (This pattern must not contain capturing parentheses;
        use non-capturing parentheses, e.g. (?:...), instead, since
        capturing groups would change what ``re.findall()`` and
        ``re.split()`` return.)
:type gaps: bool
:param gaps: True if this tokenizer's pattern should be used
to find separators between tokens; False if this
tokenizer's pattern should be used to find the tokens
themselves.
:type discard_empty: bool
:param discard_empty: True if any empty tokens `''`
generated by the tokenizer should be discarded. Empty
tokens can only be generated if `_gaps == True`.
:type flags: int
:param flags: The regexp flags used to compile this
tokenizer's pattern. By default, the following flags are
used: `re.UNICODE | re.MULTILINE | re.DOTALL`.
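
    Note that an explicit ``flags`` value replaces these defaults rather
    than adding to them:

        >>> tokenizer = RegexpTokenizer(r'[a-z]+',
        ...     flags=re.IGNORECASE | re.UNICODE | re.MULTILINE | re.DOTALL)
        >>> tokenizer.tokenize("ABC def")
        ['ABC', 'def']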
"""
def __init__(
self,
pattern,
gaps=False,
discard_empty=True,
flags=re.UNICODE | re.MULTILINE | re.DOTALL,
):
# If they gave us a regexp object, extract the pattern.
pattern = getattr(pattern, "pattern", pattern)
self._pattern = pattern
self._gaps = gaps
self._discard_empty = discard_empty
self._flags = flags
        self._regexp = None

    def _check_regexp(self):
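        # Compile the pattern lazily, the first time it is needed.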
if self._regexp is None:
            self._regexp = re.compile(self._pattern, self._flags)

    def tokenize(self, text):
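        """
        Return a list of the tokens in *text*, using this tokenizer's
        pattern to match either the tokens themselves or, if ``gaps``
        was set, the separators between them.
        """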
self._check_regexp()
# If our regexp matches gaps, use re.split:
if self._gaps:
if self._discard_empty:
return [tok for tok in self._regexp.split(text) if tok]
else:
return self._regexp.split(text)
# If our regexp matches tokens, use re.findall:
else:
            return self._regexp.findall(text)

    def span_tokenize(self, text):
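        r"""
        Return an iterator over the ``(start, end)`` offsets of the
        tokens in *text*.

            >>> list(RegexpTokenizer(r'\w+').span_tokenize("A b  c"))
            [(0, 1), (2, 3), (5, 6)]
        """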
self._check_regexp()
if self._gaps:
for left, right in regexp_span_tokenize(text, self._regexp):
if not (self._discard_empty and left == right):
yield left, right
else:
for m in re.finditer(self._regexp, text):
                yield m.span()

    def __repr__(self):
return "{}(pattern={!r}, gaps={!r}, discard_empty={!r}, flags={!r})".format(
self.__class__.__name__,
self._pattern,
self._gaps,
self._discard_empty,
self._flags,
        )


class WhitespaceTokenizer(RegexpTokenizer):
r"""
Tokenize a string on whitespace (space, tab, newline).
    In general, users should use the string ``split()`` method instead.

        >>> from nltk.tokenize import WhitespaceTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> WhitespaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
"""
def __init__(self):
        RegexpTokenizer.__init__(self, r"\s+", gaps=True)


class BlanklineTokenizer(RegexpTokenizer):
"""
Tokenize a string, treating any sequence of blank lines as a delimiter.
Blank lines are defined as lines containing no characters, except for
space or tab characters.
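
    For example, a line that contains only spaces or tabs still counts
    as a blank line:

        >>> from nltk.tokenize import BlanklineTokenizer
        >>> BlanklineTokenizer().tokenize("First paragraph.\n   \nSecond paragraph.")
        ['First paragraph.', 'Second paragraph.']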
"""
def __init__(self):
        RegexpTokenizer.__init__(self, r"\s*\n\s*\n\s*", gaps=True)


class WordPunctTokenizer(RegexpTokenizer):
r"""
Tokenize a text into a sequence of alphabetic and
    non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``.

        >>> from nltk.tokenize import WordPunctTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> WordPunctTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
'.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
"""
def __init__(self):
        RegexpTokenizer.__init__(self, r"\w+|[^\w\s]+")


######################################################################
# { Tokenization Functions
######################################################################


def regexp_tokenize(
text,
pattern,
gaps=False,
discard_empty=True,
flags=re.UNICODE | re.MULTILINE | re.DOTALL,
):
"""
Return a tokenized copy of *text*. See :class:`.RegexpTokenizer`
for descriptions of the arguments.
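
    For example:

        >>> regexp_tokenize("Good muffins cost $3.88.", pattern=r"[a-zA-Z]+")
        ['Good', 'muffins', 'cost']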
"""
tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags)
    return tokenizer.tokenize(text)


blankline_tokenize = BlanklineTokenizer().tokenize
wordpunct_tokenize = WordPunctTokenizer().tokenize