# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
#         Steven Bird <[email protected]> (minor additions)
# Contributors: matthewmc, clouds56
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
r"""
NLTK Tokenizer Package

Tokenizers divide strings into lists of substrings. For example,
tokenizers can be used to find the words and punctuation in a string:

>>> from nltk.tokenize import word_tokenize
>>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
... two of them.\n\nThanks.'''
>>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

This particular tokenizer requires the Punkt sentence tokenization
models to be installed. NLTK also provides a simpler,
regular-expression based tokenizer, which splits text on whitespace
and punctuation:

>>> from nltk.tokenize import wordpunct_tokenize
>>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

We can also operate at the level of sentences, using the sentence
tokenizer directly as follows:

>>> from nltk.tokenize import sent_tokenize, word_tokenize
>>> sent_tokenize(s)
['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
>>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
[['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]

Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``).

NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers. (These methods are implemented as generators.)

>>> from nltk.tokenize import WhitespaceTokenizer
>>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE
[(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
(45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
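
Because each span is a ``(start, end)`` pair with the same semantics as a
string slice, the corresponding substrings can be recovered by slicing the
original text, for example:

>>> [s[start:end] for start, end in WhitespaceTokenizer().span_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']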

There are numerous ways to tokenize text. If you need more control over
tokenization, see the other methods provided in this package.
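
For example, a custom ``RegexpTokenizer`` pattern (an illustrative pattern,
not a package default) can keep the currency amount in the text above as a
single token:

>>> from nltk.tokenize import RegexpTokenizer
>>> RegexpTokenizer(r'\w+|\$[\d\.]+|\S+').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']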

For further information, please see Chapter 3 of the NLTK book.
"""

import re

from nltk.data import load
from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
from nltk.tokenize.destructive import NLTKWordTokenizer
from nltk.tokenize.legality_principle import LegalitySyllableTokenizer
from nltk.tokenize.mwe import MWETokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.regexp import (
    BlanklineTokenizer,
    RegexpTokenizer,
    WhitespaceTokenizer,
    WordPunctTokenizer,
    blankline_tokenize,
    regexp_tokenize,
    wordpunct_tokenize,
)
from nltk.tokenize.repp import ReppTokenizer
from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
from nltk.tokenize.simple import (
    LineTokenizer,
    SpaceTokenizer,
    TabTokenizer,
    line_tokenize,
)
from nltk.tokenize.sonority_sequencing import SyllableTokenizer
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


# Standard sentence tokenizer.
def sent_tokenize(text, language="english"):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).
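
    An illustrative example; it assumes the Punkt models have been
    downloaded, e.g. via ``nltk.download("punkt")``:

    >>> sent_tokenize("Good muffins cost $3.88 in New York.  Please buy me two of them.")  # doctest: +SKIP
    ['Good muffins cost $3.88 in New York.', 'Please buy me two of them.']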

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
    """
    tokenizer = load(f"tokenizers/punkt/{language}.pickle")
    return tokenizer.tokenize(text)


# Standard word tokenizer.
_treebank_word_tokenizer = NLTKWordTokenizer()


def word_tokenize(text, language="english", preserve_line=False):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).
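
    An illustrative example; it assumes the Punkt models have been
    downloaded, e.g. via ``nltk.download("punkt")``:

    >>> word_tokenize("They'll save and invest more.")  # doctest: +SKIP
    ['They', "'ll", 'save', 'and', 'invest', 'more', '.']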

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: if ``True``, skip sentence tokenization and
        tokenize *text* as a single line
    :type preserve_line: bool
    """
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [
        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
    ]