Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

App Files Files Community

EasyDetect / pipeline /nltk /tokenize /casual.py

sunnychenxiwang

update nltk

d916065 12 months ago

raw

history blame

16.1 kB

	#
	# Natural Language Toolkit: Twitter Tokenizer
	#
	# Copyright (C) 2001-2023 NLTK Project
	# Author: Christopher Potts <[email protected]>
	# Ewan Klein <[email protected]> (modifications)
	# Pierpaolo Pantone <> (modifications)
	# Tom Aarsen <> (modifications)
	# URL: <https://www.nltk.org/>
	# For license information, see LICENSE.TXT
	#


	"""
	Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
	domains and tasks. The basic logic is this:

	1. The tuple REGEXPS defines a list of regular expression
	strings.

	2. The REGEXPS strings are put, in order, into a compiled
	regular expression object called WORD_RE, under the TweetTokenizer
	class.

	3. The tokenization is done by WORD_RE.findall(s), where s is the
	user-supplied string, inside the tokenize() method of the class
	TweetTokenizer.

	4. When instantiating Tokenizer objects, there are several options:
	* preserve_case. By default, it is set to True. If it is set to
	False, then the tokenizer will downcase everything except for
	emoticons.
	* reduce_len. By default, it is set to False. It specifies whether
	to replace repeated character sequences of length 3 or greater
	with sequences of length 3.
	* strip_handles. By default, it is set to False. It specifies
	whether to remove Twitter handles of text used in the
	`tokenize` method.
	* match_phone_numbers. By default, it is set to True. It indicates
	whether the `tokenize` method should look for phone numbers.
	"""


	######################################################################

	import html
	from typing import List

	import regex # https://github.com/nltk/nltk/issues/2409

	from nltk.tokenize.api import TokenizerI

	######################################################################
	# The following strings are components in the regular expression
	# that is used for tokenizing. It's important that phone_number
	# appears first in the final regex (since it can contain whitespace).
	# It also could matter that tags comes after emoticons, due to the
	# possibility of having text like
	#
	# <:\| and some text >:)
	#
	# Most importantly, the final element should always be last, since it
	# does a last ditch whitespace-based tokenization of whatever is left.

	# ToDo: Update with https://en.wikipedia.org/wiki/List_of_emoticons ?

	# This particular element is used in a couple ways, so we define it
	# with a name:
	EMOTICONS = r"""
	(?:
	[<>]?
	[:;=8] # eyes
	[\-o\*\']? # optional nose
	[\)\]\(\[dDpP/\:\}\{@\\|\\] # mouth
	\|
	[\)\]\(\[dDpP/\:\}\{@\\|\\] # mouth
	[\-o\*\']? # optional nose
	[:;=8] # eyes
	[<>]?
	\|
	</?3 # heart
	)"""

	# URL pattern due to John Gruber, modified by Tom Winzig. See
	# https://gist.github.com/winzig/8894715

	URLS = r""" # Capture 1: entire matched URL
	(?:
	https?: # URL protocol and colon
	(?:
	/{1,3} # 1-3 slashes
	\| # or
	[a-z0-9%] # Single letter or digit or '%'
	# (Trying not to match e.g. "URI::Escape")
	)
	\| # or
	# looks like domain name followed by a slash:
	[a-z0-9.\-]+[.]
	(?:[a-z]{2,13})
	/
	)
	(?: # One or more:
	[^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
	\| # or
	$[^\s()]?\([^\s()]+$[^\s()]?\) # balanced parens, one level deep: (...(...)...)
	\|
	$[^\s]+?$ # balanced parens, non-recursive: (...)
	)+
	(?: # End with:
	$[^\s()]?\([^\s()]+$[^\s()]?\) # balanced parens, one level deep: (...(...)...)
	\|
	$[^\s]+?$ # balanced parens, non-recursive: (...)
	\| # or
	[^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
	)
	\| # OR, the following to match naked domains:
	(?:
	(?<!@) # not preceded by a @, avoid matching foo@_gmail.com_
	[a-z0-9]+
	(?:[.\-][a-z0-9]+)*
	[.]
	(?:[a-z]{2,13})
	\b
	/?
	(?!@) # not succeeded by a @,
	# avoid matching "foo.na" in "[email protected]"
	)
	"""

	# emoji flag sequence
	# https://en.wikipedia.org/wiki/Regional_indicator_symbol
	# For regex simplicity, include all possible enclosed letter pairs,
	# not the ISO subset of two-letter regional indicator symbols.
	# See https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Current_codes
	# Future regional flag support may be handled with the regex for
	# U+1F3F4 🏴 followed by emoji tag sequences:
	# r'\U0001F3F4[\U000E0000-\U000E007E]{5}\U000E007F'
	FLAGS = r"""
	(?:
	[\U0001F1E6-\U0001F1FF]{2} # all enclosed letter pairs
	\|
	# English flag
	\U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006e\U000E0067\U000E007F
	\|
	# Scottish flag
	\U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007F
	\|
	# For Wales? Why Richard, it profit a man nothing to give his soul for the whole world … but for Wales!
	\U0001F3F4\U000E0067\U000E0062\U000E0077\U000E006C\U000E0073\U000E007F
	)
	"""

	# Regex for recognizing phone numbers:
	PHONE_REGEX = r"""
	(?:
	(?: # (international)
	\+?[01]
	[ \-.\)]
	)?
	(?: # (area code)
	[\(]?
	\d{3}
	[ \-.\)]
	)?
	\d{3} # exchange
	[ \-.\)]
	\d{4} # base
	)"""

	# The components of the tokenizer:
	REGEXPS = (
	URLS,
	# ASCII Emoticons
	EMOTICONS,
	# HTML tags:
	r"""<[^>\s]+>""",
	# ASCII Arrows
	r"""[\-]+>\|<[\-]+""",
	# Twitter username:
	r"""(?:@[\w_]+)""",
	# Twitter hashtags:
	r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
	# email addresses
	r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
	# Zero-Width-Joiner and Skin tone modifier emojis
	""".(?:
	[\U0001F3FB-\U0001F3FF]?(?:\u200d.[\U0001F3FB-\U0001F3FF]?)+
	\|
	[\U0001F3FB-\U0001F3FF]
	)""",
	# flags
	FLAGS,
	# Remaining word types:
	r"""
	(?:[^\W\d_](?:[^\W\d_]\|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
	\|
	(?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals.
	\|
	(?:[\w_]+) # Words without apostrophes or dashes.
	\|
	(?:\.(?:\s*\.){1,}) # Ellipsis dots.
	\|
	(?:\S) # Everything else that isn't whitespace.
	""",
	)

	# Take the main components and add a phone regex as the second parameter
	REGEXPS_PHONE = (REGEXPS[0], PHONE_REGEX, *REGEXPS[1:])

	######################################################################
	# TweetTokenizer.WORD_RE and TweetTokenizer.PHONE_WORD_RE represent
	# the core tokenizing regexes. They are compiled lazily.

	# WORD_RE performs poorly on these patterns:
	HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")

	# The emoticon string gets its own regex so that we can preserve case for
	# them as needed:
	EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE \| regex.I \| regex.UNICODE)

	# These are for regularizing HTML entities to Unicode:
	ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")

	# For stripping away handles from a tweet:
	HANDLES_RE = regex.compile(
	r"(?<![A-Za-z0-9_!@#\$%&*])@"
	r"(([A-Za-z0-9_]){15}(?!@)\|([A-Za-z0-9_]){1,14}(?![A-Za-z0-9_]*@))"
	)


	######################################################################
	# Functions for converting html entities
	######################################################################


	def _str_to_unicode(text, encoding=None, errors="strict"):
	if encoding is None:
	encoding = "utf-8"
	if isinstance(text, bytes):
	return text.decode(encoding, errors)
	return text


	def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
	"""
	Remove entities from text by converting them to their
	corresponding unicode character.

	:param text: a unicode string or a byte string encoded in the given
	`encoding` (which defaults to 'utf-8').

	:param list keep: list of entity names which should not be replaced.\
	This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
	and named entities (such as `` `` or ``>``).

	:param bool remove_illegal: If `True`, entities that can't be converted are\
	removed. Otherwise, entities that can't be converted are kept "as
	is".

	:returns: A unicode string with the entities removed.

	See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

	>>> from nltk.tokenize.casual import _replace_html_entities
	>>> _replace_html_entities(b'Price: £100')
	'Price: \\xa3100'
	>>> print(_replace_html_entities(b'Price: £100'))
	Price: £100
	>>>
	"""

	def _convert_entity(match):
	entity_body = match.group(3)
	if match.group(1):
	try:
	if match.group(2):
	number = int(entity_body, 16)
	else:
	number = int(entity_body, 10)
	# Numeric character references in the 80-9F range are typically
	# interpreted by browsers as representing the characters mapped
	# to bytes 80-9F in the Windows-1252 encoding. For more info
	# see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
	if 0x80 <= number <= 0x9F:
	return bytes((number,)).decode("cp1252")
	except ValueError:
	number = None
	else:
	if entity_body in keep:
	return match.group(0)
	number = html.entities.name2codepoint.get(entity_body)
	if number is not None:
	try:
	return chr(number)
	except (ValueError, OverflowError):
	pass

	return "" if remove_illegal else match.group(0)

	return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))


	######################################################################


	class TweetTokenizer(TokenizerI):
	r"""
	Tokenizer for tweets.

	>>> from nltk.tokenize import TweetTokenizer
	>>> tknzr = TweetTokenizer()
	>>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
	>>> tknzr.tokenize(s0) # doctest: +NORMALIZE_WHITESPACE
	['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->',
	'<--']

	Examples using `strip_handles` and `reduce_len parameters`:

	>>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
	>>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
	>>> tknzr.tokenize(s1)
	[':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
	"""

	# Values used to lazily compile WORD_RE and PHONE_WORD_RE,
	# which are the core tokenizing regexes.
	_WORD_RE = None
	_PHONE_WORD_RE = None

	######################################################################

	def __init__(
	self,
	preserve_case=True,
	reduce_len=False,
	strip_handles=False,
	match_phone_numbers=True,
	):
	"""
	Create a `TweetTokenizer` instance with settings for use in the `tokenize` method.

	:param preserve_case: Flag indicating whether to preserve the casing (capitalisation)
	of text used in the `tokenize` method. Defaults to True.
	:type preserve_case: bool
	:param reduce_len: Flag indicating whether to replace repeated character sequences
	of length 3 or greater with sequences of length 3. Defaults to False.
	:type reduce_len: bool
	:param strip_handles: Flag indicating whether to remove Twitter handles of text used
	in the `tokenize` method. Defaults to False.
	:type strip_handles: bool
	:param match_phone_numbers: Flag indicating whether the `tokenize` method should look
	for phone numbers. Defaults to True.
	:type match_phone_numbers: bool
	"""
	self.preserve_case = preserve_case
	self.reduce_len = reduce_len
	self.strip_handles = strip_handles
	self.match_phone_numbers = match_phone_numbers

	def tokenize(self, text: str) -> List[str]:
	"""Tokenize the input text.

	:param text: str
	:rtype: list(str)
	:return: a tokenized list of strings; joining this list returns\
	the original string if `preserve_case=False`.
	"""
	# Fix HTML character entities:
	text = _replace_html_entities(text)
	# Remove username handles
	if self.strip_handles:
	text = remove_handles(text)
	# Normalize word lengthening
	if self.reduce_len:
	text = reduce_lengthening(text)
	# Shorten problematic sequences of characters
	safe_text = HANG_RE.sub(r"\1\1\1", text)
	# Recognise phone numbers during tokenization
	if self.match_phone_numbers:
	words = self.PHONE_WORD_RE.findall(safe_text)
	else:
	words = self.WORD_RE.findall(safe_text)
	# Possibly alter the case, but avoid changing emoticons like :D into :d:
	if not self.preserve_case:
	words = list(
	map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
	)
	return words

	@property
	def WORD_RE(self) -> "regex.Pattern":
	"""Core TweetTokenizer regex"""
	# Compiles the regex for this and all future instantiations of TweetTokenizer.
	if not type(self)._WORD_RE:
	type(self)._WORD_RE = regex.compile(
	f"({'\|'.join(REGEXPS)})",
	regex.VERBOSE \| regex.I \| regex.UNICODE,
	)
	return type(self)._WORD_RE

	@property
	def PHONE_WORD_RE(self) -> "regex.Pattern":
	"""Secondary core TweetTokenizer regex"""
	# Compiles the regex for this and all future instantiations of TweetTokenizer.
	if not type(self)._PHONE_WORD_RE:
	type(self)._PHONE_WORD_RE = regex.compile(
	f"({'\|'.join(REGEXPS_PHONE)})",
	regex.VERBOSE \| regex.I \| regex.UNICODE,
	)
	return type(self)._PHONE_WORD_RE


	######################################################################
	# Normalization Functions
	######################################################################


	def reduce_lengthening(text):
	"""
	Replace repeated character sequences of length 3 or greater with sequences
	of length 3.
	"""
	pattern = regex.compile(r"(.)\1{2,}")
	return pattern.sub(r"\1\1\1", text)


	def remove_handles(text):
	"""
	Remove Twitter username handles from text.
	"""
	# Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly
	return HANDLES_RE.sub(" ", text)


	######################################################################
	# Tokenization Function
	######################################################################


	def casual_tokenize(
	text,
	preserve_case=True,
	reduce_len=False,
	strip_handles=False,
	match_phone_numbers=True,
	):
	"""
	Convenience function for wrapping the tokenizer.
	"""
	return TweetTokenizer(
	preserve_case=preserve_case,
	reduce_len=reduce_len,
	strip_handles=strip_handles,
	match_phone_numbers=match_phone_numbers,
	).tokenize(text)


	###############################################################################