#
# Natural Language Toolkit: Twitter Tokenizer
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Christopher Potts <[email protected]>
#         Ewan Klein <[email protected]> (modifications)
#         Pierpaolo Pantone <> (modifications)
#         Tom Aarsen <> (modifications)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#


"""

Twitter-aware tokenizer, designed to be flexible and easy to adapt to new

domains and tasks. The basic logic is this:



1. The tuple REGEXPS defines a list of regular expression

   strings.



2. The REGEXPS strings are put, in order, into a compiled

   regular expression object called WORD_RE, under the TweetTokenizer

   class.



3. The tokenization is done by WORD_RE.findall(s), where s is the

   user-supplied string, inside the tokenize() method of the class

   TweetTokenizer.



4. When instantiating Tokenizer objects, there are several options:

    * preserve_case. By default, it is set to True. If it is set to

      False, then the tokenizer will downcase everything except for

      emoticons.

    * reduce_len. By default, it is set to False. It specifies whether

      to replace repeated character sequences of length 3 or greater

      with sequences of length 3.

    * strip_handles. By default, it is set to False. It specifies

      whether to remove Twitter handles of text used in the

      `tokenize` method.

    * match_phone_numbers. By default, it is set to True. It indicates

      whether the `tokenize` method should look for phone numbers.

"""


######################################################################

import html
from typing import List

import regex  # https://github.com/nltk/nltk/issues/2409

from nltk.tokenize.api import TokenizerI

######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
# appears early in the final regex (right after the URL pattern),
# since it can contain whitespace.
# It also could matter that tags comes after emoticons, due to the
# possibility of having text like
#
#     <:| and some text >:)
#
# Most importantly, the final element should always be last, since it
# does a last ditch whitespace-based tokenization of whatever is left.

# ToDo: Update with https://en.wikipedia.org/wiki/List_of_emoticons ?

# This particular element is used in a couple ways, so we define it
# with a name:
EMOTICONS = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      </?3                       # heart
    )"""

# URL pattern due to John Gruber, modified by Tom Winzig. See
# https://gist.github.com/winzig/8894715

URLS = r"""			# Capture 1: entire matched URL
  (?:
  https?:				# URL protocol and colon
    (?:
      /{1,3}				# 1-3 slashes
      |					#   or
      [a-z0-9%]				# Single letter or digit or '%'
                                       # (Trying not to match e.g. "URI::Escape")
    )
    |					#   or
                                       # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:					# One or more:
    [^\s()<>{}\[\]]+			# Run of non-space, non-()<>{}[]
    |					#   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
  )+
  (?:					# End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
    |					#   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]	# not a space or one of these punct chars
  )
  |					# OR, the following to match naked domains:
  (?:
  	(?<!@)			        # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)			        # not succeeded by a @,
                            # avoid matching "foo.na" in "[email protected]"
  )
"""

# emoji flag sequence
# https://en.wikipedia.org/wiki/Regional_indicator_symbol
# For regex simplicity, include all possible enclosed letter pairs,
# not the ISO subset of two-letter regional indicator symbols.
# See https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Current_codes
# Future regional flag support may be handled with the regex for
# U+1F3F4 🏴 followed by emoji tag sequences:
# r'\U0001F3F4[\U000E0000-\U000E007E]{5}\U000E007F'
FLAGS = r"""
  (?:
    [\U0001F1E6-\U0001F1FF]{2}  # all enclosed letter pairs
    |
    # English flag
    \U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006e\U000E0067\U000E007F
    |
    # Scottish flag
    \U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007F
    |
    # For Wales? Why Richard, it profit a man nothing to give his soul for the whole world … but for Wales!
    \U0001F3F4\U000E0067\U000E0062\U000E0077\U000E006C\U000E0073\U000E007F
  )
"""

# Regex for recognizing phone numbers:
PHONE_REGEX = r"""
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )"""

# The components of the tokenizer:
REGEXPS = (
    URLS,
    # ASCII Emoticons
    EMOTICONS,
    # HTML tags:
    r"""<[^>\s]+>""",
    # ASCII Arrows
    r"""[\-]+>|<[\-]+""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # email addresses
    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
    # Zero-Width-Joiner and Skin tone modifier emojis
    """.(?:

        [\U0001F3FB-\U0001F3FF]?(?:\u200d.[\U0001F3FB-\U0001F3FF]?)+

        |

        [\U0001F3FB-\U0001F3FF]

    )""",
    # flags
    FLAGS,
    # Remaining word types:
    r"""

    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.

    |

    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.

    |

    (?:[\w_]+)                     # Words without apostrophes or dashes.

    |

    (?:\.(?:\s*\.){1,})            # Ellipsis dots.

    |

    (?:\S)                         # Everything else that isn't whitespace.

    """,
)

# Take the main components and insert the phone regex as the second element
REGEXPS_PHONE = (REGEXPS[0], PHONE_REGEX, *REGEXPS[1:])

######################################################################
# TweetTokenizer.WORD_RE and TweetTokenizer.PHONE_WORD_RE represent
# the core tokenizing regexes. They are compiled lazily.

# WORD_RE performs poorly on these patterns:
HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")

# The emoticon string gets its own regex so that we can preserve case for
# them as needed:
EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)

# These are for regularizing HTML entities to Unicode:
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")

# For stripping away handles from a tweet:
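# The "@" must not be preceded by a word character or by one of !@#$%&* (so the
# "@" in an e-mail address such as "[email protected]" is not treated as a handle);
# the handle body is at most 15 word characters (Twitter's limit), and a run
# that is immediately followed by another "@" (e.g. "@foo@bar") is left alone.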
HANDLES_RE = regex.compile(
    r"(?<![A-Za-z0-9_!@#\$%&*])@"
    r"(([A-Za-z0-9_]){15}(?!@)|([A-Za-z0-9_]){1,14}(?![A-Za-z0-9_]*@))"
)


######################################################################
# Functions for converting html entities
######################################################################


def _str_to_unicode(text, encoding=None, errors="strict"):
    if encoding is None:
        encoding = "utf-8"
    if isinstance(text, bytes):
        return text.decode(encoding, errors)
    return text


def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
    """

    Remove entities from text by converting them to their

    corresponding unicode character.



    :param text: a unicode string or a byte string encoded in the given

    `encoding` (which defaults to 'utf-8').



    :param list keep:  list of entity names which should not be replaced.\

    This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)

    and named entities (such as ``&nbsp;`` or ``&gt;``).



    :param bool remove_illegal: If `True`, entities that can't be converted are\

    removed. Otherwise, entities that can't be converted are kept "as

    is".



    :returns: A unicode string with the entities removed.



    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py



        >>> from nltk.tokenize.casual import _replace_html_entities

        >>> _replace_html_entities(b'Price: &pound;100')

        'Price: \\xa3100'

        >>> print(_replace_html_entities(b'Price: &pound;100'))

        Price: Β£100

        >>>

    """

    def _convert_entity(match):
        entity_body = match.group(3)
        if match.group(1):
            try:
                if match.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 80-9F range are typically
                # interpreted by browsers as representing the characters mapped
                # to bytes 80-9F in the Windows-1252 encoding. For more info
                # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
                if 0x80 <= number <= 0x9F:
                    return bytes((number,)).decode("cp1252")
            except ValueError:
                number = None
        else:
            if entity_body in keep:
                return match.group(0)
            number = html.entities.name2codepoint.get(entity_body)
        if number is not None:
            try:
                return chr(number)
            except (ValueError, OverflowError):
                pass

        return "" if remove_illegal else match.group(0)

    return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))


######################################################################


class TweetTokenizer(TokenizerI):
    r"""

    Tokenizer for tweets.



        >>> from nltk.tokenize import TweetTokenizer

        >>> tknzr = TweetTokenizer()

        >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"

        >>> tknzr.tokenize(s0) # doctest: +NORMALIZE_WHITESPACE

        ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->',

         '<--']



    Examples using `strip_handles` and `reduce_len parameters`:



        >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

        >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'

        >>> tknzr.tokenize(s1)

        [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']

    """

    # Values used to lazily compile WORD_RE and PHONE_WORD_RE,
    # which are the core tokenizing regexes.
    _WORD_RE = None
    _PHONE_WORD_RE = None

    ######################################################################

    def __init__(
        self,
        preserve_case=True,
        reduce_len=False,
        strip_handles=False,
        match_phone_numbers=True,
    ):
        """

        Create a `TweetTokenizer` instance with settings for use in the `tokenize` method.



        :param preserve_case: Flag indicating whether to preserve the casing (capitalisation)

            of text used in the `tokenize` method. Defaults to True.

        :type preserve_case: bool

        :param reduce_len: Flag indicating whether to replace repeated character sequences

            of length 3 or greater with sequences of length 3. Defaults to False.

        :type reduce_len: bool

        :param strip_handles: Flag indicating whether to remove Twitter handles of text used

            in the `tokenize` method. Defaults to False.

        :type strip_handles: bool

        :param match_phone_numbers: Flag indicating whether the `tokenize` method should look

            for phone numbers. Defaults to True.

        :type match_phone_numbers: bool

        """
        self.preserve_case = preserve_case
        self.reduce_len = reduce_len
        self.strip_handles = strip_handles
        self.match_phone_numbers = match_phone_numbers

    def tokenize(self, text: str) -> List[str]:
        """Tokenize the input text.



        :param text: str

        :rtype: list(str)

        :return: a tokenized list of strings; joining this list returns\

        the original string if `preserve_case=False`.

        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove username handles
        if self.strip_handles:
            text = remove_handles(text)
        # Normalize word lengthening
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Shorten problematic sequences of characters
        safe_text = HANG_RE.sub(r"\1\1\1", text)
        # Recognise phone numbers during tokenization
        if self.match_phone_numbers:
            words = self.PHONE_WORD_RE.findall(safe_text)
        else:
            words = self.WORD_RE.findall(safe_text)
        # Possibly alter the case, but avoid changing emoticons like :D into :d:
        if not self.preserve_case:
            words = list(
                map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
            )
        return words

    @property
    def WORD_RE(self) -> "regex.Pattern":
        """Core TweetTokenizer regex"""
        # Compiles the regex for this and all future instantiations of TweetTokenizer.
        if not type(self)._WORD_RE:
            type(self)._WORD_RE = regex.compile(
                f"({'|'.join(REGEXPS)})",
                regex.VERBOSE | regex.I | regex.UNICODE,
            )
        return type(self)._WORD_RE

    @property
    def PHONE_WORD_RE(self) -> "regex.Pattern":
        """Secondary core TweetTokenizer regex"""
        # Compiles the regex for this and all future instantiations of TweetTokenizer.
        if not type(self)._PHONE_WORD_RE:
            type(self)._PHONE_WORD_RE = regex.compile(
                f"({'|'.join(REGEXPS_PHONE)})",
                regex.VERBOSE | regex.I | regex.UNICODE,
            )
        return type(self)._PHONE_WORD_RE
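
# Note that because the compiled patterns are cached on the class, the first
# tokenize() call pays the compilation cost and every later instance reuses the
# same pattern object, e.g. (illustrative):
#
#     >>> TweetTokenizer().WORD_RE is TweetTokenizer().WORD_RE
#     True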


######################################################################
# Normalization Functions
######################################################################


def reduce_lengthening(text):
    """

    Replace repeated character sequences of length 3 or greater with sequences

    of length 3.

    """
    pattern = regex.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1\1", text)
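
# For example (illustrative):
#
#     >>> reduce_lengthening("waaaaayyyy")
#     'waaayyy'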


def remove_handles(text):
    """

    Remove Twitter username handles from text.

    """
    # Substitute handles with ' ' to ensure that text on either side of removed handles is tokenized correctly
    return HANDLES_RE.sub(" ", text)
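
# For example (illustrative):
#
#     >>> remove_handles("@remy: This is waaaaayyyy too much for you!!!!!!")
#     ' : This is waaaaayyyy too much for you!!!!!!'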


######################################################################
# Tokenization Function
######################################################################


def casual_tokenize(
    text,
    preserve_case=True,
    reduce_len=False,
    strip_handles=False,
    match_phone_numbers=True,
):
    """
    Convenience function for wrapping the tokenizer.
    """
    return TweetTokenizer(
        preserve_case=preserve_case,
        reduce_len=reduce_len,
        strip_handles=strip_handles,
        match_phone_numbers=match_phone_numbers,
    ).tokenize(text)
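
# For example (illustrative), the convenience wrapper behaves like a one-off
# TweetTokenizer:
#
#     >>> casual_tokenize("@remy: This is waaaaayyyy too much!!",
#     ...                 strip_handles=True, reduce_len=True)
#     [':', 'This', 'is', 'waaayyy', 'too', 'much', '!', '!']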


###############################################################################