Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
""" | |
emoji.tokenizer | |
~~~~~~~~~~~~~~~ | |
Components for detecting and tokenizing emoji in strings. | |
""" | |
from typing import NamedTuple, Dict, Union, Iterator, Any | |
from emoji import unicode_codes | |
__all__ = [ | |
'EmojiMatch', 'EmojiMatchZWJ', 'EmojiMatchZWJNonRGI', 'Token', | |
'tokenize', 'filter_tokens', | |
] | |
_ZWJ = '\u200D' | |
_SEARCH_TREE = None | |
class EmojiMatch: | |
""" | |
Represents a match of a "recommended for general interchange" (RGI) | |
emoji in a string. | |
""" | |
__slots__ = ('emoji', 'start', 'end', 'data') | |
def __init__(self, emoji: str, start: int, | |
end: int, data: Union[dict, None]): | |
self.emoji = emoji | |
"""The emoji substring""" | |
self.start = start | |
"""The start index of the match in the string""" | |
self.end = end | |
"""The end index of the match in the string""" | |
self.data = data | |
"""The entry from :data:`EMOJI_DATA` for this emoji or ``None`` if the emoji is non-RGI""" | |
def data_copy(self) -> Dict[str, Any]: | |
""" | |
Returns a copy of the data from :data:`EMOJI_DATA` for this match | |
with the additional keys ``match_start`` and ``match_end``. | |
""" | |
if self.data: | |
emj_data = self.data.copy() | |
emj_data['match_start'] = self.start | |
emj_data['match_end'] = self.end | |
return emj_data | |
else: | |
return { | |
'match_start': self.start, | |
'match_end': self.end | |
} | |
def is_zwj(self) -> bool: | |
""" | |
Checks if this is a ZWJ-emoji. | |
:returns: True if this is a ZWJ-emoji, False otherwise | |
""" | |
return _ZWJ in self.emoji | |
def split(self) -> Union['EmojiMatchZWJ', 'EmojiMatch']: | |
""" | |
Splits a ZWJ-emoji into its constituents. | |
:returns: An :class:`EmojiMatchZWJ` containing the "sub-emoji" if this is a ZWJ-emoji, otherwise self | |
""" | |
if self.is_zwj(): | |
return EmojiMatchZWJ(self) | |
else: | |
return self | |
def __repr__(self) -> str: | |
return f'{self.__class__.__name__}({self.emoji}, {self.start}:{self.end})' | |
class EmojiMatchZWJ(EmojiMatch): | |
""" | |
Represents a match of multiple emoji in a string that were joined by | |
zero-width-joiners (ZWJ/``\\u200D``).""" | |
__slots__ = ('emojis', ) | |
def __init__(self, match: EmojiMatch): | |
super().__init__(match.emoji, match.start, match.end, match.data) | |
self.emojis = [] | |
"""List of sub emoji as EmojiMatch objects""" | |
i = match.start | |
for e in match.emoji.split(_ZWJ): | |
m = EmojiMatch( | |
e, i, i+len(e), unicode_codes.EMOJI_DATA.get(e, None)) | |
self.emojis.append(m) | |
i += len(e) + 1 | |
def join(self) -> str: | |
""" | |
Joins a ZWJ-emoji into a string | |
""" | |
return _ZWJ.join(e.emoji for e in self.emojis) | |
def is_zwj(self) -> bool: | |
return True | |
def split(self) -> 'EmojiMatchZWJ': | |
return self | |
def __repr__(self) -> str: | |
return f'{self.__class__.__name__}({self.join()}, {self.start}:{self.end})' | |
class EmojiMatchZWJNonRGI(EmojiMatchZWJ): | |
""" | |
Represents a match of multiple emoji in a string that were joined by | |
zero-width-joiners (ZWJ/``\\u200D``). This class is only used for emoji | |
that are not "recommended for general interchange" (non-RGI) by Unicode.org. | |
The data property of this class is always None. | |
""" | |
def __init__(self, first_emoji_match: EmojiMatch, | |
second_emoji_match: EmojiMatch): | |
self.emojis = [first_emoji_match, second_emoji_match] | |
"""List of sub emoji as EmojiMatch objects""" | |
self._update() | |
def _update(self): | |
self.emoji = _ZWJ.join(e.emoji for e in self.emojis) | |
self.start = self.emojis[0].start | |
self.end = self.emojis[-1].end | |
self.data = None | |
def _add(self, next_emoji_match: EmojiMatch): | |
self.emojis.append(next_emoji_match) | |
self._update() | |
class Token(NamedTuple): | |
""" | |
A named tuple containing the matched string and its :class:`EmojiMatch` object if it is an emoji | |
or a single character that is not a unicode emoji. | |
""" | |
chars: str | |
value: Union[str, EmojiMatch] | |
def tokenize(string, keep_zwj: bool) -> Iterator[Token]: | |
""" | |
Finds unicode emoji in a string. Yields all normal characters as a named | |
tuple :class:`Token` ``(char, char)`` and all emoji as :class:`Token` ``(chars, EmojiMatch)``. | |
:param string: String contains unicode characters. MUST BE UNICODE. | |
:param keep_zwj: Should ZWJ-characters (``\\u200D``) that join non-RGI emoji be | |
skipped or should be yielded as normal characters | |
:return: An iterable of tuples :class:`Token` ``(char, char)`` or :class:`Token` ``(chars, EmojiMatch)`` | |
""" | |
tree = get_search_tree() | |
EMOJI_DATA = unicode_codes.EMOJI_DATA | |
# result: [ Token(oldsubstring0, EmojiMatch), Token(char1, char1), ... ] | |
result = [] | |
i = 0 | |
length = len(string) | |
ignore = [] # index of chars in string that are skipped, i.e. the ZWJ-char in non-RGI-ZWJ-sequences | |
while i < length: | |
consumed = False | |
char = string[i] | |
if i in ignore: | |
i += 1 | |
if char == _ZWJ and keep_zwj: | |
result.append(Token(char, char)) | |
continue | |
elif char in tree: | |
j = i + 1 | |
sub_tree = tree[char] | |
while j < length and string[j] in sub_tree: | |
if j in ignore: | |
break | |
sub_tree = sub_tree[string[j]] | |
j += 1 | |
if 'data' in sub_tree: | |
emj_data = sub_tree['data'] | |
code_points = string[i:j] | |
# We cannot yield the result here, we need to defer | |
# the call until we are sure that the emoji is finished | |
# i.e. we're not inside an ongoing ZWJ-sequence | |
match_obj = EmojiMatch(code_points, i, j, emj_data) | |
i = j - 1 | |
consumed = True | |
result.append(Token(code_points, match_obj)) | |
elif char == _ZWJ and result and result[-1].chars in EMOJI_DATA and i > 0 and string[i - 1] in tree: | |
# the current char is ZWJ and the last match was an emoji | |
ignore.append(i) | |
if EMOJI_DATA[result[-1].chars]["status"] == unicode_codes.STATUS["component"]: | |
# last match was a component, it could be ZWJ+EMOJI+COMPONENT | |
# or ZWJ+COMPONENT | |
i = i - sum(len(t.chars) for t in result[-2:]) | |
if string[i] == _ZWJ: | |
# It's ZWJ+COMPONENT, move one back | |
i += 1 | |
del result[-1] | |
else: | |
# It's ZWJ+EMOJI+COMPONENT, move two back | |
del result[-2:] | |
else: | |
# last match result[-1] was a normal emoji, move cursor | |
# before the emoji | |
i = i - len(result[-1].chars) | |
del result[-1] | |
continue | |
elif result: | |
yield from result | |
result = [] | |
if not consumed and char != '\uFE0E' and char != '\uFE0F': | |
result.append(Token(char, char)) | |
i += 1 | |
yield from result | |
def filter_tokens(matches: Iterator[Token], emoji_only: bool, join_emoji: bool) -> Iterator[Token]: | |
""" | |
Filters the output of `tokenize()` | |
:param matches: An iterable of tuples of the form ``(match_str, result)`` | |
where ``result`` is either an EmojiMatch or a string. | |
:param emoji_only: If True, only EmojiMatch are returned in the output. | |
If False all characters are returned | |
:param join_emoji: If True, multiple EmojiMatch are merged into | |
a single :class:`EmojiMatchZWJNonRGI` if they are separated only by a ZWJ. | |
:return: An iterable of tuples :class:`Token` ``(char, char)``, | |
:class:`Token` ``(chars, EmojiMatch)`` or :class:`Token` ``(chars, EmojiMatchZWJNonRGI)`` | |
""" | |
if not join_emoji and not emoji_only: | |
yield from matches | |
return | |
if not join_emoji: | |
for token in matches: | |
if token.chars != _ZWJ: | |
yield token | |
return | |
# Combine multiple EmojiMatch that are separated by ZWJs into | |
# a single EmojiMatchZWJNonRGI | |
previous_is_emoji = False | |
previous_is_zwj = False | |
pre_previous_is_emoji = False | |
accumulator = [] | |
for token in matches: | |
pre_previous_is_emoji = previous_is_emoji | |
if previous_is_emoji and token.value == _ZWJ: | |
previous_is_zwj = True | |
elif isinstance(token.value, EmojiMatch): | |
if pre_previous_is_emoji and previous_is_zwj: | |
if isinstance(accumulator[-1].value, EmojiMatchZWJNonRGI): | |
accumulator[-1].value._add(token.value) | |
accumulator[-1] = Token(accumulator[-1].chars + | |
_ZWJ + token.chars, accumulator[-1].value) | |
else: | |
prev = accumulator.pop() | |
accumulator.append( | |
Token(prev.chars + _ZWJ + token.chars, | |
EmojiMatchZWJNonRGI( | |
prev.value, | |
token.value))) | |
else: | |
accumulator.append(token) | |
previous_is_emoji = True | |
previous_is_zwj = False | |
else: | |
# Other character, not an emoji | |
previous_is_emoji = False | |
previous_is_zwj = False | |
yield from accumulator | |
if not emoji_only: | |
yield token | |
accumulator = [] | |
yield from accumulator | |
def get_search_tree() -> Dict[str, Any]: | |
""" | |
Generate a search tree for demojize(). | |
Example of a search tree:: | |
EMOJI_DATA = | |
{'a': {'en': ':Apple:'}, | |
'b': {'en': ':Bus:'}, | |
'ba': {'en': ':Bat:'}, | |
'band': {'en': ':Beatles:'}, | |
'bandit': {'en': ':Outlaw:'}, | |
'bank': {'en': ':BankOfEngland:'}, | |
'bb': {'en': ':BB-gun:'}, | |
'c': {'en': ':Car:'}} | |
_SEARCH_TREE = | |
{'a': {'data': {'en': ':Apple:'}}, | |
'b': {'a': {'data': {'en': ':Bat:'}, | |
'n': {'d': {'data': {'en': ':Beatles:'}, | |
'i': {'t': {'data': {'en': ':Outlaw:'}}}}, | |
'k': {'data': {'en': ':BankOfEngland:'}}}}, | |
'b': {'data': {'en': ':BB-gun:'}}, | |
'data': {'en': ':Bus:'}}, | |
'c': {'data': {'en': ':Car:'}}} | |
_SEARCH_TREE | |
/ | ⧵ | |
/ | ⧵ | |
a b c | |
| / | ⧵ | | |
| / | ⧵ | | |
:Apple: ba :Bus: bb :Car: | |
/ ⧵ | | |
/ ⧵ | | |
:Bat: ban :BB-gun: | |
/ ⧵ | |
/ ⧵ | |
band bank | |
/ ⧵ | | |
/ ⧵ | | |
bandi :Beatles: :BankOfEngland: | |
| | |
bandit | |
| | |
:Outlaw: | |
""" | |
global _SEARCH_TREE | |
if _SEARCH_TREE is None: | |
_SEARCH_TREE = {} | |
for emj in unicode_codes.EMOJI_DATA: | |
sub_tree = _SEARCH_TREE | |
lastidx = len(emj) - 1 | |
for i, char in enumerate(emj): | |
if char not in sub_tree: | |
sub_tree[char] = {} | |
sub_tree = sub_tree[char] | |
if i == lastidx: | |
sub_tree['data'] = unicode_codes.EMOJI_DATA[emj] | |
return _SEARCH_TREE | |