# Pendrokar's picture
# DeepMoji xVASynth Plugin
# e1c08c5
# raw
# history blame
# 11.9 kB
"""
emoji.tokenizer
~~~~~~~~~~~~~~~
Components for detecting and tokenizing emoji in strings.
"""
from typing import NamedTuple, Dict, Union, Iterator, Any
from emoji import unicode_codes
# Public names of this module.
__all__ = [
'EmojiMatch', 'EmojiMatchZWJ', 'EmojiMatchZWJNonRGI', 'Token',
'tokenize', 'filter_tokens',
]
# The zero-width joiner character (U+200D) that glues emoji into ZWJ sequences.
_ZWJ = '\u200D'
# Cache for the emoji search tree; stays None until get_search_tree() builds it.
_SEARCH_TREE = None
class EmojiMatch:
    """
    A single "recommended for general interchange" (RGI) emoji that was
    found in a string.

    Attributes:

    * ``emoji`` -- the emoji substring
    * ``start`` -- the start index of the match in the string
    * ``end`` -- the end index of the match in the string
    * ``data`` -- the entry from :data:`EMOJI_DATA` for this emoji or
      ``None`` if the emoji is non-RGI
    """

    __slots__ = ('emoji', 'start', 'end', 'data')

    def __init__(self, emoji: str, start: int,
                 end: int, data: Union[dict, None]):
        self.emoji = emoji
        self.start = start
        self.end = end
        self.data = data

    def data_copy(self) -> Dict[str, Any]:
        """
        Build a new dict from the :data:`EMOJI_DATA` entry of this match,
        extended with the keys ``match_start`` and ``match_end``.

        When there is no data (``None`` or empty) the result only holds
        the two position keys.
        """
        position = {
            'match_start': self.start,
            'match_end': self.end
        }
        if not self.data:
            return position
        combined = self.data.copy()
        combined.update(position)
        return combined

    def is_zwj(self) -> bool:
        """
        Tells whether the matched emoji contains a zero-width joiner.

        :returns: True if this is a ZWJ-emoji, False otherwise
        """
        return _ZWJ in self.emoji

    def split(self) -> Union['EmojiMatchZWJ', 'EmojiMatch']:
        """
        Breaks a ZWJ-emoji up into the emoji it is made of.

        :returns: An :class:`EmojiMatchZWJ` containing the "sub-emoji" if this is a ZWJ-emoji, otherwise self
        """
        return EmojiMatchZWJ(self) if self.is_zwj() else self

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}({self.emoji}, {self.start}:{self.end})'
class EmojiMatchZWJ(EmojiMatch):
    """
    A match that consists of several emoji glued together with
    zero-width-joiners (ZWJ/``\\u200D``).

    ``emojis`` holds the sub emoji as a list of :class:`EmojiMatch` objects.
    """

    __slots__ = ('emojis', )

    def __init__(self, match: EmojiMatch):
        super().__init__(match.emoji, match.start, match.end, match.data)
        self.emojis = []
        offset = match.start
        for part in match.emoji.split(_ZWJ):
            part_end = offset + len(part)
            part_data = unicode_codes.EMOJI_DATA.get(part, None)
            self.emojis.append(EmojiMatch(part, offset, part_end, part_data))
            # Step over the joiner character that followed this part.
            offset = part_end + 1

    def join(self) -> str:
        """
        Puts the sub emoji back together, separated by ZWJ characters.
        """
        return _ZWJ.join(sub.emoji for sub in self.emojis)

    def is_zwj(self) -> bool:
        return True

    def split(self) -> 'EmojiMatchZWJ':
        return self

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}({self.join()}, {self.start}:{self.end})'
class EmojiMatchZWJNonRGI(EmojiMatchZWJ):
    """
    A match of several emoji joined by zero-width-joiners
    (ZWJ/``\\u200D``) where the combination is not "recommended for
    general interchange" (non-RGI) by Unicode.org.

    ``emojis`` holds the sub emoji as a list of :class:`EmojiMatch` objects.
    The data property of this class is always None.
    """

    def __init__(self, first_emoji_match: EmojiMatch,
                 second_emoji_match: EmojiMatch):
        self.emojis = [first_emoji_match, second_emoji_match]
        self._update()

    def _update(self):
        # Recompute the combined text and span from the current sub emoji.
        parts = self.emojis
        self.emoji = _ZWJ.join(part.emoji for part in parts)
        self.start = parts[0].start
        self.end = parts[-1].end
        self.data = None

    def _add(self, next_emoji_match: EmojiMatch):
        # Extend the sequence by one more sub emoji and refresh the span.
        self.emojis.append(next_emoji_match)
        self._update()
class Token(NamedTuple):
    """
    One unit of tokenizer output.

    ``chars`` is the matched piece of the string. ``value`` is the
    :class:`EmojiMatch` object when that piece is an emoji; for a single
    character that is not a unicode emoji it is the character itself.
    """

    chars: str
    value: Union[str, EmojiMatch]
def tokenize(string: str, keep_zwj: bool) -> Iterator[Token]:
    """
    Finds unicode emoji in a string. Yields all normal characters as a named
    tuple :class:`Token` ``(char, char)`` and all emoji as :class:`Token` ``(chars, EmojiMatch)``.

    Variation selectors (``\\uFE0E`` / ``\\uFE0F``) that are not the first
    character of a matched emoji are not yielded as separate tokens.

    :param string: String contains unicode characters. MUST BE UNICODE.
    :param keep_zwj: Should ZWJ-characters (``\\u200D``) that join non-RGI emoji be
        skipped or should be yielded as normal characters
    :return: An iterable of tuples :class:`Token` ``(char, char)`` or :class:`Token` ``(chars, EmojiMatch)``
    """
    tree = get_search_tree()
    EMOJI_DATA = unicode_codes.EMOJI_DATA
    # result: [ Token(oldsubstring0, EmojiMatch), Token(char1, char1), ... ]
    # Tokens are buffered here instead of being yielded right away because
    # a following ZWJ can force the most recent tokens to be parsed again.
    result = []
    i = 0
    length = len(string)
    # Indices of chars in string that are skipped, i.e. the ZWJ-char in
    # non-RGI-ZWJ-sequences. Kept as a set because membership is tested for
    # every character of the input.
    ignore = set()
    while i < length:
        consumed = False
        char = string[i]
        if i in ignore:
            i += 1
            if char == _ZWJ and keep_zwj:
                result.append(Token(char, char))
            continue
        elif char in tree:
            # Walk down the search tree as far as the string allows, but
            # never across a ZWJ that was marked as ignored.
            j = i + 1
            sub_tree = tree[char]
            while j < length and string[j] in sub_tree:
                if j in ignore:
                    break
                sub_tree = sub_tree[string[j]]
                j += 1
            if 'data' in sub_tree:
                emj_data = sub_tree['data']
                code_points = string[i:j]
                # We cannot yield the result here, we need to defer
                # the call until we are sure that the emoji is finished
                # i.e. we're not inside an ongoing ZWJ-sequence
                match_obj = EmojiMatch(code_points, i, j, emj_data)
                # The shared ``i += 1`` at the end of the loop moves the
                # cursor to j.
                i = j - 1
                consumed = True
                result.append(Token(code_points, match_obj))
        elif char == _ZWJ and result and result[-1].chars in EMOJI_DATA and i > 0 and string[i - 1] in tree:
            # the current char is ZWJ and the last match was an emoji
            ignore.add(i)
            if EMOJI_DATA[result[-1].chars]["status"] == unicode_codes.STATUS["component"]:
                # last match was a component, it could be ZWJ+EMOJI+COMPONENT
                # or ZWJ+COMPONENT
                i = i - sum(len(t.chars) for t in result[-2:])
                if string[i] == _ZWJ:
                    # It's ZWJ+COMPONENT, move one back
                    i += 1
                    del result[-1]
                else:
                    # It's ZWJ+EMOJI+COMPONENT, move two back
                    del result[-2:]
            else:
                # last match result[-1] was a normal emoji, move cursor
                # before the emoji
                i = i - len(result[-1].chars)
                del result[-1]
            # Parse again from the new cursor position; the walk above now
            # stops in front of the ignored ZWJ.
            continue
        elif result:
            # The current char cannot start or extend an emoji, so the
            # buffered tokens are final.
            yield from result
            result = []
        if not consumed and char != '\uFE0E' and char != '\uFE0F':
            result.append(Token(char, char))
        i += 1
    yield from result
def filter_tokens(matches: Iterator[Token], emoji_only: bool, join_emoji: bool) -> Iterator[Token]:
    """
    Filters the output of `tokenize()`

    :param matches: An iterable of tuples of the form ``(match_str, result)``
        where ``result`` is either an EmojiMatch or a string.
    :param emoji_only: If True, only EmojiMatch are returned in the output.
        If False all characters are returned
    :param join_emoji: If True, multiple EmojiMatch are merged into
        a single :class:`EmojiMatchZWJNonRGI` if they are separated only by a ZWJ.
    :return: An iterable of tuples :class:`Token` ``(char, char)``,
        :class:`Token` ``(chars, EmojiMatch)`` or :class:`Token` ``(chars, EmojiMatchZWJNonRGI)``
    """
    if not join_emoji:
        if not emoji_only:
            # Nothing to filter and nothing to join.
            yield from matches
            return
        # Only emoji are wanted: every plain character token, including
        # ZWJ characters, carries a str value and is dropped here.
        for token in matches:
            if isinstance(token.value, EmojiMatch):
                yield token
        return

    # Combine multiple EmojiMatch that are separated by ZWJs into
    # a single EmojiMatchZWJNonRGI
    previous_is_emoji = False
    previous_is_zwj = False
    # Emoji tokens seen since the last non-emoji character; the last entry
    # may still grow when another "ZWJ + emoji" follows.
    accumulator = []
    for token in matches:
        if previous_is_emoji and token.value == _ZWJ:
            previous_is_zwj = True
        elif isinstance(token.value, EmojiMatch):
            if previous_is_emoji and previous_is_zwj:
                if isinstance(accumulator[-1].value, EmojiMatchZWJNonRGI):
                    # Already a joined sequence, extend it by one emoji.
                    accumulator[-1].value._add(token.value)
                    accumulator[-1] = Token(accumulator[-1].chars +
                                            _ZWJ + token.chars, accumulator[-1].value)
                else:
                    # First join: replace the previous emoji with a new
                    # sequence made of it and the current emoji.
                    prev = accumulator.pop()
                    accumulator.append(
                        Token(prev.chars + _ZWJ + token.chars,
                              EmojiMatchZWJNonRGI(
                                  prev.value,
                                  token.value)))
            else:
                accumulator.append(token)
            previous_is_emoji = True
            previous_is_zwj = False
        else:
            # Other character, not an emoji
            previous_is_emoji = False
            previous_is_zwj = False
            yield from accumulator
            if not emoji_only:
                yield token
            accumulator = []
    yield from accumulator
def get_search_tree() -> Dict[str, Any]:
    """
    Generate a search tree for demojize().

    The tree is built from :data:`EMOJI_DATA` the first time this function
    is called and stored in ``_SEARCH_TREE``; later calls return the stored
    tree. Every character of an emoji is one level of nested dicts, and the
    node reached after the last character holds the emoji's entry under the
    key ``'data'``.

    Example of a search tree::

        EMOJI_DATA =
            {'a': {'en': ':Apple:'},
             'b': {'en': ':Bus:'},
             'ba': {'en': ':Bat:'},
             'band': {'en': ':Beatles:'},
             'bandit': {'en': ':Outlaw:'},
             'bank': {'en': ':BankOfEngland:'},
             'bb': {'en': ':BB-gun:'},
             'c': {'en': ':Car:'}}

        _SEARCH_TREE =
            {'a': {'data': {'en': ':Apple:'}},
             'b': {'a': {'data': {'en': ':Bat:'},
                         'n': {'d': {'data': {'en': ':Beatles:'},
                                     'i': {'t': {'data': {'en': ':Outlaw:'}}}},
                               'k': {'data': {'en': ':BankOfEngland:'}}}},
                   'b': {'data': {'en': ':BB-gun:'}},
                   'data': {'en': ':Bus:'}},
             'c': {'data': {'en': ':Car:'}}}
    """
    global _SEARCH_TREE
    if _SEARCH_TREE is not None:
        return _SEARCH_TREE

    _SEARCH_TREE = {}
    for emj, entry in unicode_codes.EMOJI_DATA.items():
        node = _SEARCH_TREE
        for char in emj:
            # Descend one level, creating the child node when it is missing.
            node = node.setdefault(char, {})
        if emj:
            node['data'] = entry
    return _SEARCH_TREE