Spaces:

Pendrokar
/

xVASynth-TTS

Running on CPU Upgrade

App Files Files Community

xVASynth-TTS / resources /app /plugins /deepmoji_plugin /DeepMoji /emoji /tokenizer.py

Pendrokar

DeepMoji xVASynth Plugin

e1c08c5 over 1 year ago

raw

history blame

11.9 kB

	"""
	emoji.tokenizer
	~~~~~~~~~~~~~~~

	Components for detecting and tokenizing emoji in strings.

	"""
	from typing import NamedTuple, Dict, Union, Iterator, Any
	from emoji import unicode_codes


	# Public names of the emoji.tokenizer module (what a star-import exposes).
	__all__ = [
	    'EmojiMatch',
	    'EmojiMatchZWJ',
	    'EmojiMatchZWJNonRGI',
	    'Token',
	    'tokenize',
	    'filter_tokens',
	]

	# Zero-width joiner: the character that glues several emoji into one sequence.
	_ZWJ = '\u200D'

	# Cache for the tree built by get_search_tree(); None until the first call.
	_SEARCH_TREE = None


	class EmojiMatch:
	    """
	    One "recommended for general interchange" (RGI) emoji that was found
	    in a string, together with the position it was found at.
	    """

	    __slots__ = ('emoji', 'start', 'end', 'data')

	    def __init__(self, emoji: str, start: int,
	                 end: int, data: Union[dict, None]):

	        self.emoji = emoji
	        """The matched emoji substring"""

	        self.start = start
	        """Index in the string at which the match begins"""

	        self.end = end
	        """Index in the string at which the match ends"""

	        self.data = data
	        """The :data:`EMOJI_DATA` entry of this emoji, or ``None`` for a non-RGI emoji"""

	    def data_copy(self) -> Dict[str, Any]:
	        """
	        Builds a new dict holding the :data:`EMOJI_DATA` entry of this match
	        extended by the keys ``match_start`` and ``match_end``.

	        When the match carries no data, the dict contains only those two keys.
	        """
	        span = {'match_start': self.start, 'match_end': self.end}
	        if not self.data:
	            return span

	        # Copy first so the shared EMOJI_DATA entry is never modified.
	        enriched = self.data.copy()
	        enriched.update(span)
	        return enriched

	    def is_zwj(self) -> bool:
	        """
	        Tells whether the matched emoji contains a zero-width-joiner.

	        :returns: True if this is a ZWJ-emoji, False otherwise
	        """

	        return _ZWJ in self.emoji

	    def split(self) -> Union['EmojiMatchZWJ', 'EmojiMatch']:
	        """
	        Breaks a ZWJ-emoji up into the emoji it is made of.

	        :returns: An :class:`EmojiMatchZWJ` holding the "sub-emoji" if this is a ZWJ-emoji, otherwise self
	        """

	        return EmojiMatchZWJ(self) if self.is_zwj() else self

	    def __repr__(self) -> str:
	        class_name = self.__class__.__name__
	        return f'{class_name}({self.emoji}, {self.start}:{self.end})'


	class EmojiMatchZWJ(EmojiMatch):
	    """
	    A match that consists of several emoji glued together by
	    zero-width-joiners (ZWJ/``\\u200D``)."""

	    __slots__ = ('emojis', )

	    def __init__(self, match: EmojiMatch):
	        super().__init__(match.emoji, match.start, match.end, match.data)

	        self.emojis = []
	        """List of sub emoji as EmojiMatch objects"""

	        # Walk over the ZWJ-separated parts while tracking where each
	        # part sits in the string the match was taken from.
	        offset = match.start
	        for part in match.emoji.split(_ZWJ):
	            part_end = offset + len(part)
	            sub_match = EmojiMatch(
	                part, offset, part_end, unicode_codes.EMOJI_DATA.get(part))
	            self.emojis.append(sub_match)
	            # +1 steps over the joiner that separated this part from the next
	            offset = part_end + 1

	    def join(self) -> str:
	        """
	        Concatenates the sub emoji, separated by ZWJ, back into one string
	        """

	        return _ZWJ.join(sub.emoji for sub in self.emojis)

	    def is_zwj(self) -> bool:
	        return True

	    def split(self) -> 'EmojiMatchZWJ':
	        return self

	    def __repr__(self) -> str:
	        class_name = self.__class__.__name__
	        return f'{class_name}({self.join()}, {self.start}:{self.end})'


	class EmojiMatchZWJNonRGI(EmojiMatchZWJ):
	    """
	    A match that consists of several emoji glued together by
	    zero-width-joiners (ZWJ/``\\u200D``), used only for sequences that
	    Unicode.org does not list as "recommended for general interchange"
	    (non-RGI). The data property of this class is always None.
	    """

	    def __init__(self, first_emoji_match: EmojiMatch,
	                 second_emoji_match: EmojiMatch):

	        self.emojis = [first_emoji_match, second_emoji_match]
	        """List of sub emoji as EmojiMatch objects"""

	        self._update()

	    def _update(self):
	        # Re-derive every inherited field from the current list of sub emoji.
	        joined = _ZWJ.join(sub.emoji for sub in self.emojis)
	        self.emoji = joined
	        self.start = self.emojis[0].start
	        self.end = self.emojis[-1].end
	        self.data = None

	    def _add(self, next_emoji_match: EmojiMatch):
	        # Extend the sequence by one more emoji and refresh the derived fields.
	        self.emojis.append(next_emoji_match)
	        self._update()


	class Token(NamedTuple):
	    """
	    A named tuple pairing a piece of the input string with what it was
	    recognised as: ``value`` is an :class:`EmojiMatch` object if the piece is
	    an emoji, or the single character itself if it is not a unicode emoji.
	    """
	    chars: str
	    value: Union[str, EmojiMatch]


	def tokenize(string: str, keep_zwj: bool) -> Iterator[Token]:
	    """
	    Finds unicode emoji in a string. Yields all normal characters as a named
	    tuple :class:`Token` ``(char, char)`` and all emoji as :class:`Token` ``(chars, EmojiMatch)``.

	    Tokens are collected in a buffer and only yielded once a character is
	    reached that neither starts an emoji nor is a ZWJ directly following
	    one, because such a ZWJ removes the preceding emoji from the buffer
	    and scans it again.

	    :param string: String contains unicode characters. MUST BE UNICODE.
	    :param keep_zwj: Should ZWJ-characters (``\\u200D``) that join non-RGI emoji be
	        skipped or should be yielded as normal characters
	    :return: An iterable of tuples :class:`Token` ``(char, char)`` or :class:`Token` ``(chars, EmojiMatch)``
	    """

	    tree = get_search_tree()
	    EMOJI_DATA = unicode_codes.EMOJI_DATA
	    # result: [ Token(oldsubstring0, EmojiMatch), Token(char1, char1), ... ]
	    result = []
	    i = 0
	    length = len(string)
	    # Indices of chars in string that are skipped, i.e. the ZWJ-char in
	    # non-RGI-ZWJ-sequences. Only membership is ever tested, and it is
	    # tested for every scanned character, so a set keeps each test O(1).
	    ignore = set()
	    while i < length:
	        consumed = False
	        char = string[i]
	        if i in ignore:
	            # This position was marked as a joiner of a non-RGI sequence:
	            # step over it and emit it only when the caller asked for it
	            i += 1
	            if char == _ZWJ and keep_zwj:
	                result.append(Token(char, char))
	            continue

	        elif char in tree:
	            # Walk down the search tree as far as the string allows, but
	            # never across a position that has been marked as ignored
	            j = i + 1
	            sub_tree = tree[char]
	            while j < length and string[j] in sub_tree:
	                if j in ignore:
	                    break
	                sub_tree = sub_tree[string[j]]
	                j += 1
	            if 'data' in sub_tree:
	                emj_data = sub_tree['data']
	                code_points = string[i:j]

	                # We cannot yield the result here, we need to defer
	                # the call until we are sure that the emoji is finished
	                # i.e. we're not inside an ongoing ZWJ-sequence
	                match_obj = EmojiMatch(code_points, i, j, emj_data)

	                # i is incremented at the bottom of the loop, so park it
	                # on the last character of the match
	                i = j - 1
	                consumed = True
	                result.append(Token(code_points, match_obj))

	        elif char == _ZWJ and result and result[-1].chars in EMOJI_DATA and i > 0 and string[i - 1] in tree:
	            # the current char is ZWJ and the last match was an emoji
	            ignore.add(i)
	            if EMOJI_DATA[result[-1].chars]["status"] == unicode_codes.STATUS["component"]:
	                # last match was a component, it could be ZWJ+EMOJI+COMPONENT
	                # or ZWJ+COMPONENT
	                i = i - sum(len(t.chars) for t in result[-2:])
	                if string[i] == _ZWJ:
	                    # It's ZWJ+COMPONENT, move one back
	                    i += 1
	                    del result[-1]
	                else:
	                    # It's ZWJ+EMOJI+COMPONENT, move two back
	                    del result[-2:]
	            else:
	                # last match result[-1] was a normal emoji, move cursor
	                # before the emoji
	                i = i - len(result[-1].chars)
	                del result[-1]
	            # Scan again from the moved cursor; the tree walk now stops at
	            # the ignored ZWJ
	            continue

	        elif result:
	            # An ordinary character ends any pending sequence, so the
	            # buffered tokens are final
	            yield from result
	            result = []

	        # Characters U+FE0E and U+FE0F that were not consumed as part of an
	        # emoji are dropped; every other unconsumed character becomes a token
	        if not consumed and char != '\uFE0E' and char != '\uFE0F':
	            result.append(Token(char, char))
	        i += 1

	    yield from result


	def filter_tokens(matches: Iterator[Token], emoji_only: bool, join_emoji: bool) -> Iterator[Token]:
	    """
	    Filters the output of `tokenize()`

	    :param matches: An iterable of tuples of the form ``(match_str, result)``
	        where ``result`` is either an EmojiMatch or a string.
	    :param emoji_only: If True, only EmojiMatch are returned in the output.
	        If False all characters are returned
	    :param join_emoji: If True, multiple EmojiMatch are merged into
	        a single :class:`EmojiMatchZWJNonRGI` if they are separated only by a ZWJ.

	    :return: An iterable of tuples :class:`Token` ``(char, char)``,
	        :class:`Token` ``(chars, EmojiMatch)`` or :class:`Token` ``(chars, EmojiMatchZWJNonRGI)``
	    """

	    if not join_emoji:
	        if not emoji_only:
	            # Nothing to filter and nothing to merge
	            yield from matches
	            return

	        # emoji_only without joining: keep nothing but emoji tokens.
	        # Testing the token value (instead of only discarding ZWJ tokens)
	        # is what keeps ordinary characters out of the output.
	        for token in matches:
	            if isinstance(token.value, EmojiMatch):
	                yield token
	        return

	    # Combine multiple EmojiMatch that are separated by ZWJs into
	    # a single EmojiMatchZWJNonRGI
	    previous_is_emoji = False
	    previous_is_zwj = False
	    pre_previous_is_emoji = False
	    # Emoji tokens seen since the last non-emoji character; the last entry
	    # may still grow when another ZWJ+emoji follows
	    accumulator = []
	    for token in matches:
	        pre_previous_is_emoji = previous_is_emoji
	        if previous_is_emoji and token.value == _ZWJ:
	            # A ZWJ right after an emoji: remember it, it is not yielded
	            previous_is_zwj = True
	        elif isinstance(token.value, EmojiMatch):
	            if pre_previous_is_emoji and previous_is_zwj:
	                # emoji + ZWJ + emoji: merge into the last accumulated token
	                if isinstance(accumulator[-1].value, EmojiMatchZWJNonRGI):
	                    # Already a merged sequence, extend it in place
	                    accumulator[-1].value._add(token.value)
	                    accumulator[-1] = Token(accumulator[-1].chars +
	                                            _ZWJ + token.chars, accumulator[-1].value)
	                else:
	                    # First merge: wrap both emoji in a new non-RGI match
	                    prev = accumulator.pop()
	                    accumulator.append(
	                        Token(prev.chars + _ZWJ + token.chars,
	                              EmojiMatchZWJNonRGI(
	                                  prev.value,
	                                  token.value)))
	            else:
	                accumulator.append(token)
	            previous_is_emoji = True
	            previous_is_zwj = False
	        else:
	            # Other character, not an emoji
	            previous_is_emoji = False
	            previous_is_zwj = False
	            # The pending emoji can no longer be extended, emit them
	            # before the character itself
	            yield from accumulator
	            if not emoji_only:
	                yield token
	            accumulator = []
	    yield from accumulator


	def get_search_tree() -> Dict[str, Any]:
	    """
	    Return the search tree used by demojize(), building it from
	    ``EMOJI_DATA`` on the first call and reusing it afterwards.

	    Every key of ``EMOJI_DATA`` becomes a path of nested dicts, one level
	    per character; the entry itself is stored under the key ``'data'`` at
	    the end of its path.
	    Example of a search tree::

	        EMOJI_DATA =
	        {'a': {'en': ':Apple:'},
	         'b': {'en': ':Bus:'},
	         'ba': {'en': ':Bat:'},
	         'band': {'en': ':Beatles:'},
	         'bandit': {'en': ':Outlaw:'},
	         'bank': {'en': ':BankOfEngland:'},
	         'bb': {'en': ':BB-gun:'},
	         'c': {'en': ':Car:'}}

	        _SEARCH_TREE =
	        {'a': {'data': {'en': ':Apple:'}},
	         'b': {'a': {'data': {'en': ':Bat:'},
	                     'n': {'d': {'data': {'en': ':Beatles:'},
	                                 'i': {'t': {'data': {'en': ':Outlaw:'}}}},
	                           'k': {'data': {'en': ':BankOfEngland:'}}}},
	               'b': {'data': {'en': ':BB-gun:'}},
	               'data': {'en': ':Bus:'}},
	         'c': {'data': {'en': ':Car:'}}}

	                       _SEARCH_TREE
	                     /      |       ⧵
	                   /        |         ⧵
	                 a          b           c
	                 |        / |  ⧵        |
	                 |       /  |   ⧵       |
	             :Apple:   ba :Bus:  bb   :Car:
	                      /  ⧵        |
	                     /    ⧵       |
	                 :Bat:    ban  :BB-gun:
	                         /   ⧵
	                        /     ⧵
	                     band     bank
	                    /   ⧵       |
	                   /     ⧵      |
	               bandi :Beatles: :BankOfEngland:
	                 |
	               bandit
	                 |
	              :Outlaw:

	    """
	    global _SEARCH_TREE
	    if _SEARCH_TREE is not None:
	        return _SEARCH_TREE

	    _SEARCH_TREE = {}
	    for sequence, entry in unicode_codes.EMOJI_DATA.items():
	        # Descend from the root, creating missing levels on the way down
	        node = _SEARCH_TREE
	        for code_point in sequence:
	            node = node.setdefault(code_point, {})
	        # An empty key has no last character and therefore stores nothing
	        if sequence:
	            node['data'] = entry
	    return _SEARCH_TREE