File size: 11,876 Bytes
e1c08c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
"""
emoji.tokenizer
~~~~~~~~~~~~~~~

Components for detecting and tokenizing emoji in strings.

"""
from typing import NamedTuple, Dict, Union, Iterator, Any
from emoji import unicode_codes


__all__ = [
    'EmojiMatch', 'EmojiMatchZWJ', 'EmojiMatchZWJNonRGI', 'Token',
    'tokenize', 'filter_tokens',
]

_ZWJ = '\u200D'
_SEARCH_TREE = None


class EmojiMatch:
    """
    Represents a match of a "recommended for general interchange" (RGI)
    emoji in a string.
    """

    __slots__ = ('emoji', 'start', 'end', 'data')

    def __init__(self, emoji: str, start: int,
                 end: int, data: Union[dict, None]):

        self.emoji = emoji
        """The emoji substring"""

        self.start = start
        """The start index of the match in the string"""

        self.end = end
        """The end index of the match in the string"""

        self.data = data
        """The entry from :data:`EMOJI_DATA` for this emoji or ``None`` if the emoji is non-RGI"""

    def data_copy(self) -> Dict[str, Any]:
        """
        Returns a copy of the data from :data:`EMOJI_DATA` for this match
        with the additional keys ``match_start`` and ``match_end``.
        """
        if self.data:
            emj_data = self.data.copy()
            emj_data['match_start'] = self.start
            emj_data['match_end'] = self.end
            return emj_data
        else:
            return {
                'match_start': self.start,
                'match_end': self.end
            }

    def is_zwj(self) -> bool:
        """
        Checks if this is a ZWJ-emoji.

        :returns: True if this is a ZWJ-emoji, False otherwise
        """

        return _ZWJ in self.emoji

    def split(self) -> Union['EmojiMatchZWJ', 'EmojiMatch']:
        """
        Splits a ZWJ-emoji into its constituents.

        :returns: An :class:`EmojiMatchZWJ` containing the "sub-emoji" if this is a ZWJ-emoji, otherwise self
        """

        if self.is_zwj():
            return EmojiMatchZWJ(self)
        else:
            return self

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}({self.emoji}, {self.start}:{self.end})'


class EmojiMatchZWJ(EmojiMatch):
    """
    Represents a match of multiple emoji in a string that were joined by
    zero-width-joiners (ZWJ/``\\u200D``)."""

    __slots__ = ('emojis', )

    def __init__(self, match: EmojiMatch):
        super().__init__(match.emoji, match.start, match.end, match.data)

        self.emojis = []
        """List of sub emoji as EmojiMatch objects"""

        i = match.start
        for e in match.emoji.split(_ZWJ):
            m = EmojiMatch(
                e, i, i+len(e), unicode_codes.EMOJI_DATA.get(e, None))
            self.emojis.append(m)
            i += len(e) + 1

    def join(self) -> str:
        """
        Joins a ZWJ-emoji into a string
        """

        return _ZWJ.join(e.emoji for e in self.emojis)

    def is_zwj(self) -> bool:
        return True

    def split(self) -> 'EmojiMatchZWJ':
        return self

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}({self.join()}, {self.start}:{self.end})'


class EmojiMatchZWJNonRGI(EmojiMatchZWJ):
    """
    Represents a match of multiple emoji in a string that were joined by
    zero-width-joiners (ZWJ/``\\u200D``). This class is only used for emoji
    that are not "recommended for general interchange" (non-RGI) by Unicode.org.
    The data property of this class is always None.
    """

    def __init__(self, first_emoji_match: EmojiMatch,
                 second_emoji_match: EmojiMatch):

        self.emojis = [first_emoji_match, second_emoji_match]
        """List of sub emoji as EmojiMatch objects"""

        self._update()

    def _update(self):
        self.emoji = _ZWJ.join(e.emoji for e in self.emojis)
        self.start = self.emojis[0].start
        self.end = self.emojis[-1].end
        self.data = None

    def _add(self, next_emoji_match: EmojiMatch):
        self.emojis.append(next_emoji_match)
        self._update()


class Token(NamedTuple):
    """
    A named tuple containing the matched string and its :class:`EmojiMatch` object if it is an emoji
    or a single character that is not a unicode emoji.
    """
    chars: str
    value: Union[str, EmojiMatch]


def tokenize(string, keep_zwj: bool) -> Iterator[Token]:
    """
    Finds unicode emoji in a string. Yields all normal characters as a named
    tuple :class:`Token` ``(char, char)`` and all emoji as :class:`Token` ``(chars, EmojiMatch)``.

    :param string: String contains unicode characters. MUST BE UNICODE.
    :param keep_zwj: Should ZWJ-characters (``\\u200D``) that join non-RGI emoji be
        skipped or should be yielded as normal characters
    :return: An iterable of tuples :class:`Token` ``(char, char)`` or :class:`Token` ``(chars, EmojiMatch)``
    """

    tree = get_search_tree()
    EMOJI_DATA = unicode_codes.EMOJI_DATA
    # result: [ Token(oldsubstring0, EmojiMatch), Token(char1, char1), ... ]
    result = []
    i = 0
    length = len(string)
    ignore = []  # index of chars in string that are skipped, i.e. the ZWJ-char in non-RGI-ZWJ-sequences
    while i < length:
        consumed = False
        char = string[i]
        if i in ignore:
            i += 1
            if char == _ZWJ and keep_zwj:
                result.append(Token(char, char))
            continue

        elif char in tree:
            j = i + 1
            sub_tree = tree[char]
            while j < length and string[j] in sub_tree:
                if j in ignore:
                    break
                sub_tree = sub_tree[string[j]]
                j += 1
            if 'data' in sub_tree:
                emj_data = sub_tree['data']
                code_points = string[i:j]

                # We cannot yield the result here, we need to defer
                # the call until we are sure that the emoji is finished
                # i.e. we're not inside an ongoing ZWJ-sequence
                match_obj = EmojiMatch(code_points, i, j, emj_data)

                i = j - 1
                consumed = True
                result.append(Token(code_points, match_obj))

        elif char == _ZWJ and result and result[-1].chars in EMOJI_DATA and i > 0 and string[i - 1] in tree:
            # the current char is ZWJ and the last match was an emoji
            ignore.append(i)
            if EMOJI_DATA[result[-1].chars]["status"] == unicode_codes.STATUS["component"]:
                # last match was a component, it could be ZWJ+EMOJI+COMPONENT
                # or ZWJ+COMPONENT
                i = i - sum(len(t.chars) for t in result[-2:])
                if string[i] == _ZWJ:
                    # It's ZWJ+COMPONENT, move one back
                    i += 1
                    del result[-1]
                else:
                    # It's ZWJ+EMOJI+COMPONENT, move two back
                    del result[-2:]
            else:
                # last match result[-1] was a normal emoji, move cursor
                # before the emoji
                i = i - len(result[-1].chars)
                del result[-1]
            continue

        elif result:
            yield from result
            result = []

        if not consumed and char != '\uFE0E' and char != '\uFE0F':
            result.append(Token(char, char))
        i += 1

    yield from result


def filter_tokens(matches: Iterator[Token], emoji_only: bool, join_emoji: bool) -> Iterator[Token]:
    """
    Filters the output of `tokenize()`

    :param matches: An iterable of tuples of the form ``(match_str, result)``
        where ``result`` is either an EmojiMatch or a string.
    :param emoji_only: If True, only EmojiMatch are returned in the output.
        If False all characters are returned
    :param join_emoji: If True, multiple EmojiMatch are merged into
        a single :class:`EmojiMatchZWJNonRGI` if they are separated only by a ZWJ.

    :return: An iterable of tuples :class:`Token` ``(char, char)``,
        :class:`Token` ``(chars, EmojiMatch)`` or :class:`Token` ``(chars, EmojiMatchZWJNonRGI)``
    """

    if not join_emoji and not emoji_only:
        yield from matches
        return

    if not join_emoji:
        for token in matches:
            if token.chars != _ZWJ:
                yield token
        return

    # Combine multiple EmojiMatch that are separated by ZWJs into
    # a single EmojiMatchZWJNonRGI
    previous_is_emoji = False
    previous_is_zwj = False
    pre_previous_is_emoji = False
    accumulator = []
    for token in matches:
        pre_previous_is_emoji = previous_is_emoji
        if previous_is_emoji and token.value == _ZWJ:
            previous_is_zwj = True
        elif isinstance(token.value, EmojiMatch):
            if pre_previous_is_emoji and previous_is_zwj:
                if isinstance(accumulator[-1].value, EmojiMatchZWJNonRGI):
                    accumulator[-1].value._add(token.value)
                    accumulator[-1] = Token(accumulator[-1].chars +
                                            _ZWJ + token.chars, accumulator[-1].value)
                else:
                    prev = accumulator.pop()
                    accumulator.append(
                        Token(prev.chars + _ZWJ + token.chars,
                              EmojiMatchZWJNonRGI(
                                  prev.value,
                                  token.value)))
            else:
                accumulator.append(token)
            previous_is_emoji = True
            previous_is_zwj = False
        else:
            # Other character, not an emoji
            previous_is_emoji = False
            previous_is_zwj = False
            yield from accumulator
            if not emoji_only:
                yield token
            accumulator = []
    yield from accumulator


def get_search_tree() -> Dict[str, Any]:
    """
    Generate a search tree for demojize().
    Example of a search tree::

        EMOJI_DATA =
        {'a': {'en': ':Apple:'},
        'b': {'en': ':Bus:'},
        'ba': {'en': ':Bat:'},
        'band': {'en': ':Beatles:'},
        'bandit': {'en': ':Outlaw:'},
        'bank': {'en': ':BankOfEngland:'},
        'bb': {'en': ':BB-gun:'},
        'c': {'en': ':Car:'}}

        _SEARCH_TREE =
        {'a': {'data': {'en': ':Apple:'}},
        'b': {'a': {'data': {'en': ':Bat:'},
                    'n': {'d': {'data': {'en': ':Beatles:'},
                                'i': {'t': {'data': {'en': ':Outlaw:'}}}},
                        'k': {'data': {'en': ':BankOfEngland:'}}}},
            'b': {'data': {'en': ':BB-gun:'}},
            'data': {'en': ':Bus:'}},
        'c': {'data': {'en': ':Car:'}}}

                   _SEARCH_TREE
                 /     |        ⧵
               /       |          ⧵
            a          b             c
            |        / |  ⧵          |
            |       /  |    ⧵        |
        :Apple:   ba  :Bus:  bb     :Car:
                 /  ⧵         |
                /    ⧵        |
              :Bat:    ban     :BB-gun:
                     /     ⧵
                    /       ⧵
                 band       bank
                /   ⧵         |
               /     ⧵        |
            bandi :Beatles:  :BankOfEngland:
               |
            bandit
               |
           :Outlaw:


    """
    global _SEARCH_TREE
    if _SEARCH_TREE is None:
        _SEARCH_TREE = {}
        for emj in unicode_codes.EMOJI_DATA:
            sub_tree = _SEARCH_TREE
            lastidx = len(emj) - 1
            for i, char in enumerate(emj):
                if char not in sub_tree:
                    sub_tree[char] = {}
                sub_tree = sub_tree[char]
                if i == lastidx:
                    sub_tree['data'] = unicode_codes.EMOJI_DATA[emj]
    return _SEARCH_TREE