kkmkorea committed
Commit
ee2270f
•
1 Parent(s): 1faa795

Upload 6 files

korscideberta-abstractcls.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
normalize.py ADDED
@@ -0,0 +1,199 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+
+ import re
+ import regex
+
+ from itertools import chain
+
+
+ class MosesPunctNormalizer:
+     """
+     This is a Python port of the Moses punctuation normalizer from
+     https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl
+     """
+
+     EXTRA_WHITESPACE = [  # lines 21 - 30
+         (r"\r", r""),
+         (r"\(", r" ("),
+         (r"\)", r") "),
+         (r" +", r" "),
+         (r"\) ([.!:?;,])", r")\g<1>"),
+         (r"\( ", r"("),
+         (r" \)", r")"),
+         (r"(\d) %", r"\g<1>%"),
+         (r" :", r":"),
+         (r" ;", r";"),
+     ]
+
+     NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')]  # lines 33 - 34
+
+     NORMALIZE_UNICODE = [  # lines 37 - 50
+         ("„", r'"'),
+         ("“", r'"'),
+         ("”", r'"'),
+         ("–", r"-"),
+         ("—", r" - "),
+         (r" +", r" "),
+         ("´", r"'"),
+         ("([a-zA-Z])‘([a-zA-Z])", r"\g<1>'\g<2>"),
+         ("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"),
+         ("‘", r"'"),
+         ("‚", r"'"),
+         ("’", r"'"),
+         (r"''", r'"'),
+         ("´´", r'"'),
+         ("…", r"..."),
+     ]
+
+     FRENCH_QUOTES = [  # lines 52 - 57
+         ("\u00A0«\u00A0", r'"'),
+         ("«\u00A0", r'"'),
+         ("«", r'"'),
+         ("\u00A0»\u00A0", r'"'),
+         ("\u00A0»", r'"'),
+         ("»", r'"'),
+     ]
+
+     HANDLE_PSEUDO_SPACES = [  # lines 59 - 67
+         ("\u00A0%", r"%"),
+         ("nº\u00A0", "nº "),
+         ("\u00A0:", r":"),
+         ("\u00A0ºC", " ºC"),
+         ("\u00A0cm", r" cm"),
+         ("\u00A0\\?", "?"),
+         ("\u00A0\\!", "!"),
+         ("\u00A0;", r";"),
+         (",\u00A0", r", "),
+         (r" +", r" "),
+     ]
+
+     EN_QUOTATION_FOLLOWED_BY_COMMA = [(r'"([,.]+)', r'\g<1>"')]
+
+     DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [
+         (r',"', r'",'),
+         (r'(\.+)"(\s*[^<])', r'"\g<1>\g<2>'),  # don't fix period at end of sentence
+     ]
+
+     DE_ES_CZ_CS_FR = [
+         ("(\\d)\u00A0(\\d)", r"\g<1>,\g<2>"),
+     ]
+
+     OTHER = [
+         ("(\\d)\u00A0(\\d)", r"\g<1>.\g<2>"),
+     ]
+
+     # Regex substitutions from replace-unicode-punctuation.perl
+     # https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
+     REPLACE_UNICODE_PUNCTUATION = [
+         ("，", ","),
+         (r"。\s*", ". "),
+         ("、", ","),
+         ("”", '"'),
+         ("“", '"'),
+         ("∶", ":"),
+         ("：", ":"),
+         ("？", "?"),
+         ("《", '"'),
+         ("》", '"'),
+         ("）", ")"),
+         ("！", "!"),
+         ("（", "("),
+         ("；", ";"),
+         ("」", '"'),
+         ("「", '"'),
+         ("０", "0"),
+         ("１", "1"),
+         ("２", "2"),
+         ("３", "3"),
+         ("４", "4"),
+         ("５", "5"),
+         ("６", "6"),
+         ("７", "7"),
+         ("８", "8"),
+         ("９", "9"),
+         (r"．\s*", ". "),
+         ("～", "~"),
+         ("’", "'"),
+         ("…", "..."),
+         ("━", "-"),
+         ("〈", "<"),
+         ("〉", ">"),
+         ("【", "["),
+         ("】", "]"),
+         ("％", "%"),
+     ]
+
+     def __init__(
+         self,
+         lang="en",
+         penn=True,
+         norm_quote_commas=True,
+         norm_numbers=True,
+         pre_replace_unicode_punct=False,
+         post_remove_control_chars=False,
+     ):
+         """
+         :param lang: The two-letter language code.
+         :type lang: str
+         :param penn: Normalize Penn Treebank style quotations.
+         :type penn: bool
+         :param norm_quote_commas: Normalize quotations and commas
+         :type norm_quote_commas: bool
+         :param norm_numbers: Normalize numbers
+         :type norm_numbers: bool
+         """
+         self.substitutions = [
+             self.EXTRA_WHITESPACE,
+             self.NORMALIZE_UNICODE,
+             self.FRENCH_QUOTES,
+             self.HANDLE_PSEUDO_SPACES,
+         ]
+
+         if penn:  # Adds the penn substitutions after extra_whitespace regexes.
+             self.substitutions.insert(1, self.NORMALIZE_UNICODE_IF_NOT_PENN)
+
+         if norm_quote_commas:
+             if lang == "en":
+                 self.substitutions.append(self.EN_QUOTATION_FOLLOWED_BY_COMMA)
+             elif lang in ["de", "es", "fr"]:
+                 self.substitutions.append(self.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA)
+
+         if norm_numbers:
+             if lang in ["de", "es", "cz", "cs", "fr"]:
+                 self.substitutions.append(self.DE_ES_CZ_CS_FR)
+             else:
+                 self.substitutions.append(self.OTHER)
+
+         self.substitutions = list(chain(*self.substitutions))
+
+         self.pre_replace_unicode_punct = pre_replace_unicode_punct
+         self.post_remove_control_chars = post_remove_control_chars
+
+     def normalize(self, text):
+         """
+         Returns a string with normalized punctuation.
+         """
+         # Optionally, replace unicode puncts BEFORE normalization.
+         if self.pre_replace_unicode_punct:
+             text = self.replace_unicode_punct(text)
+
+         # Actual normalization.
+         for regexp, substitution in self.substitutions:
+             # print(regexp, substitution)
+             text = re.sub(regexp, substitution, str(text))
+             # print(text)
+
+         # Optionally, remove control characters AFTER normalization.
+         if self.post_remove_control_chars:
+             text = self.remove_control_chars(text)
+
+         return text.strip()
+
+     def replace_unicode_punct(self, text):
+         for regexp, substitution in self.REPLACE_UNICODE_PUNCTUATION:
+             text = re.sub(regexp, substitution, str(text))
+         return text
+
+     def remove_control_chars(self, text):
+         return regex.sub(r"\p{C}", "", text)
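
A minimal usage sketch for the normalizer above (an editor's illustration, not part of the commit; the variable name and sample string are arbitrary):

    from normalize import MosesPunctNormalizer

    norm = MosesPunctNormalizer(lang="en")
    # French quotes, the long dash, the ellipsis and doubled spaces should collapse to plain ASCII
    print(norm.normalize("«Hello»  —  world…"))  # '"Hello" - world...'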
tokenization_korscideberta.py ADDED
@@ -0,0 +1,356 @@
+ # coding=utf-8
+ # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team and Jangwon Park
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ Tokenization classes for KoBERT model """
+
+
+ import logging
+ import os
+ import re
+ import unicodedata
+ from shutil import copyfile
+
+ from transformers import PreTrainedTokenizer
+
+ # 2023-07-28: added morpheme segmentation (Mecab) and Unicode normalization
+ from konlpy.tag import Mecab
+ from unicode import join_jamos
+ from normalize import MosesPunctNormalizer
+ nor = MosesPunctNormalizer()
+
+ # Unicode Hangul syllables: first code point 44032 (U+AC00), last 55203 (U+D7A3)
+ BASE_CODE, CHOSUNG, JUNGSUNG = 44032, 588, 28
+ # Initial consonants (choseong), indices 0 ~ 18
+ CHOSUNG_LIST = ['ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
+ # Medial vowels (jungseong), indices 0 ~ 20
+ JUNGSUNG_LIST = ['ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ']
+ # Final consonants (jongseong), indices 0 ~ 27 (index 0 means "no final")
+ JONGSUNG_LIST = [' ', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
+ def splitjamo(string):
+     sp_list = list(string)
+     result = []
+     for keyword in sp_list:
+         # Decompose only Hangul characters into jamo
+         if re.match('.*[ㄱ-ㅎㅏ-ㅣ가-힣]+.*', keyword) is not None:
+             # initial consonant (choseong)
+             char_code = ord(keyword) - BASE_CODE
+             char1 = int(char_code / CHOSUNG)
+             try:
+                 result.append(CHOSUNG_LIST[char1])
+             except:
+                 return string
+                 # print("Err: " + str(char1))
+             # medial vowel (jungseong)
+             char2 = int((char_code - (CHOSUNG * char1)) / JUNGSUNG)
+             result.append(JUNGSUNG_LIST[char2])
+             # final consonant (jongseong)
+             char3 = int((char_code - (CHOSUNG * char1) - (JUNGSUNG * char2)))
+             result.append(JONGSUNG_LIST[char3])
+         else:
+             result.append(keyword)
+     return result
+ def has_coda(word):
+     return (ord(word[-1]) - 44032) % 28 == 0
+ def _replace_unicode(line):
+     if line is None:
+         return ""
+     line = line.replace("—", '-').replace("―", "-").replace("–", "-").replace("＂", '"').replace("＇", "'").replace("‹", "<").replace("›", ">").replace("‚", "'").replace("‛", "'").replace("„", '"').replace("‟", '"').replace("«", '<').replace("»", '>').replace("˝", '"').replace("（", '(').replace("）", ')').replace("『", '"').replace("』", '"').replace("“", '"').replace("”", '"').replace("‘", "'").replace("’", "'").replace("《", "<").replace("》", ">").replace("〈", "<").replace("〉", ">").replace("「", "'").replace("」", "'").replace("【", "[").replace("】", "]").replace("〔", "[").replace("〕", "]").replace("［", "[").replace("］", "]").replace("｛", "{").replace("｝", "}")
+     line = nor.replace_unicode_punct(line)
+     return line
+ def _mecab(line):
+     mecab = Mecab()
+     # Mecab POS tag reference: VV verb, VA adjective, VX auxiliary predicate, VCP positive copula, VCN negative copula, JKS subject particle, JKC complement particle, ..., XSN noun-derivation suffix, XSV verb-derivation suffix, XSA adjective-derivation suffix, EP pre-final ending, EF sentence-final ending, EC connective ending, ETN nominalizing ending, ETM adnominal ending
+
+     pdoc = []
+     morphs = []
+
+     poss = mecab.pos(line)
+     for pos in poss:
+         morphs.append(pos[0])
+     '''
+     pdoc.append(" ".join(morphs))
+     return pdoc
+     '''
+     return " ".join(morphs)
+
+ logger = logging.getLogger(__name__)
+
+ VOCAB_FILES_NAMES = {
+     "vocab_file": "spm.model",
+     "vocab_txt": "vocab.txt",
+ }
+
+ PRETRAINED_VOCAB_FILES_MAP = {
+     "vocab_file": {
+         "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/tokenizer_78b3253a26.model",
+         "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/tokenizer_78b3253a26.model",
+         "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/tokenizer_78b3253a26.model",
+     },
+     "vocab_txt": {
+         "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/vocab.txt",
+         "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/vocab.txt",
+         "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/vocab.txt",
+     },
+ }
+
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+     "monologg/kobert": 512,
+     "monologg/kobert-lm": 512,
+     "monologg/distilkobert": 512,
+ }
+
+ PRETRAINED_INIT_CONFIGURATION = {
+     "monologg/kobert": {"do_lower_case": False},
+     "monologg/kobert-lm": {"do_lower_case": False},
+     "monologg/distilkobert": {"do_lower_case": False},
+ }
+
+ SPIECE_UNDERLINE = "▁"
+
+
+ class DebertaV2Tokenizer(PreTrainedTokenizer):
+     """
+     SentencePiece based tokenizer. Peculiarities:
+         - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+     def __init__(
+         self,
+         vocab_file,
+         vocab_txt,
+         do_lower_case=False,
+         remove_space=True,
+         keep_accents=False,
+         unk_token="<unk>",
+         sep_token="<s>",
+         pad_token="<pad>",
+         cls_token="<cls>",
+         mask_token="<mask>",
+         **kwargs,
+     ):
+         super().__init__(
+             unk_token=unk_token,
+             sep_token=sep_token,
+             pad_token=pad_token,
+             cls_token=cls_token,
+             mask_token=mask_token,
+             **kwargs,
+         )
+
+         # Build vocab
+         self.token2idx = dict()
+         self.idx2token = []
+         with open(vocab_txt, "r", encoding="utf-8") as f:
+             for idx, token in enumerate(f):
+                 token = token.strip()
+                 self.token2idx[token] = idx
+                 self.idx2token.append(token)
+
+         try:
+             import sentencepiece as spm
+         except ImportError:
+             logger.warning(
+                 "You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece "
+                 "pip install sentencepiece"
+             )
+
+         self.do_lower_case = do_lower_case
+         self.remove_space = remove_space
+         self.keep_accents = keep_accents
+         self.vocab_file = vocab_file
+         self.vocab_txt = vocab_txt
+
+         self.sp_model = spm.SentencePieceProcessor()
+         self.sp_model.Load(vocab_file)
+
+     @property
+     def vocab_size(self):
+         return len(self.idx2token)
+
+     def get_vocab(self):
+         return dict(self.token2idx, **self.added_tokens_encoder)
+
+     def __getstate__(self):
+         state = self.__dict__.copy()
+         state["sp_model"] = None
+         return state
+
+     def __setstate__(self, d):
+         self.__dict__ = d
+         try:
+             import sentencepiece as spm
+         except ImportError:
+             logger.warning(
+                 "You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece "
+                 "pip install sentencepiece"
+             )
+         self.sp_model = spm.SentencePieceProcessor()
+         self.sp_model.Load(self.vocab_file)
+
+     def preprocess_text(self, inputs):
+         if self.remove_space:
+             outputs = " ".join(inputs.strip().split())
+         else:
+             outputs = inputs
+         outputs = outputs.replace("``", '"').replace("''", '"')
+
+         if not self.keep_accents:
+             outputs = unicodedata.normalize("NFKD", outputs)
+             outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
+         if self.do_lower_case:
+             outputs = outputs.lower()
+
+         return outputs
+
+     def _tokenize(self, text):
+         """Tokenize a string."""
+         text = self.preprocess_text(text)
+         # print('text: ' + text)
+         # logger.info("text ({}) ".format(text))
+         text = _replace_unicode(text)  # Unicode normalization
+         text = _mecab(text)  # morpheme segmentation (Mecab)
+         # print('text: ' + str(text))
+         # logger.info("text ({}) ".format(text))
+         pieces = self.sp_model.encode(text, out_type=str)
+         new_pieces = []
+         for piece in pieces:
+             if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
+                 cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
+                 if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
+                     if len(cur_pieces[0]) == 1:
+                         cur_pieces = cur_pieces[1:]
+                     else:
+                         cur_pieces[0] = cur_pieces[0][1:]
+                 cur_pieces.append(piece[-1])
+                 new_pieces.extend(cur_pieces)
+             else:
+                 new_pieces.append(piece)
+         '''
+         return_pieces = []
+         for n in new_pieces:
+             if isinstance(n, list):
+                 for nn in n:
+                     return_pieces.append(nn)
+             else:
+                 return_pieces.append(n)
+         return return_pieces
+         '''
+         return new_pieces
+
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str/unicode) to an id using the vocab."""
+         return self.token2idx.get(token, self.token2idx[self.unk_token])
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) to a token (string/unicode) using the vocab."""
+         return self.idx2token[index]
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (strings for sub-words) into a single string."""
+         out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+         return out_string
+
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         """
+         Build model inputs from a sequence or a pair of sequences for sequence classification tasks
+         by concatenating and adding special tokens.
+         A KoBERT sequence has the following format:
+             single sequence: [CLS] X [SEP]
+             pair of sequences: [CLS] A [SEP] B [SEP]
+         """
+         if token_ids_1 is None:
+             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+         cls = [self.cls_token_id]
+         sep = [self.sep_token_id]
+         return cls + token_ids_0 + sep + token_ids_1 + sep
+
+     def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+         """
+         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+         Args:
+             token_ids_0: list of ids (must not contain special tokens)
+             token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+                 for sequence pairs
+             already_has_special_tokens: (default False) Set to True if the token list is already formatted with
+                 special tokens for the model
+         Returns:
+             A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+
+         if already_has_special_tokens:
+             if token_ids_1 is not None:
+                 raise ValueError(
+                     "You should not supply a second sequence if the provided sequence of "
+                     "ids is already formatted with special tokens for the model."
+                 )
+             return list(
+                 map(
+                     lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
+                     token_ids_0,
+                 )
+             )
+
+         if token_ids_1 is not None:
+             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+         return [1] + ([0] * len(token_ids_0)) + [1]
+
+     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+         """
+         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+         A KoBERT sequence pair mask has the following format:
+             0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+             | first sequence    | second sequence
+         If token_ids_1 is None, only returns the first portion of the mask (0s).
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+     def save_vocabulary(self, save_directory):
+         """Save the sentencepiece vocabulary (copy original file) and special tokens file
+         to a directory.
+         """
+         if not os.path.isdir(save_directory):
+             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+             return
+
+         # 1. Save sentencepiece model
+         out_vocab_model = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_model):
+             copyfile(self.vocab_file, out_vocab_model)
+
+         # 2. Save vocab.txt
+         index = 0
+         out_vocab_txt = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_txt"])
+         with open(out_vocab_txt, "w", encoding="utf-8") as writer:
+             for token, token_index in sorted(self.token2idx.items(), key=lambda kv: kv[1]):
+                 if index != token_index:
+                     logger.warning(
+                         "Saving vocabulary to {}: vocabulary indices are not consecutive."
+                         " Please check that the vocabulary is not corrupted!".format(out_vocab_txt)
+                     )
+                     index = token_index
+                 writer.write(token + "\n")
+                 index += 1
+
+         return out_vocab_model, out_vocab_txt
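
A rough loading sketch for the tokenizer class above (an editor's illustration, not part of the commit). It assumes the repository's spm.model and vocab.txt sit next to the script, that konlpy/Mecab and sentencepiece are installed, and that the installed transformers release still accepts this KoBERT-style tokenizer layout; the paths and sample sentence are placeholders:

    from tokenization_korscideberta import DebertaV2Tokenizer

    tokenizer = DebertaV2Tokenizer(vocab_file="spm.model", vocab_txt="vocab.txt")
    # _tokenize() normalizes Unicode, runs Mecab morpheme segmentation, then SentencePiece
    tokens = tokenizer.tokenize("한국어 과학기술 논문 초록")  # "Korean sci-tech paper abstract"
    ids = tokenizer.convert_tokens_to_ids(tokens)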
tokenizer_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "do_lower_case": false,
+   "vocab_type": "spm"
+ }
unicode.py ADDED
@@ -0,0 +1,279 @@
+ __all__ = ["split_syllable_char", "split_syllables",
+            "join_jamos", "join_jamos_char",
+            "CHAR_INITIALS", "CHAR_MEDIALS", "CHAR_FINALS"]
+
+ import itertools
+
+ INITIAL = 0x001
+ MEDIAL = 0x010
+ FINAL = 0x100
+ CHAR_LISTS = {
+     INITIAL: list(map(chr, [
+         0x3131, 0x3132, 0x3134, 0x3137, 0x3138, 0x3139,
+         0x3141, 0x3142, 0x3143, 0x3145, 0x3146, 0x3147,
+         0x3148, 0x3149, 0x314a, 0x314b, 0x314c, 0x314d,
+         0x314e
+     ])),
+     MEDIAL: list(map(chr, [
+         0x314f, 0x3150, 0x3151, 0x3152, 0x3153, 0x3154,
+         0x3155, 0x3156, 0x3157, 0x3158, 0x3159, 0x315a,
+         0x315b, 0x315c, 0x315d, 0x315e, 0x315f, 0x3160,
+         0x3161, 0x3162, 0x3163
+     ])),
+     FINAL: list(map(chr, [
+         0x3131, 0x3132, 0x3133, 0x3134, 0x3135, 0x3136,
+         0x3137, 0x3139, 0x313a, 0x313b, 0x313c, 0x313d,
+         0x313e, 0x313f, 0x3140, 0x3141, 0x3142, 0x3144,
+         0x3145, 0x3146, 0x3147, 0x3148, 0x314a, 0x314b,
+         0x314c, 0x314d, 0x314e
+     ]))
+ }
+ CHAR_INITIALS = CHAR_LISTS[INITIAL]
+ CHAR_MEDIALS = CHAR_LISTS[MEDIAL]
+ CHAR_FINALS = CHAR_LISTS[FINAL]
+ CHAR_SETS = {k: set(v) for k, v in CHAR_LISTS.items()}
+ CHARSET = set(itertools.chain(*CHAR_SETS.values()))
+ CHAR_INDICES = {k: {c: i for i, c in enumerate(v)}
+                 for k, v in CHAR_LISTS.items()}
+
+
+ def is_hangul_syllable(c):
+     return 0xac00 <= ord(c) <= 0xd7a3  # Hangul Syllables
+
+
+ def is_hangul_jamo(c):
+     return 0x1100 <= ord(c) <= 0x11ff  # Hangul Jamo
+
+
+ def is_hangul_compat_jamo(c):
+     return 0x3130 <= ord(c) <= 0x318f  # Hangul Compatibility Jamo
+
+
+ def is_hangul_jamo_exta(c):
+     return 0xa960 <= ord(c) <= 0xa97f  # Hangul Jamo Extended-A
+
+
+ def is_hangul_jamo_extb(c):
+     return 0xd7b0 <= ord(c) <= 0xd7ff  # Hangul Jamo Extended-B
+
+
+ def is_hangul(c):
+     return (is_hangul_syllable(c) or
+             is_hangul_jamo(c) or
+             is_hangul_compat_jamo(c) or
+             is_hangul_jamo_exta(c) or
+             is_hangul_jamo_extb(c))
+
+
+ def is_supported_hangul(c):
+     return is_hangul_syllable(c) or is_hangul_compat_jamo(c)
+
+
+ def check_hangul(c, jamo_only=False):
+     if not ((jamo_only or is_hangul_compat_jamo(c)) or is_supported_hangul(c)):
+         raise ValueError(f"'{c}' is not a supported hangul character. "
+                          f"'Hangul Syllables' (0xac00 ~ 0xd7a3) and "
+                          f"'Hangul Compatibility Jamos' (0x3130 ~ 0x318f) are "
+                          f"supported at the moment.")
+
+
+ def get_jamo_type(c):
+     check_hangul(c)
+     assert is_hangul_compat_jamo(c), f"not a jamo: {ord(c):x}"
+     return sum(t for t, s in CHAR_SETS.items() if c in s)
+
+
+ def split_syllable_char(c):
+     """
+     Splits a given Korean syllable into its components. Each component is
+     represented by Unicode in 'Hangul Compatibility Jamo' range.
+
+     Arguments:
+         c: A Korean character.
+
+     Returns:
+         A triple (initial, medial, final) of Hangul Compatibility Jamos.
+         If no jamo corresponds to a position, `None` is returned there.
+
+     Example:
+         >>> split_syllable_char("안")
+         ("ㅇ", "ㅏ", "ㄴ")
+         >>> split_syllable_char("고")
+         ("ㄱ", "ㅗ", None)
+         >>> split_syllable_char("ㅗ")
+         (None, "ㅗ", None)
+         >>> split_syllable_char("ㅇ")
+         ("ㅇ", None, None)
+     """
+     check_hangul(c)
+     if len(c) != 1:
+         raise ValueError("Input string must have exactly one character.")
+
+     init, med, final = None, None, None
+     if is_hangul_syllable(c):
+         offset = ord(c) - 0xac00
+         x = (offset - offset % 28) // 28
+         init, med, final = x // 21, x % 21, offset % 28
+         if not final:
+             final = None
+         else:
+             final -= 1
+     else:
+         pos = get_jamo_type(c)
+         if pos & INITIAL == INITIAL:
+             pos = INITIAL
+         elif pos & MEDIAL == MEDIAL:
+             pos = MEDIAL
+         elif pos & FINAL == FINAL:
+             pos = FINAL
+         idx = CHAR_INDICES[pos][c]
+         if pos == INITIAL:
+             init = idx
+         elif pos == MEDIAL:
+             med = idx
+         else:
+             final = idx
+     return tuple(CHAR_LISTS[pos][idx] if idx is not None else None
+                  for pos, idx in
+                  zip([INITIAL, MEDIAL, FINAL], [init, med, final]))
+
+
+ def split_syllables(s, ignore_err=True, pad=None):
+     """
+     Performs syllable-split on a string.
+
+     Arguments:
+         s (str): A string (possibly mixed with non-Hangul characters).
+         ignore_err (bool): If set False, it ensures that all characters in
+             the string are Hangul-splittable and throws a ValueError otherwise.
+             (default: True)
+         pad (str): Pad empty jamo positions (initial, medial, or final) with
+             `pad` character. This is useful for cases where fixed-length
+             strings are needed. (default: None)
+
+     Returns:
+         Hangul-split string
+
+     Example:
+         >>> split_syllables("안녕하세요")
+         "ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ"
+         >>> split_syllables("안녕하세요~~", ignore_err=False)
+         ValueError: encountered an unsupported character: ~ (0x7e)
+         >>> split_syllables("안녕하세요ㅛ", pad="x")
+         'ㅇㅏㄴㄴㅕㅇㅎㅏxㅅㅔxㅇㅛxxㅛx'
+     """
+
+     def try_split(c):
+         try:
+             return split_syllable_char(c)
+         except ValueError:
+             if ignore_err:
+                 return (c,)
+             raise ValueError(f"encountered an unsupported character: "
+                              f"{c} (0x{ord(c):x})")
+
+     s = map(try_split, s)
+     if pad is not None:
+         tuples = map(lambda x: tuple(pad if y is None else y for y in x), s)
+     else:
+         tuples = map(lambda x: filter(None, x), s)
+     return "".join(itertools.chain(*tuples))
+
+
+ def join_jamos_char(init, med, final=None):
+     """
+     Combines jamos into a single syllable.
+
+     Arguments:
+         init (str): Initial jamo.
+         med (str): Medial jamo.
+         final (str): Final jamo. If not supplied, the syllable is formed
+             without a final jamo. (default: None)
+
+     Returns:
+         A Korean syllable.
+     """
+     chars = (init, med, final)
+     for c in filter(None, chars):
+         check_hangul(c, jamo_only=True)
+
+     idx = tuple(CHAR_INDICES[pos][c] if c is not None else c
+                 for pos, c in zip((INITIAL, MEDIAL, FINAL), chars))
+     init_idx, med_idx, final_idx = idx
+     # final index must be shifted once as
+     # final index with 0 points to syllables without final
+     final_idx = 0 if final_idx is None else final_idx + 1
+     return chr(0xac00 + 28 * 21 * init_idx + 28 * med_idx + final_idx)
+
+
+ def join_jamos(s, ignore_err=True):
+     """
+     Combines a sequence of jamos to produce a sequence of syllables.
+
+     Arguments:
+         s (str): A string (possibly mixed with non-jamo characters).
+         ignore_err (bool): If set False, it will ensure that all characters
+             will be consumed for the making of syllables. It will throw a
+             ValueError when it fails to do so. (default: True)
+
+     Returns:
+         A string
+
+     Example:
+         >>> join_jamos("ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ")
+         "안녕하세요"
+         >>> join_jamos("ㅇㅏㄴㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ")
+         "안ㄴ녕하세요"
+     """
+     last_t = 0
+     queue = []
+     new_string = ""
+
+     def flush(n=0):
+         new_queue = []
+         while len(queue) > n:
+             new_queue.append(queue.pop())
+         if len(new_queue) == 1:
+             if not ignore_err:
+                 raise ValueError(f"invalid jamo character: {new_queue[0]}")
+             result = new_queue[0]
+         elif len(new_queue) >= 2:
+             try:
+                 result = join_jamos_char(*new_queue)
+             except (ValueError, KeyError):
+                 # Invalid jamo combination
+                 if not ignore_err:
+                     raise ValueError(f"invalid jamo characters: {new_queue}")
+                 result = "".join(new_queue)
+         else:
+             result = None
+         return result
+
+     for c in s:
+         if c not in CHARSET:
+             if queue:
+                 new_c = flush() + c
+             else:
+                 new_c = c
+             last_t = 0
+         else:
+             t = get_jamo_type(c)
+             new_c = None
+             if t & FINAL == FINAL:
+                 if not (last_t == MEDIAL):
+                     new_c = flush()
+             elif t == INITIAL:
+                 new_c = flush()
+             elif t == MEDIAL:
+                 if last_t & INITIAL == INITIAL:
+                     new_c = flush(1)
+                 else:
+                     new_c = flush()
+             last_t = t
+             queue.insert(0, c)
+         if new_c:
+             new_string += new_c
+     if queue:
+         new_string += flush()
+     return new_string
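
A small round-trip sketch for the jamo helpers above (an editor's illustration, not part of the commit), mirroring the docstring examples:

    from unicode import split_syllables, join_jamos

    jamo = split_syllables("안녕하세요")  # 'ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ'
    assert join_jamos(jamo) == "안녕하세요"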
vocab.txt ADDED
The diff for this file is too large to render. See raw diff