|
import random |
|
import torch |
|
import torch.nn as nn |
|
|
|
from jamotools import split_syllables, join_jamos |
|
from transformers import PretrainedConfig, PreTrainedModel, AutoConfig |
|
|
|
class HangulAugmentatorConfig(PretrainedConfig):
    """Settings container for ``HangulAugmentator``.

    The class only stores the values it is given; ``HangulAugmentator``
    reads them back through ``self.config`` when it is called.

    Args:
        p: Threshold compared against ``random.random()`` by the augmentator
            for every candidate position.
        do_link_next: Enables the ``link_next`` pass.
        do_link_before: Enables the ``link_before`` pass.
        do_replace_similar: Enables the per-character ``replace_similar`` pass.
        do_add_jong: Enables the per-character ``add_jong`` pass.
        **kwargs: Forwarded untouched to ``PretrainedConfig.__init__``.
    """

    model_type = "hangul_augmentator"

    def __init__(
        self,
        p=0.5,
        do_link_next=True,
        do_link_before=True,
        do_replace_similar=True,
        do_add_jong=True,
        **kwargs
    ):
        super().__init__(**kwargs)

        # Shared threshold used by all passes.
        self.p = p

        # One switch per augmentation pass.
        (
            self.do_link_next,
            self.do_link_before,
            self.do_replace_similar,
            self.do_add_jong,
        ) = (
            do_link_next,
            do_link_before,
            do_replace_similar,
            do_add_jong,
        )
|
|
|
|
|
class HangulAugmentator(PreTrainedModel): |
|
config_class = HangulAugmentatorConfig |
|
|
|
def __init__(self, config): |
|
super().__init__(config) |
|
self.temp_module = torch.nn.Parameter(torch.ones(1)) |
|
|
|
self.ja_similar_dict = { |
|
'γ
': ['γ
', 'γ
'], |
|
'γ±': ['γ²', 'γ
'], |
|
'γ·': ['γΈ', 'γ
'], |
|
'γ²': ['γ²', 'γ
'], |
|
'γ
': ['γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ
': ['γΈ', 'γ
', 'γ·'], |
|
'γ
': ['γ²', 'γ±'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γΈ': ['γ
', 'γ·'], |
|
'γ
': ['γ
', 'γ
', 'γ
'], |
|
'γ
': ['γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
} |
|
self.mo_similar_dict = { |
|
'γ
': ['γ
'], |
|
'γ
': ['γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ
': ['γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ
‘': ['γ
'], |
|
'γ
£': ['γ
'], |
|
'γ
': ['γ
'], |
|
'γ
': [ 'γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ
': ['γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ
': ['γ
'], |
|
'γ
': ['γ
'], |
|
'γ
': ['γ
'], |
|
'γ
’': ['γ
'], |
|
'γ
': ['γ
'], |
|
'γ
': ['γ
', 'γ
', 'γ
'], |
|
'γ
': ['γ
'] |
|
} |
|
self.jong_link_dict = { |
|
'γ
': ['γ
', 'γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ±': ['γ±', 'γ±'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ½': ['γΉ', 'γ
'], |
|
'γΏ': ['γΉ', 'γ
'], |
|
'γ΅': ['γ΄', 'γ
'], |
|
'γ²': ['γ²', 'γ²'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ΄': ['γ΄', 'γ΄'], |
|
'γ·': ['γ·', 'γ·'], |
|
'γ
': ['γΉ', 'γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γΊ': ['γΉ', 'γ±'], |
|
'γΌ': ['γΉ', 'γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γΆ': ['γ΄', 'γ
'], |
|
'γΉ': ['γΉ', 'γΉ'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ³': ['γ±', 'γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γΎ': ['γΉ', 'γ
'], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ»': ['γΉ', 'γ
'] |
|
} |
|
|
|
self.jong_similar_dict = { |
|
'γΉ': [ |
|
'γ±', 'γ²', 'γ³', 'γ΄', 'γ΅', 'γΆ', 'γ·', 'γΊ', 'γ»', 'γΌ', 'γ½', |
|
'γΎ', 'γΏ', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
' |
|
], |
|
'γ²': [ |
|
'γ±', 'γ³', 'γ΄', 'γΉ', 'γ»', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
' |
|
], |
|
'γ
': [ |
|
'γ±', 'γ²', 'γ΄', 'γΆ', 'γ·', 'γΉ', 'γΊ', 'γ»', 'γ½', 'γ
', |
|
'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
' |
|
], |
|
'γ
': [ |
|
'γ±', 'γ²', 'γ³', 'γ΄', 'γΆ', 'γ·', 'γΉ', 'γΊ', 'γ»', 'γΌ', |
|
'γΎ', 'γΏ', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
' |
|
], |
|
'γ΄': [ |
|
'γ±', 'γ²', 'γ³', 'γ΅', 'γΆ', 'γ·', 'γΉ', 'γΊ', 'γ»', 'γΌ', |
|
'γΎ', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
' |
|
], |
|
'γ
': ['γ
'], |
|
'γ
': [ |
|
'γ±', 'γ²', 'γ³', 'γ΄', 'γ΅', 'γΆ', 'γ·', 'γΉ', 'γΊ', 'γ»', |
|
'γΌ', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
' |
|
], |
|
'γ
': ['γ
', 'γ
'], |
|
'γ
': [ |
|
'γ±', 'γ΄', 'γ΅', 'γ·', 'γΉ', 'γΊ', 'γ»', 'γΌ', 'γ
', |
|
'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
' |
|
], |
|
'γΆ': ['γ΄', 'γ΅'], |
|
'γ
': [ |
|
'γ±', 'γ²', 'γ΄', 'γ΅', 'γΆ', 'γ·', 'γΉ', 'γΊ', 'γ»', |
|
'γΌ', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
' |
|
], |
|
'γ±': [ |
|
'γ²', 'γ³', 'γ΄', 'γ΅', 'γΆ', 'γ·', 'γΉ', 'γΊ', 'γ»', |
|
'γΌ', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
' |
|
], |
|
'γ
': [ |
|
'γ±', 'γ²', 'γ΄', 'γ·', 'γΉ', 'γΊ', 'γ»', 'γ
', 'γ
', |
|
'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
' |
|
], |
|
'γΌ': ['γΉ', 'γΊ', 'γ»', 'γ
'], |
|
'γ·': [ |
|
'γ±', 'γ²', 'γ΄', 'γΉ', 'γΊ', 'γ»', 'γΌ', 'γ
', |
|
'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
' |
|
], |
|
'γ
': [ |
|
'γ±', 'γ΄', 'γΆ', 'γ·', 'γΉ', 'γΊ', 'γ»', 'γ
', |
|
'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
' |
|
], |
|
'γ
': ['γ±', 'γ³', 'γ΄', 'γΉ', 'γ»', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
'], |
|
'γ
': ['γ·', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
', 'γ
'], |
|
'γ΅': ['γ΄', 'γΆ'], |
|
'γ»': ['γΉ', 'γΊ', 'γ½', 'γ
'], |
|
'γ
': ['γΉ', 'γΊ', 'γ»'], |
|
'γΊ': ['γΉ', 'γ»', 'γΌ', 'γ
'], |
|
'γ
': ['γ±', 'γ³', 'γ
'], |
|
'γ³': ['γ±'], |
|
'γΎ': ['γΉ'], |
|
} |
|
|
|
def __call__(self, sentence): |
|
if self.config.do_link_next: |
|
sentence = self.link_next(sentence, self.config.p) |
|
if self.config.do_link_before: |
|
sentence = self.link_before(sentence, self.config.p) |
|
|
|
if self.config.do_replace_similar: |
|
chars = [] |
|
for char in sentence: |
|
if self.config.p>=random.random(): |
|
chars.append(self.replace_similar(char)) |
|
else: |
|
chars.append(char) |
|
sentence = ''.join(chars) |
|
|
|
if self.config.do_add_jong: |
|
chars = [] |
|
for char in sentence: |
|
if self.config.p>=random.random(): |
|
chars.append(self.add_jong(char)) |
|
else: |
|
chars.append(char) |
|
sentence = ''.join(chars) |
|
return sentence |
|
|
|
|
|
def _link_next(self, char1, char2): |
|
if len((char1+char2).strip())!=2: |
|
return char1, char2 |
|
if not (0xAC00<= ord(char1) <=0xD7A3 and 0xAC00<= ord(char2) <=0xD7A3): |
|
return char1, char2 |
|
char1_jamo = list(split_syllables(char1)) |
|
if len(char1_jamo)!=3: |
|
return char1, char2 |
|
char2_jamo = list(split_syllables(char2)) |
|
if char2_jamo[0]!='γ
': |
|
return char1, char2 |
|
new_jong, new_cho = self.jong_link_dict[char1_jamo[-1]] |
|
new_char1 = join_jamos(char1_jamo[:2] + [new_jong])[:1] |
|
new_char2 = join_jamos([new_cho] + char2_jamo[1:])[:1] |
|
return new_char1, new_char2 |
|
|
|
def link_next(self, sentence, p):
    """Apply ``_link_next`` to adjacent character pairs of ``sentence``.

    Pairs are visited left to right. A pair is processed only when
    ``p >= random.random()``; because the buffer is updated in place, a
    rewritten right-hand character is what the following pair sees.

    NOTE(review): this method is defined a second time further down in the
    class; the later definition is the one Python binds.

    Args:
        sentence: Text to process.
        p: Threshold compared against ``random.random()`` for every pair.

    Returns:
        The processed text.
    """
    buf = list(sentence)
    for right in range(1, len(buf)):
        left = right - 1
        if p >= random.random():
            buf[left], buf[right] = self._link_next(buf[left], buf[right])
    return ''.join(buf)
|
|
|
def _link_before(self, char1, char2): |
|
if len((char1+char2).strip())!=2: |
|
return char1, char2 |
|
if not (0xAC00<= ord(char1) <=0xD7A3 and 0xAC00<= ord(char2) <=0xD7A3): |
|
return char1, char2 |
|
char1_jamo = list(split_syllables(char1)) |
|
if len(char1_jamo)!=2: |
|
return char1, char2 |
|
char2_jamo = list(split_syllables(char2)) |
|
new_char1 = join_jamos(char1_jamo[:2] + char2_jamo[:1])[:1] |
|
return new_char1, char2 |
|
|
|
def link_before(self, sentence, p):
    """Apply ``_link_before`` to adjacent character pairs of ``sentence``.

    Pairs are visited left to right. A pair is processed only when
    ``p >= random.random()``; the buffer is updated in place, so a rewritten
    character is what the following pair sees.

    NOTE(review): this method is defined a second time further down in the
    class; the later definition is the one Python binds.

    Args:
        sentence: Text to process.
        p: Threshold compared against ``random.random()`` for every pair.

    Returns:
        The processed text.
    """
    buf = list(sentence)
    for right in range(1, len(buf)):
        left = right - 1
        if p >= random.random():
            buf[left], buf[right] = self._link_before(buf[left], buf[right])
    return ''.join(buf)
|
|
|
def replace_similar(self, char):
    """Swap each jamo of a Hangul syllable for a randomly chosen alternative.

    The first jamo is looked up in ``self.ja_similar_dict``, the second in
    ``self.mo_similar_dict`` and, when present, the third in
    ``self.jong_similar_dict``. A jamo with no entry in its table is kept.
    Anything that is not a single precomposed Hangul syllable
    (U+AC00..U+D7A3) is returned unchanged.

    NOTE(review): this method is defined a second time further down in the
    class; the later definition is the one Python binds.

    Args:
        char: Character to transform.

    Returns:
        The first character of the re-joined jamo, or ``char`` itself when
        it does not qualify.
    """
    # Require exactly one character. Checking the stripped length instead
    # lets padded input such as "a " through, and ord() then raises
    # TypeError.
    if len(char) != 1:
        return char

    # Whitespace and every other non-syllable character drop out here.
    if not 0xAC00 <= ord(char) <= 0xD7A3:
        return char

    jamo = list(split_syllables(char))

    # Tables are consulted in positional order; a missing entry falls back
    # to a one-element choice so the jamo is kept as is.
    tables = (self.ja_similar_dict, self.mo_similar_dict, self.jong_similar_dict)
    for pos, table in enumerate(tables[:len(jamo)]):
        jamo[pos] = random.choice(table.get(jamo[pos], [jamo[pos]]))

    # Only the first character of the joined string is kept.
    return join_jamos(jamo)[:1]
|
|
|
def add_jong(self, char):
    """Give a Hangul syllable without a final consonant a random one.

    The final is drawn with ``random.choice`` from the keys of
    ``self.jong_link_dict``. Syllables that already split into three jamo,
    and anything that is not a single precomposed Hangul syllable
    (U+AC00..U+D7A3), are returned unchanged.

    NOTE(review): this method is defined a second time further down in the
    class; the later definition is the one Python binds.

    Args:
        char: Character to transform.

    Returns:
        The first character of the re-joined jamo, or ``char`` itself when
        it does not qualify.
    """
    # Require exactly one character. Checking the stripped length instead
    # lets padded input such as "a " through, and ord() then raises
    # TypeError.
    if len(char) != 1:
        return char

    # Whitespace and every other non-syllable character drop out here.
    if not 0xAC00 <= ord(char) <= 0xD7A3:
        return char

    jamo = list(split_syllables(char))
    if len(jamo) == 3:
        # Already has a final consonant.
        return char

    new_jong = random.choice(list(self.jong_link_dict))

    # Only the first character of the joined string is kept.
    return join_jamos(jamo[:2] + [new_jong])[:1]
|
def _link_next(self, char1, char2): |
|
if len((char1+char2).strip())!=2: |
|
return char1, char2 |
|
if not (0xAC00<= ord(char1) <=0xD7A3 and 0xAC00<= ord(char2) <=0xD7A3): |
|
return char1, char2 |
|
char1_jamo = list(split_syllables(char1)) |
|
if len(char1_jamo)!=3: |
|
return char1, char2 |
|
char2_jamo = list(split_syllables(char2)) |
|
if char2_jamo[0]!='γ
': |
|
return char1, char2 |
|
new_jong, new_cho = self.jong_link_dict[char1_jamo[-1]] |
|
new_char1 = join_jamos(char1_jamo[:2] + [new_jong])[:1] |
|
new_char2 = join_jamos([new_cho] + char2_jamo[1:])[:1] |
|
return new_char1, new_char2 |
|
|
|
def link_next(self, sentence, p):
    """Apply ``_link_next`` to adjacent character pairs of ``sentence``.

    Pairs are visited left to right. A pair is processed only when
    ``p >= random.random()``; because the buffer is updated in place, a
    rewritten right-hand character is what the following pair sees.

    NOTE(review): an earlier definition of this method exists in the same
    class; this later one overrides it, so one of the two should be removed.

    Args:
        sentence: Text to process.
        p: Threshold compared against ``random.random()`` for every pair.

    Returns:
        The processed text.
    """
    buf = list(sentence)
    for right in range(1, len(buf)):
        left = right - 1
        if p >= random.random():
            buf[left], buf[right] = self._link_next(buf[left], buf[right])
    return ''.join(buf)
|
|
|
def _link_before(self, char1, char2): |
|
if len((char1+char2).strip())!=2: |
|
return char1, char2 |
|
if not (0xAC00<= ord(char1) <=0xD7A3 and 0xAC00<= ord(char2) <=0xD7A3): |
|
return char1, char2 |
|
char1_jamo = list(split_syllables(char1)) |
|
if len(char1_jamo)!=2: |
|
return char1, char2 |
|
char2_jamo = list(split_syllables(char2)) |
|
new_char1 = join_jamos(char1_jamo[:2] + char2_jamo[:1])[:1] |
|
return new_char1, char2 |
|
|
|
def link_before(self, sentence, p):
    """Apply ``_link_before`` to adjacent character pairs of ``sentence``.

    Pairs are visited left to right. A pair is processed only when
    ``p >= random.random()``; the buffer is updated in place, so a rewritten
    character is what the following pair sees.

    NOTE(review): an earlier definition of this method exists in the same
    class; this later one overrides it, so one of the two should be removed.

    Args:
        sentence: Text to process.
        p: Threshold compared against ``random.random()`` for every pair.

    Returns:
        The processed text.
    """
    buf = list(sentence)
    for right in range(1, len(buf)):
        left = right - 1
        if p >= random.random():
            buf[left], buf[right] = self._link_before(buf[left], buf[right])
    return ''.join(buf)
|
|
|
def replace_similar(self, char):
    """Swap each jamo of a Hangul syllable for a randomly chosen alternative.

    The first jamo is looked up in ``self.ja_similar_dict``, the second in
    ``self.mo_similar_dict`` and, when present, the third in
    ``self.jong_similar_dict``. A jamo with no entry in its table is kept.
    Anything that is not a single precomposed Hangul syllable
    (U+AC00..U+D7A3) is returned unchanged.

    NOTE(review): an earlier definition of this method exists in the same
    class; this later one overrides it, so one of the two should be removed.

    Args:
        char: Character to transform.

    Returns:
        The first character of the re-joined jamo, or ``char`` itself when
        it does not qualify.
    """
    # Require exactly one character. Checking the stripped length instead
    # lets padded input such as "a " through, and ord() then raises
    # TypeError.
    if len(char) != 1:
        return char

    # Whitespace and every other non-syllable character drop out here.
    if not 0xAC00 <= ord(char) <= 0xD7A3:
        return char

    jamo = list(split_syllables(char))

    # Tables are consulted in positional order; a missing entry falls back
    # to a one-element choice so the jamo is kept as is.
    tables = (self.ja_similar_dict, self.mo_similar_dict, self.jong_similar_dict)
    for pos, table in enumerate(tables[:len(jamo)]):
        jamo[pos] = random.choice(table.get(jamo[pos], [jamo[pos]]))

    # Only the first character of the joined string is kept.
    return join_jamos(jamo)[:1]
|
|
|
def add_jong(self, char):
    """Give a Hangul syllable without a final consonant a random one.

    The final is drawn with ``random.choice`` from the keys of
    ``self.jong_link_dict``. Syllables that already split into three jamo,
    and anything that is not a single precomposed Hangul syllable
    (U+AC00..U+D7A3), are returned unchanged.

    NOTE(review): an earlier definition of this method exists in the same
    class; this later one overrides it, so one of the two should be removed.

    Args:
        char: Character to transform.

    Returns:
        The first character of the re-joined jamo, or ``char`` itself when
        it does not qualify.
    """
    # Require exactly one character. Checking the stripped length instead
    # lets padded input such as "a " through, and ord() then raises
    # TypeError.
    if len(char) != 1:
        return char

    # Whitespace and every other non-syllable character drop out here.
    if not 0xAC00 <= ord(char) <= 0xD7A3:
        return char

    jamo = list(split_syllables(char))
    if len(jamo) == 3:
        # Already has a final consonant.
        return char

    new_jong = random.choice(list(self.jong_link_dict))

    # Only the first character of the joined string is kept.
    return join_jamos(jamo[:2] + [new_jong])[:1]