gemma2-2b-kor-deobfuscation / modeling_hangul_augmentator.py
jwengr's picture
Upload folder using huggingface_hub
831df33 verified
import random
import torch
import torch.nn as nn
from jamotools import split_syllables, join_jamos
from transformers import PretrainedConfig, PreTrainedModel, AutoConfig
class HangulAugmentatorConfig(PretrainedConfig):
    """Configuration holding the switches for the Hangul augmentation passes.

    Attributes:
        p: Threshold compared against ``random.random()`` for every
            candidate position; a larger value makes each individual
            rewrite more likely.
        do_link_next: Enable the ``link_next`` pass.
        do_link_before: Enable the ``link_before`` pass.
        do_replace_similar: Enable the per-character ``replace_similar`` pass.
        do_add_jong: Enable the per-character ``add_jong`` pass.
    """

    model_type = "hangul_augmentator"

    def __init__(
        self,
        p: float = 0.5,
        do_link_next: bool = True,
        do_link_before: bool = True,
        do_replace_similar: bool = True,
        do_add_jong: bool = True,
        **kwargs
    ):
        # Let the base class consume its own keyword arguments first.
        super().__init__(**kwargs)
        self.p = p
        # One on/off flag per augmentation pass.
        self.do_link_next, self.do_link_before = do_link_next, do_link_before
        self.do_replace_similar, self.do_add_jong = do_replace_similar, do_add_jong
class HangulAugmentator(PreTrainedModel):
    """Rule-based obfuscator for Hangul text, wrapped as a ``PreTrainedModel``.

    Calling an instance with a string runs up to four random passes over it,
    each gated by a flag on ``self.config`` and by ``self.config.p``:

    1. ``link_next``       - carry a final consonant over to a following
       syllable that starts with the silent onset.
    2. ``link_before``     - copy the next syllable's onset back as a final
       consonant onto a syllable that has none.
    3. ``replace_similar`` - swap each jamo of a syllable for a look-alike.
    4. ``add_jong``        - attach a random final consonant to an open syllable.

    All lookup tables are keyed by Hangul Compatibility Jamo (U+3131-U+3163).
    NOTE(review): this assumes ``jamotools.split_syllables`` emits
    compatibility jamo in its default mode - confirm against jamotools.

    Non-Hangul characters and whitespace always pass through untouched.
    """

    config_class = HangulAugmentatorConfig

    # Inclusive code-point range of precomposed Hangul syllables.
    _SYLLABLE_FIRST = 0xAC00
    _SYLLABLE_LAST = 0xD7A3

    def __init__(self, config):
        """Build the jamo lookup tables.

        Args:
            config: A ``HangulAugmentatorConfig`` (or compatible) instance.
        """
        super().__init__(config)
        # Single-element parameter that no method in this class reads.
        # NOTE(review): presumably only here so the model owns at least one
        # tensor for save/load - confirm before removing.
        self.temp_module = torch.nn.Parameter(torch.ones(1))

        # Initial consonant -> candidate replacements. Repeated entries are
        # kept exactly as found; they weight ``random.choice``.
        self.ja_similar_dict = {
            'ㅂ': ['ㅃ', 'ㅍ'],
            'ㄱ': ['ㄲ', 'ㅋ'],
            'ㄷ': ['ㄸ', 'ㅌ'],
            'ㄲ': ['ㄲ', 'ㅋ'],
            'ㅅ': ['ㅆ'],
            'ㅈ': ['ㅉ', 'ㅊ'],
            'ㅌ': ['ㄸ', 'ㅌ', 'ㄷ'],
            'ㅋ': ['ㄲ', 'ㄱ'],
            'ㅍ': ['ㅃ', 'ㅂ'],
            'ㅃ': ['ㅍ', 'ㅂ'],
            'ㄸ': ['ㅌ', 'ㄷ'],
            'ㅊ': ['ㅉ', 'ㅉ', 'ㅈ'],
            'ㅆ': ['ㅅ'],
            'ㅉ': ['ㅉ', 'ㅈ'],
        }

        # Vowel -> candidate replacements.
        # NOTE(review): in the corrupted source the pairs (ㅑ, ㅡ) and (ㅒ, ㅢ)
        # were indistinguishable; the keys 'ㅡ', 'ㅢ', 'ㅑ', 'ㅒ' and the
        # values 'ㅑ'/'ㅒ' below were chosen from context - verify against
        # the upstream file.
        self.mo_similar_dict = {
            'ㅕ': ['ㅓ'],
            'ㅏ': ['ㅑ'],
            'ㅐ': ['ㅔ', 'ㅒ'],
            'ㅗ': ['ㅛ'],
            'ㅙ': ['ㅚ', 'ㅞ'],
            'ㅡ': ['ㅜ'],
            'ㅣ': ['ㅟ'],
            'ㅜ': ['ㅠ'],
            'ㅓ': ['ㅕ'],
            'ㅔ': ['ㅖ', 'ㅞ'],
            'ㅛ': ['ㅗ'],
            'ㅚ': ['ㅙ', 'ㅞ'],
            'ㅠ': ['ㅜ'],
            'ㅝ': ['ㅓ'],
            'ㅖ': ['ㅒ'],
            'ㅢ': ['ㅟ'],
            'ㅑ': ['ㅏ'],
            'ㅞ': ['ㅙ', 'ㅚ', 'ㅔ'],
            'ㅒ': ['ㅖ'],
        }

        # Final consonant -> [final kept on this syllable, onset given to the
        # next syllable]. Simple finals are duplicated; clusters are split.
        # Key order is significant: ``add_jong`` samples from it.
        self.jong_link_dict = {
            'ㅂ': ['ㅂ', 'ㅂ'],
            'ㅍ': ['ㅍ', 'ㅍ'],
            'ㄱ': ['ㄱ', 'ㄱ'],
            'ㅊ': ['ㅊ', 'ㅊ'],
            'ㅎ': ['ㅎ', 'ㅎ'],
            'ㅇ': ['ㅇ', 'ㅇ'],
            'ㅌ': ['ㅌ', 'ㅌ'],
            'ㄽ': ['ㄹ', 'ㅅ'],
            'ㄿ': ['ㄹ', 'ㅍ'],
            'ㄵ': ['ㄴ', 'ㅈ'],
            'ㄲ': ['ㄲ', 'ㄲ'],
            'ㅋ': ['ㅋ', 'ㅋ'],
            'ㄴ': ['ㄴ', 'ㄴ'],
            'ㄷ': ['ㄷ', 'ㄷ'],
            'ㅀ': ['ㄹ', 'ㅎ'],
            'ㅈ': ['ㅈ', 'ㅈ'],
            'ㄺ': ['ㄹ', 'ㄱ'],
            'ㄼ': ['ㄹ', 'ㅂ'],
            'ㅅ': ['ㅅ', 'ㅅ'],
            'ㄶ': ['ㄴ', 'ㅎ'],
            'ㄹ': ['ㄹ', 'ㄹ'],
            'ㅁ': ['ㅁ', 'ㅁ'],
            'ㄳ': ['ㄱ', 'ㅅ'],
            'ㅆ': ['ㅆ', 'ㅆ'],
            'ㄾ': ['ㄹ', 'ㅌ'],
            'ㅄ': ['ㅂ', 'ㅅ'],
            'ㄻ': ['ㄹ', 'ㅁ'],
        }

        # Final consonant -> candidate replacement finals.
        self.jong_similar_dict = {
            'ㄹ': [
                'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ',
                'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅌ', 'ㅍ', 'ㅎ',
            ],
            'ㄲ': [
                'ㄱ', 'ㄳ', 'ㄴ', 'ㄹ', 'ㄻ', 'ㅁ', 'ㅂ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅋ', 'ㅌ', 'ㅎ',
            ],
            'ㅅ': [
                'ㄱ', 'ㄲ', 'ㄴ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄽ', 'ㅀ',
                'ㅁ', 'ㅂ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅌ', 'ㅍ', 'ㅎ',
            ],
            'ㅁ': [
                'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ',
                'ㄾ', 'ㄿ', 'ㅀ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ',
            ],
            'ㄴ': [
                'ㄱ', 'ㄲ', 'ㄳ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ',
                'ㄾ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ',
            ],
            'ㅇ': ['ㅎ'],
            'ㅆ': [
                'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ',
                'ㄼ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ',
            ],
            'ㅄ': ['ㅂ', 'ㅍ'],
            'ㅂ': [
                'ㄱ', 'ㄴ', 'ㄵ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㅀ',
                'ㅁ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅌ', 'ㅍ', 'ㅎ',
            ],
            'ㄶ': ['ㄴ', 'ㄵ'],
            'ㅍ': [
                'ㄱ', 'ㄲ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ',
                'ㄼ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅌ', 'ㅎ',
            ],
            'ㄱ': [
                'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ',
                'ㄼ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ',
            ],
            'ㅌ': [
                'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㅁ', 'ㅂ',
                'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅍ', 'ㅎ',
            ],
            'ㄼ': ['ㄹ', 'ㄺ', 'ㄻ', 'ㅀ'],
            'ㄷ': [
                'ㄱ', 'ㄲ', 'ㄴ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㅀ',
                'ㅁ', 'ㅂ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅌ',
            ],
            'ㅈ': [
                'ㄱ', 'ㄴ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㅁ',
                'ㅂ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅊ', 'ㅌ', 'ㅎ',
            ],
            'ㅎ': ['ㄱ', 'ㄳ', 'ㄴ', 'ㄹ', 'ㄻ', 'ㅁ', 'ㅂ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅍ'],
            'ㅊ': ['ㄷ', 'ㅁ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅌ'],
            'ㄵ': ['ㄴ', 'ㄶ'],
            'ㄻ': ['ㄹ', 'ㄺ', 'ㄽ', 'ㅀ'],
            'ㅀ': ['ㄹ', 'ㄺ', 'ㄻ'],
            'ㄺ': ['ㄹ', 'ㄻ', 'ㄼ', 'ㅀ'],
            'ㅋ': ['ㄱ', 'ㄳ', 'ㅄ'],
            'ㄳ': ['ㄱ'],
            'ㄾ': ['ㄹ'],
        }

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @classmethod
    def _is_syllable(cls, char):
        """Return True when *char* is exactly one precomposed Hangul syllable."""
        return len(char) == 1 and cls._SYLLABLE_FIRST <= ord(char) <= cls._SYLLABLE_LAST

    @staticmethod
    def _sweep_pairs(sentence, p, pair_fn):
        """Apply *pair_fn* to adjacent character pairs, each with threshold *p*.

        The sweep runs left to right over a mutable list, so a pair sees the
        result of any rewrite made to its left-hand character by the
        previous step. That dependency is why this is an index loop.
        """
        chars = list(sentence)
        for i in range(len(chars) - 1):
            if p >= random.random():
                chars[i], chars[i + 1] = pair_fn(chars[i], chars[i + 1])
        return ''.join(chars)

    def _apply_per_char(self, sentence, transform):
        """Run *transform* on each character whose draw falls under ``config.p``."""
        p = self.config.p
        # The draw is evaluated before ``transform`` so the RNG is consumed
        # in the same order as an explicit if/else loop would consume it.
        return ''.join(
            transform(char) if p >= random.random() else char
            for char in sentence
        )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def __call__(self, sentence):
        """Return an obfuscated copy of *sentence*.

        The enabled passes run in a fixed order: link_next, link_before,
        replace_similar, add_jong. Note this overrides ``__call__`` itself,
        so the call does not go through a ``forward`` method.

        Args:
            sentence: Input text.

        Returns:
            The augmented text.
        """
        cfg = self.config
        if cfg.do_link_next:
            sentence = self.link_next(sentence, cfg.p)
        if cfg.do_link_before:
            sentence = self.link_before(sentence, cfg.p)
        if cfg.do_replace_similar:
            sentence = self._apply_per_char(sentence, self.replace_similar)
        if cfg.do_add_jong:
            sentence = self._apply_per_char(sentence, self.add_jong)
        return sentence

    def _link_next(self, char1, char2):
        """Carry the final consonant of *char1* into the onset of *char2*.

        Only fires when both are Hangul syllables, *char1* has a final
        consonant and *char2* begins with the silent onset 'ㅇ'. The new
        final and new onset come from ``jong_link_dict``.

        Returns:
            ``(new_char1, new_char2)``, or the inputs unchanged when the
            pair does not qualify.
        """
        if not (self._is_syllable(char1) and self._is_syllable(char2)):
            return char1, char2
        char1_jamo = list(split_syllables(char1))
        if len(char1_jamo) != 3:
            # No final consonant to carry over.
            return char1, char2
        char2_jamo = list(split_syllables(char2))
        if char2_jamo[0] != 'ㅇ':
            return char1, char2
        new_jong, new_cho = self.jong_link_dict[char1_jamo[-1]]
        # [:1] keeps only the composed syllable, dropping any stray jamo.
        new_char1 = join_jamos(char1_jamo[:2] + [new_jong])[:1]
        new_char2 = join_jamos([new_cho] + char2_jamo[1:])[:1]
        return new_char1, new_char2

    def link_next(self, sentence, p):
        """Apply ``_link_next`` along *sentence*; each pair fires when ``p >= random.random()``."""
        return self._sweep_pairs(sentence, p, self._link_next)

    def _link_before(self, char1, char2):
        """Copy the onset of *char2* back onto *char1* as a final consonant.

        Only fires when both are Hangul syllables and *char1* has no final
        consonant. *char2* is never modified.

        Returns:
            ``(new_char1, char2)``, or the inputs unchanged when the pair
            does not qualify.
        """
        if not (self._is_syllable(char1) and self._is_syllable(char2)):
            return char1, char2
        char1_jamo = list(split_syllables(char1))
        if len(char1_jamo) != 2:
            # Already has a final consonant.
            return char1, char2
        char2_jamo = list(split_syllables(char2))
        # NOTE(review): if the borrowed onset is not a legal final, join_jamos
        # presumably leaves it uncomposed and [:1] keeps char1 as it was - confirm.
        new_char1 = join_jamos(char1_jamo[:2] + char2_jamo[:1])[:1]
        return new_char1, char2

    def link_before(self, sentence, p):
        """Apply ``_link_before`` along *sentence*; each pair fires when ``p >= random.random()``."""
        return self._sweep_pairs(sentence, p, self._link_before)

    def replace_similar(self, char):
        """Replace each jamo of a Hangul syllable with a random look-alike.

        A jamo that has an entry in its table is always replaced by one of
        the listed candidates; a jamo with no entry is kept.

        Returns:
            The recomposed syllable, or *char* unchanged if it is not a
            single Hangul syllable.
        """
        if not self._is_syllable(char):
            return char
        jamo = list(split_syllables(char))
        # A one-element fallback list keeps the jamo while still making the
        # same number of RNG draws as a table hit.
        jamo[0] = random.choice(self.ja_similar_dict.get(jamo[0], [jamo[0]]))
        jamo[1] = random.choice(self.mo_similar_dict.get(jamo[1], [jamo[1]]))
        if len(jamo) == 3:
            jamo[2] = random.choice(self.jong_similar_dict.get(jamo[2], [jamo[2]]))
        return join_jamos(jamo)[:1]

    def add_jong(self, char):
        """Attach a random final consonant to a syllable that has none.

        The consonant is drawn uniformly from the keys of ``jong_link_dict``.

        Returns:
            The new syllable, or *char* unchanged if it is not a single
            Hangul syllable or already has a final consonant.
        """
        if not self._is_syllable(char):
            return char
        jamo = list(split_syllables(char))
        if len(jamo) == 3:
            return char
        new_jong = random.choice(list(self.jong_link_dict))
        return join_jamos(jamo[:2] + [new_jong])[:1]