Spaces:
Running
Running
## vocab.txt
```
るのは | |
よね | |
写真,寫真,冩真,写眞,寫眞,冩眞 | |
マイ | |
そん | |
女性,𠨰性,⼥性,女𧢱,𠨰𧢱,⼥𧢱 | |
内容,內容,内㣑,内㝐,内彮,内𠕺,內㣑,內㝐,內彮,內𠕺 | |
```
为什么词表里同一个词条会有多种不同的写法(例如 写真 / 寫真 / 冩真,女性 / ⼥性,内容 / 內容)??
## 文本归一化
以下的文本归一化(normalization)会把 URL、邮箱、电话、日期、价格分别替换成 `<URL>`、`<EMAIL>`、`<TEL>`、`<DATE>`、`<PRICE>` 占位符,原文信息被丢弃、无法还原,因此在生成任务中并不好。
```python
self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)") | |
self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\-_0-9A-Za-z]+(\.[A-Za-z]+)*") | |
self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}") | |
self.content_repatter4 = re.compile( | |
r"([12]\d{3}[/\-年])*(0?[1-9]|1[0-2])[/\-月]((0?[1-9]|[12][0-9]|3[01])日?)*(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*" | |
) | |
self.content_repatter5 = re.compile( | |
r"(明治|大正|昭和|平成|令和|㍾|㍽|㍼|㍻|\u32ff)\d{1,2}年(0?[1-9]|1[0-2])月(0?[1-9]|[12][0-9]|3[01])日(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*" | |
) | |
self.content_repatter6 = re.compile( | |
r"((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*億)*((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*万)*((0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*千)*(0|[1-9]\d*|[1-9]\d{0,2}(,\d{3})+)*(千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+(\(税込\)|\(税抜\)|\+tax)*" | |
) | |
def clean_text(self, content): | |
content = self.content_repatter1.sub("<URL>", content) | |
content = self.content_repatter2.sub("<EMAIL>", content) | |
content = self.content_repatter3.sub("<TEL>", content) | |
content = self.content_repatter4.sub("<DATE>", content) | |
content = self.content_repatter5.sub("<DATE>", content) | |
content = self.content_repatter6.sub("<PRICE>", content) | |
content = content.translate(self.content_trans1) | |
while "<BLOCK><BLOCK>" in content: | |
content = content.replace("<BLOCK><BLOCK>", "<BLOCK>") | |
return content | |
def tokenize(self, text, clean=False): | |
text = text.replace(" ", "<SP>") | |
text = text.replace(" ", "<SP>") | |
text = text.replace("\r\n", "<BR>") | |
text = text.replace("\n", "<BR>") | |
text = text.replace("\r", "<BR>") | |
text = text.replace("\t", "<TAB>") | |
text = text.replace("—", "ー") | |
text = text.replace("−", "ー") | |
for k, v in self.emoji["emoji"].items(): | |
if k in text: | |
text = text.replace(k, v) | |
if clean: | |
text = self.clean_text(text) | |
```