|
|
|
import re |
|
import sys |
|
|
|
import pyopenjtalk |
|
|
|
from text import symbols |
|
|
|
|
|
# Character class shared by the two patterns below so they always stay exact
# complements of each other. It covers ASCII letters, digits (\d), the
# iteration mark U+3005, hiragana and katakana (U+3040-U+30FF), CJK unified
# ideographs (U+4E00-U+9FFF), full-width digits 1-9, full-width Latin letters
# and half-width katakana (U+FF66-U+FF9D).
_japanese_character_class = (
    r"A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff"
    r"\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d"
)

# Matches a single character that counts as Japanese text (not punctuation).
_japanese_characters = re.compile("[" + _japanese_character_class + "]")

# Matches a single character outside the class above, i.e. a mark/punctuation.
_japanese_marks = re.compile("[^" + _japanese_character_class + "]")
|
|
|
|
|
# (compiled pattern, replacement) pairs used by symbols_to_japanese to spell
# out symbols as Japanese words before phonemization.
_symbols_to_japanese = [
    (re.compile(pattern), reading) for pattern, reading in [("%", "パーセント")]
]
|
|
|
|
|
|
|
# (compiled pattern, replacement) pairs that rewrite the placeholder "Q"
# (the sokuon, going by the variable name) depending on the consonant that
# follows it; any run of ↑/↓ marks between the two is kept via group 1.
# Not referenced elsewhere in the visible part of this file.
_real_sokuon = [
    (re.compile(pattern), replacement)
    for pattern, replacement in [
        (r"Q([↑↓]*[kg])", r"k#\1"),
        (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
        (r"Q([↑↓]*[sʃ])", r"s\1"),
        (r"Q([↑↓]*[pb])", r"p#\1"),
    ]
]
|
|
|
|
|
# (compiled pattern, replacement) pairs that rewrite the placeholder "N"
# (the hatsuon, going by the variable name) into m / n^ / n / ŋ depending on
# the consonant that follows it; any run of ↑/↓ marks between the two is kept
# via group 1. Not referenced elsewhere in the visible part of this file.
_real_hatsuon = [
    (re.compile(pattern), replacement)
    for pattern, replacement in [
        (r"N([↑↓]*[pbm])", r"m\1"),
        (r"N([↑↓]*[ʧʥj])", r"n^\1"),
        (r"N([↑↓]*[tdn])", r"n\1"),
        (r"N([↑↓]*[kg])", r"ŋ\1"),
    ]
]
|
|
|
|
|
# Punctuation normalisation table for post_replace_ph, built once at import
# time instead of on every call.
# NOTE(review): some entries map a character to itself (",", "!", "?");
# presumably the keys were full-width forms originally — confirm.
_PUNCTUATION_REPLACEMENTS = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
}


def post_replace_ph(ph):
    """Map one token onto the symbol inventory.

    The token is first normalised through the punctuation table; the result
    is returned if it is a member of ``symbols`` and replaced by ``"UNK"``
    otherwise.

    Args:
        ph: A single phoneme or punctuation token.

    Returns:
        The (possibly normalised) token, or ``"UNK"`` when it is not in
        ``symbols``.
    """
    ph = _PUNCTUATION_REPLACEMENTS.get(ph, ph)
    return ph if ph in symbols else "UNK"
|
|
|
|
|
def symbols_to_japanese(text):
    """Spell out symbols in *text* as Japanese words.

    Applies every (pattern, replacement) pair in ``_symbols_to_japanese`` in
    order.

    Args:
        text: Input string.

    Returns:
        The string with every match of each pattern replaced.
    """
    for pattern, reading in _symbols_to_japanese:
        text = pattern.sub(reading, text)
    return text
|
|
|
|
|
def preprocess_jap(text):
    """Convert Japanese text into a flat list of phoneme and punctuation tokens.

    Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html

    Symbols are first spelled out with ``symbols_to_japanese``. The text is
    then split at every character matched by ``_japanese_marks``. Each segment
    that starts with a ``_japanese_characters`` match is passed to
    ``pyopenjtalk.g2p`` and its space-separated output is appended; the mark
    that followed the segment is appended after it.

    Args:
        text: Input string.

    Returns:
        A list of string tokens in reading order. Space marks are dropped
        rather than emitted as empty tokens.
    """
    text = symbols_to_japanese(text)
    # split() and findall() use the same pattern, so segments[i] is followed
    # by marks[i] and there is one more segment than there are marks.
    segments = _japanese_marks.split(text)
    marks = _japanese_marks.findall(text)

    tokens = []
    for i, segment in enumerate(segments):
        if _japanese_characters.match(segment):
            tokens += pyopenjtalk.g2p(segment).split(" ")

        if i < len(marks):
            mark = marks[i].replace(" ", "")
            # A plain space becomes "" here; appending it would produce an
            # empty token downstream, so skip it.
            if mark:
                tokens.append(mark)
    return tokens
|
|
|
|
|
def text_normalize(text):
    """Return *text* unchanged.

    Placeholder hook: no Japanese text normalization is performed here.

    Args:
        text: Input string.

    Returns:
        The same object that was passed in.
    """
    return text
|
|
|
|
|
def g2p(norm_text):
    """Convert text into a list of symbols.

    Runs ``preprocess_jap`` on the input and passes every resulting token
    through ``post_replace_ph``.

    Args:
        norm_text: Input string.

    Returns:
        A list with one entry per token produced by ``preprocess_jap``, each
        being whatever ``post_replace_ph`` returns for it.
    """
    tokens = preprocess_jap(norm_text)
    return [post_replace_ph(token) for token in tokens]
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: print the phoneme list for the second ":"-separated
    # field of every line in a local transcript file.
    # The context manager closes the file; UTF-8 is requested explicitly
    # (as the file name indicates) instead of the platform default.
    with open("../../../Downloads/transcript_utf8.txt", encoding="utf-8") as transcript:
        for line in transcript:
            fields = line.split(":")
            if len(fields) < 2:
                # No ":" on this line (e.g. a blank line): nothing to convert.
                continue
            print(g2p(fields[1]))
|
|