import json
import re

try:
    import jsonlines
except ImportError:  # third-party; only the JSON Lines helpers below need it
    jsonlines = None

# Bracketed asides stripped by normalize_text(). The full-width pair is written
# with escapes so that Unicode normalisation of this file cannot silently turn
# it into the ASCII pair. Both patterns are greedy: everything from the first
# opening bracket to the last closing bracket is removed ('.' does not match
# newlines, so a match never spans lines).
_FULLWIDTH_BRACKETS_RE = re.compile("\uff08.*\uff09")
_ASCII_BRACKETS_RE = re.compile(r"\(.*\)")


def dump_json(file, path):
    """Write a JSON-serializable object to ``path`` as pretty-printed JSON.

    Args:
        file: The object to serialize (the data itself, not a file handle).
        path: Destination path. It is opened for writing as UTF-8, so an
            existing file is overwritten.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) with a
    4-space indent. A confirmation message is printed after the write.
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(file, f, indent=4, ensure_ascii=False)
    print(f"Saved json to path: {path!s}")


def load_json(path):
    """Read and return the JSON document stored at ``path``.

    The file is opened in binary mode and handed to ``json.load``. A
    confirmation message is printed after parsing.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The decoded Python object.
    """
    with open(path, "rb") as f:
        data = json.load(f)
    print(f"Loaded json from path: {path!s}")
    return data


def lead_k_sentences(text, k=50):
    """Select the first k sentences from a Japanese document.

    Sentences are delimited by the ideographic full stop. The text is split
    on it, the first ``k`` pieces are kept, empty pieces among those are
    dropped, and the rest are re-joined.

    Args:
        text: The document text.
        k: Maximum number of split pieces to consider. Empty pieces count
            toward this limit before they are dropped.

    Returns:
        ``text`` unchanged when it contains no delimiter. Otherwise the
        selected sentences joined by the delimiter, always terminated by one
        delimiter (even if the last kept piece was not terminated in the
        input, or no piece was kept at all).
    """
    DELIMITER = '。'
    if DELIMITER not in text:
        return text
    segments = [seg for seg in text.split(DELIMITER)[:k] if seg]
    return DELIMITER.join(segments) + DELIMITER


def _require_jsonlines():
    """Return the ``jsonlines`` module, or raise ImportError if it is not installed."""
    if jsonlines is None:
        raise ImportError(
            "The 'jsonlines' package is required to read or write JSON Lines files"
        )
    return jsonlines


def read_jsonlines(path):
    """Read every object from the JSON Lines file at ``path``.

    Args:
        path: Path of the JSON Lines file.

    Returns:
        A list with one decoded object per record yielded by the reader.

    Raises:
        ImportError: If the ``jsonlines`` package is not installed.
    """
    with _require_jsonlines().open(path) as reader:
        return list(reader)


def write_jsonlines(file, path):
    """Write an iterable of objects to ``path`` in JSON Lines format.

    Args:
        file: Iterable of objects to write (the data, not a file handle).
        path: Destination path, opened in ``'w'`` mode.

    Raises:
        ImportError: If the ``jsonlines`` package is not installed.
    """
    with _require_jsonlines().open(path, 'w') as writer:
        writer.write_all(file)


def normalize_text(s):
    """Normalize a label: drop bracketed asides, trim, underscore, upper-case.

    Steps, in order:
      1. Convert ``s`` to ``str``.
      2. Remove text enclosed in full-width (Japanese) brackets.
      3. Remove text enclosed in ASCII brackets.
      4. Strip surrounding whitespace.
      5. Replace each remaining ASCII space with ``_``.
      6. Upper-case the result.

    Bracket removal is greedy: everything from the first opening bracket to
    the last closing bracket of the same kind is removed.

    Args:
        s: Any value; it is stringified first.

    Returns:
        The normalized string.
    """
    s = str(s)
    s = _FULLWIDTH_BRACKETS_RE.sub('', s)  # Japanese (full-width) brackets
    s = _ASCII_BRACKETS_RE.sub('', s)  # English (ASCII) brackets
    return s.strip().replace(' ', '_').upper()