|
import argparse |
|
import yaml |
|
import os |
|
import pickle as pkl |
|
|
|
from tqdm import tqdm |
|
from pyarabic.araby import tokenize, strip_tashkeel, strip_tatweel |
|
|
|
def export(path, text):
    """Write the strings in *text* to *path* as UTF-8, separated by newlines.

    The file is created or truncated. No newline is written after the last
    item, so an empty *text* yields an empty file.
    """
    with open(path, mode='w', encoding='utf-8') as sink:
        payload = '\n'.join(text)
        sink.write(payload)
|
|
|
def segment(lines, stride, window_sz, min_window_sz):
    """Cut every line into token windows and build a per-character map.

    Each line has its tatweel removed, is stripped of surrounding whitespace
    and tokenized; lines that yield no tokens are skipped. Windows of
    ``window_sz`` tokens are then taken every ``stride`` tokens (the step is
    capped at ``window_sz``) and each window is emitted as one space-joined
    string.

    Args:
        lines: Sized iterable of strings, one sentence per item.
        stride: Number of tokens to advance between windows. ``-1`` keeps
            only the first window of every line.
        window_sz: Number of tokens per window. ``-1`` uses the whole line
            as a single window.
        min_window_sz: A window after the first one of a line that has fewer
            tokens than this ends the line. The first window is always kept.

    Returns:
        A ``(segments, mapping)`` tuple. ``segments`` is the list of window
        strings in emission order. ``mapping`` holds one
        ``(sent_idx, seg_idx, token_idx, char_pos)`` tuple per character of
        every tashkeel-stripped token of every window: ``sent_idx`` is the
        index of the line in ``lines``, ``seg_idx`` the index of the window
        in ``segments``, ``token_idx`` the token's index inside the window
        and ``char_pos`` the character position counted over the
        tashkeel-stripped tokens of the line joined by single spaces.

    Raises:
        ValueError: If more than one window per line is requested but the
            effective step is not positive, which would never advance.
    """
    whole_line = window_sz == -1
    # A whole-line window cannot be slid; previously it stepped the cursor
    # by -1 and looped forever unless stride was also -1.
    single_window = whole_line or stride == -1

    step = None
    if not single_window:
        step = window_sz if stride >= window_sz else stride
        if step <= 0:
            raise ValueError(
                "segment() needs a positive step between windows, got "
                f"stride={stride!r}, window_sz={window_sz!r}"
            )

    segments, mapping = [], []
    real_seg_idx = 0  # index into `segments`, shared across all lines

    for sent_idx, line in tqdm(enumerate(lines), total=len(lines)):
        tokens = tokenize(strip_tatweel(line).strip())
        if not tokens:
            continue
        if tokens[-1] == '\n':
            tokens = tokens[:-1]

        seg_idx, idx = 0, 0
        while idx < len(tokens):
            window = tokens if whole_line else tokens[idx:idx + window_sz]
            # Only trailing windows are subject to the minimum size.
            if seg_idx != 0 and len(window) < min_window_sz:
                break

            seg_text = ' '.join(window)
            segments.append(seg_text)

            # Length of the stripped text that precedes this window, plus
            # the separating space for every window but the first.
            char_offset = len(strip_tashkeel(' '.join(tokens[:idx])))
            if seg_idx > 0:
                char_offset += 1

            pos = char_offset
            for st_idx, st in enumerate(tokenize(strip_tashkeel(seg_text))):
                mapping.extend(
                    (sent_idx, real_seg_idx, st_idx, pos + k)
                    for k in range(len(st))
                )
                pos += len(st) + 1  # +1 skips the space after the token

            real_seg_idx += 1
            seg_idx += 1

            if single_window:
                break
            idx += step

    return segments, mapping
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read the YAML run configuration, segment
    # every configured file and write the segments (and, optionally, the
    # character map) next to the input under the base data directory.
    parser = argparse.ArgumentParser(description='Sentence Breaker')
    parser.add_argument('-c', '--config', type=str,
                        default="config.yaml", help='Run Configs')
    parser.add_argument('-d', '--data_dir', type=str,
                        default=None, help='Override for data path')
    args = parser.parse_args()

    with open(args.config, 'r', encoding="utf-8") as file:
        # NOTE(review): FullLoader accepts more than plain data; only point
        # this at trusted config files, or switch to yaml.safe_load if the
        # config never needs Python-specific tags.
        config = yaml.load(file, Loader=yaml.FullLoader)

    # The command-line override wins over the configured base path.
    BASE_PATH = args.data_dir or config["paths"].get("base")
    if BASE_PATH is None:
        # Fail with a readable message instead of a TypeError from
        # os.path.join further down.
        parser.error("no data directory: pass --data_dir or set "
                     "paths.base in the config")

    seg_cfg = config["segment"]
    stride = seg_cfg["stride"]
    window = seg_cfg["window"]
    min_window = seg_cfg["min-window"]
    export_map = seg_cfg["export-map"]

    for fpath in tqdm(seg_cfg["files"]):
        # splitext drops the real extension whatever its length; the former
        # fpath[:-4] was only right for four-character extensions.
        stem = os.path.splitext(fpath)[0]
        FILE_PATH = os.path.join(BASE_PATH, fpath)
        SAVE_PATH = os.path.join(BASE_PATH, f"{stem}-{stride}-{window}.txt")
        MAP_PATH = os.path.join(BASE_PATH, f"{stem}-{stride}-{window}.map")

        with open(FILE_PATH, 'r', encoding="utf-8") as fin:
            lines = fin.readlines()

        segments, mapping = segment(lines, stride, window, min_window)

        # One segment per line, no trailing newline.
        export(SAVE_PATH, segments)

        if not export_map:
            continue

        # One "sentence, segment, word, char" record per mapped character.
        with open(MAP_PATH, 'w', encoding="utf-8") as fout:
            fout.writelines(
                f"{sent_idx}, {seg_idx}, {word_idx}, {char_idx}\n"
                for sent_idx, seg_idx, word_idx, char_idx in mapping
            )
|
|