# File size: 1,503 Bytes
# 74fc30d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from tqdm import tqdm
import sys


def _count_lines(path):
    """Return the number of lines in the UTF-8 text file at *path*."""
    # Use a context manager so the handle is closed as soon as counting is
    # done instead of being left open until garbage collection.
    with open(path, "r", encoding="utf-8") as f:
        return sum(1 for _ in f)


def remove_large_sentences(src_path, tgt_path, max_len=200):
    """Filter out line pairs in which either side has more than *max_len* tokens.

    The two files are read in lockstep as UTF-8 text; line ``i`` of
    ``src_path`` is paired with line ``i`` of ``tgt_path``.

    Args:
        src_path: Path of the source-side text file.
        tgt_path: Path of the target-side text file.
        max_len: Maximum number of tokens allowed on either side of a pair.
            Pairs where either side exceeds this are dropped. Defaults to 200.

    Returns:
        A tuple ``(count, new_src_lines, new_tgt_lines)`` where ``count`` is
        the number of dropped pairs and the two lists hold the kept lines,
        unmodified (each still carries whatever line ending it had).

    Raises:
        AssertionError: If the two files do not have the same number of lines.
    """
    src_num_lines = _count_lines(src_path)
    tgt_num_lines = _count_lines(tgt_path)
    # Raised explicitly (rather than via ``assert``) so the check still runs
    # under ``python -O``; otherwise ``zip`` below would silently truncate to
    # the shorter file.
    if src_num_lines != tgt_num_lines:
        raise AssertionError(
            f"line count mismatch: {src_path} has {src_num_lines} lines, "
            f"{tgt_path} has {tgt_num_lines} lines"
        )

    count = 0
    new_src_lines = []
    new_tgt_lines = []
    with open(src_path, encoding="utf-8") as f1, open(tgt_path, encoding="utf-8") as f2:
        for src_line, tgt_line in tqdm(zip(f1, f2), total=src_num_lines):
            # Tokens are split on a single space character, so consecutive
            # spaces produce empty tokens that count toward the length, and
            # an empty line counts as one (empty) token.
            src_tokens = src_line.strip().split(" ")
            tgt_tokens = tgt_line.strip().split(" ")
            if len(src_tokens) > max_len or len(tgt_tokens) > max_len:
                count += 1
                continue
            new_src_lines.append(src_line)
            new_tgt_lines.append(tgt_line)
    return count, new_src_lines, new_tgt_lines


def create_txt(outFile, lines, add_newline=False):
    """Write *lines* to the file *outFile* as UTF-8 text, replacing any existing file.

    Args:
        outFile: Path of the file to create or overwrite.
        lines: Iterable of strings to write, in order.
        add_newline: If true, append ``"\\n"`` after every line; otherwise
            each string is written exactly as given.
    """
    # The context manager guarantees the file is flushed and closed even if
    # a write fails partway through.
    with open(outFile, "w", encoding="utf-8") as outfile:
        if add_newline:
            outfile.writelines(line + "\n" for line in lines)
        else:
            outfile.writelines(lines)


if __name__ == "__main__":
    # Command line: SRC_PATH TGT_PATH NEW_SRC_PATH NEW_TGT_PATH
    # Fail with a readable usage message instead of an IndexError traceback
    # when arguments are missing. Extra arguments are ignored.
    if len(sys.argv) < 5:
        sys.exit(
            f"usage: {sys.argv[0]} SRC_PATH TGT_PATH NEW_SRC_PATH NEW_TGT_PATH"
        )

    src_path, tgt_path, new_src_path, new_tgt_path = sys.argv[1:5]

    # Filter both sides together, then write the kept lines to the new files.
    count, new_src_lines, new_tgt_lines = remove_large_sentences(src_path, tgt_path)
    print(f'{count} lines removed due to seq_len > 200')
    create_txt(new_src_path, new_src_lines)
    create_txt(new_tgt_path, new_tgt_lines)