File size: 3,636 Bytes
74fc30d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import os
from tqdm import tqdm
import sys

# Language codes that the script entry point pairs with 'en' to build the
# list of language-pair directories to process.
# NOTE(review): these look like ISO 639-1 codes for Indic languages - confirm.
LANGS = [
    "as",
    "bn",
    "gu",
    "hi",
    "kn",
    "ml",
    "mr",
    "or",
    "pa",
    "ta",
    "te",
    # "ur" is commented out in the original; the reason is not visible here.
    #"ur"
]


def add_token(sent, tag_infos):
    """ prefix a sentence with special tag tokens

    sent: sentence string the tokens are prepended to
    tag_infos: list of tuples (tag_type,tag), both strings

    every tag_info yields one token shaped like: __{tag_type}__{tag}__
    tokens are joined with single spaces and separated from the sentence by
    one more space (so an empty tag_infos still yields a leading space)

    """

    prefix = ' '.join(
        ''.join(('__', kind, '__', value, '__'))
        for kind, value in tag_infos
    )

    return prefix + ' ' + sent


def concat_data(data_dir, outdir, lang_pair_list,
                out_src_lang='SRC', out_trg_lang='TGT', split='train'):
    """
    Concatenate the per-language-pair files of one split into a single
    source file and a single target file, then write corpus statistics.

    data_dir: input dir, contains directories for language pairs named l1-l2;
        each is expected to hold {split}.{l1} and {split}.{l2}
    outdir: output dir (created if missing); receives {split}.{out_src_lang}
        and {split}.{out_trg_lang}
    lang_pair_list: iterable of (src_lang, trg_lang) pairs, processed in order
    out_src_lang, out_trg_lang: extensions used for the two output files
    split: file name stem of the split to concatenate (e.g. 'train')

    A pair is skipped entirely unless both of its input files exist, so the
    two output files stay line-aligned. Existing output files are removed
    first. I/O errors while copying raise OSError instead of being ignored.
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    os.makedirs(outdir, exist_ok=True)

    out_src_fname = '{}/{}.{}'.format(outdir, split, out_src_lang)
    out_trg_fname = '{}/{}.{}'.format(outdir, split, out_trg_lang)

    print()
    print(out_src_fname)
    print(out_trg_fname)

    # Start from a clean slate: the loop below only ever appends.
    for stale_fname in (out_src_fname, out_trg_fname):
        if os.path.isfile(stale_fname):
            os.unlink(stale_fname)

    for src_lang, trg_lang in tqdm(lang_pair_list):
        print('src: {}, tgt:{}'.format(src_lang, trg_lang))

        in_src_fname = '{}/{}-{}/{}.{}'.format(
            data_dir, src_lang, trg_lang, split, src_lang)
        in_trg_fname = '{}/{}-{}/{}.{}'.format(
            data_dir, src_lang, trg_lang, split, trg_lang)

        # Both sides must be present as regular files, otherwise appending
        # only one of them would misalign source and target lines.
        if not (os.path.isfile(in_src_fname) and os.path.isfile(in_trg_fname)):
            continue

        for in_fname, out_fname in ((in_src_fname, out_src_fname),
                                    (in_trg_fname, out_trg_fname)):
            print(in_fname)
            # Byte-level append without a shell: safe for paths containing
            # spaces or shell metacharacters, and portable across platforms.
            with open(in_fname, 'rb') as infile, \
                    open(out_fname, 'ab') as outfile:
                shutil.copyfileobj(infile, outfile)

    corpus_stats(data_dir, outdir, lang_pair_list, split)


def corpus_stats(data_dir, outdir, lang_pair_list, split):
    """
    Write the line count of each language pair's source file for one split.

    data_dir: input dir, contains directories for language pairs named l1-l2
    outdir: existing dir that receives {split}_lang_pairs.txt
    lang_pair_list: iterable of (src_lang, trg_lang) pairs
    split: file name stem of the split being counted

    The output has one tab-separated line per pair: src_lang, trg_lang and
    the number of lines in {data_dir}/{src}-{trg}/{split}.{src}. Pairs whose
    source file does not exist are left out.
    """

    stats_fname = '{}/{}_lang_pairs.txt'.format(outdir, split)

    with open(stats_fname, 'w', encoding='utf-8') as lpfile:
        for src_lang, trg_lang in tqdm(lang_pair_list):
            print('src: {}, tgt:{}'.format(src_lang, trg_lang))

            in_src_fname = '{}/{}-{}/{}.{}'.format(
                data_dir, src_lang, trg_lang, split, src_lang)

            if os.path.exists(in_src_fname):
                print(in_src_fname)

                # Count lines by streaming the file; nothing is kept in memory.
                with open(in_src_fname, 'r', encoding='utf-8') as infile:
                    corpus_size = sum(1 for _ in infile)

                row = '{}\t{}\t{}\n'.format(src_lang, trg_lang, corpus_size)
                lpfile.write(row)


if __name__ == '__main__':

    # Positional arguments, in order:
    #   input dir, output dir, source language, target language, split name
    in_dir = sys.argv[1]
    out_dir = sys.argv[2]
    src_lang = sys.argv[3]
    tgt_lang = sys.argv[4]  # read but not used: direction depends on src_lang only
    split = sys.argv[5]

    # English is always one side of every pair; src_lang decides which side.
    if src_lang == 'en':
        lang_pair_list = [['en', lang] for lang in LANGS]
    else:
        lang_pair_list = [[lang, 'en'] for lang in LANGS]

    concat_data(in_dir, out_dir, lang_pair_list, split=split)