File size: 4,624 Bytes
8b0ab3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import argparse
import json
import os
import random
import re
import string
from glob import glob

random.seed(42)

def replace_extra_chars(line):
    """Return *line* with every parenthesis removed and outer whitespace trimmed.

    Both "(" and ")" are deleted wherever they occur in the line, after
    which leading and trailing whitespace (including the newline) is
    stripped.
    """
    drop_parens = str.maketrans("", "", "()")
    without_parens = line.translate(drop_parens)
    return without_parens.strip()


def write_txt(content, filename):
    """Write the string *content* to *filename* as UTF-8 text.

    The file is created if missing and truncated if it already exists.
    """
    with open(filename, mode="w+", encoding="utf-8") as handle:
        handle.write(content)


def save_train_test_valid_split(annotations_txt, num_samples_valid, num_samples_test):
    """Split an annotations file into train/valid/test files next to it.

    Every line of *annotations_txt* is read and stripped. A random sample of
    ``num_samples_valid + num_samples_test`` line indices is drawn with the
    module-level ``random`` generator; the first ``num_samples_valid`` sampled
    indices form the validation set, the rest form the test set, and all
    remaining lines form the training set. Lines keep their original file
    order inside each split.

    The splits are written as ``train.txt``, ``valid.txt`` and ``test.txt``
    into the directory that contains *annotations_txt*.

    Args:
        annotations_txt: Path of the annotations file to split.
        num_samples_valid: Number of lines to put in the validation split.
        num_samples_test: Number of lines to put in the test split.

    Raises:
        ValueError: If more held-out samples are requested than there are
            lines in *annotations_txt*.
    """
    with open(annotations_txt, encoding="utf-8") as f:
        all_lines = [line.strip() for line in f]

    num_held_out = num_samples_valid + num_samples_test
    if num_held_out > len(all_lines):
        raise ValueError(
            f"Requested {num_held_out} valid+test samples but "
            f"{annotations_txt} only has {len(all_lines)} lines"
        )

    test_val_indices = random.sample(range(len(all_lines)), num_held_out)
    # Sets give O(1) membership checks; the list versions made the split
    # quadratic in the number of lines.
    valid_ix = set(test_val_indices[:num_samples_valid])
    test_ix = set(test_val_indices[num_samples_valid:])

    train, valid, test = [], [], []
    for i, line in enumerate(all_lines):
        if i in valid_ix:
            valid.append(line)
        elif i in test_ix:
            test.append(line)
        else:
            train.append(line)

    print(f"Num samples in train: {len(train)}")
    print(f"Num samples in valid: {len(valid)}")
    print(f"Num samples in test: {len(test)}")

    # os.path.dirname handles the platform separator and paths such as
    # "/annotations.txt", which a manual split on "/" resolved to the
    # current directory.
    out_dir_path = os.path.dirname(annotations_txt)
    for split_name, split_lines in (
        ("train.txt", train),
        ("valid.txt", valid),
        ("test.txt", test),
    ):
        split_path = os.path.join(out_dir_path, split_name)
        with open(split_path, "w+", encoding="utf-8") as f:
            f.writelines(f"{line}\n" for line in split_lines)
    print(f"train, test and valid txts saved in {out_dir_path}")


def save_txts_from_txt_done_data(
    text_path,
    wav_path_for_annotations_txt,
    out_path_for_txts,
    num_samples_valid,
    num_samples_test,
):
    """Build annotation, vocabulary, config and split files from a transcript file.

    Each non-blank line of *text_path* is cleaned with ``replace_extra_chars``
    and split on the double-quote character: the part before the first quote
    is taken as the utterance file name and the part between the first and
    second quote as its text (presumably a Festival-style ``txt.done.data``
    file -- confirm against the data source).

    Outputs, all under *out_path_for_txts* unless noted:

    * ``annotations.txt`` -- one ``<wav path>.wav|<text>`` line per utterance.
    * ``chars.txt`` -- the distinct non-punctuation, non-whitespace characters
      found in the texts, in sorted order.
    * ``punc.txt`` -- the distinct ``string.punctuation`` characters found in
      the texts, in sorted order, followed by a space.
    * ``train.txt`` / ``valid.txt`` / ``test.txt`` -- written by
      ``save_train_test_valid_split``.
    * ``../../config/glow/<name>.json`` -- a copy of
      ``../../config/glow/base_blank.json`` with the character sets and the
      train/valid file paths filled in, where ``<name>`` is the last path
      component of *out_path_for_txts*. Both config paths are relative to the
      current working directory.

    Args:
        text_path: Path of the transcript file to read.
        wav_path_for_annotations_txt: Directory prefix joined to each file
            name when writing ``annotations.txt``.
        out_path_for_txts: Directory that receives the generated text files;
            it is created if it does not exist.
        num_samples_valid: Number of utterances for the validation split.
        num_samples_test: Number of utterances for the test split.
    """
    if out_path_for_txts:
        os.makedirs(out_path_for_txts, exist_ok=True)
    outfile = os.path.join(out_path_for_txts, "annotations.txt")

    # Read as UTF-8 explicitly, matching how every output file is written.
    with open(text_path, encoding="utf-8") as file:
        file_lines = [replace_extra_chars(line) for line in file]

    fnames, ftexts = [], []
    for line in file_lines:
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # utterance and would otherwise fail on the quote split below.
            continue
        elems = line.split('"')
        fnames.append(elems[0].strip())
        ftexts.append(elems[1].strip())

    # Sort so the character inventory (and therefore the generated config and
    # chars.txt) is identical from run to run; plain set iteration order for
    # strings varies between interpreter runs.
    all_chars = sorted(set("".join(ftexts)))
    punctuation = set(string.punctuation)
    punct_with_space = "".join(c for c in all_chars if c in punctuation) + " "
    chars = "".join(c for c in all_chars if c not in punctuation and c.strip())

    with open("../../config/glow/base_blank.json", "r", encoding="utf-8") as jfile:
        json_config = json.load(jfile)

    json_config["data"]["chars"] = chars
    json_config["data"]["punc"] = punct_with_space
    json_config["data"]["training_files"] = out_path_for_txts + "/train.txt"
    json_config["data"]["validation_files"] = out_path_for_txts + "/valid.txt"
    # normpath drops a trailing separator so "data/foo/" still yields "foo"
    # rather than an empty config name.
    new_config_name = os.path.basename(os.path.normpath(out_path_for_txts))
    with open(
        f"../../config/glow/{new_config_name}.json", "w+", encoding="utf-8"
    ) as jfile:
        json.dump(json_config, jfile)

    print(f"Characters: {chars}")
    print(f"Punctuation: {punct_with_space}")
    print(f"Config file is stored at ../../config/glow/{new_config_name}.json")

    with open(outfile, "w+", encoding="utf-8") as outfile_f:
        for fname, text in zip(fnames, ftexts):
            print(
                os.path.join(wav_path_for_annotations_txt, fname) + ".wav",
                text,
                sep="|",
                file=outfile_f,
            )
    write_txt(punct_with_space, os.path.join(out_path_for_txts, "punc.txt"))
    write_txt(chars, os.path.join(out_path_for_txts, "chars.txt"))

    save_train_test_valid_split(
        annotations_txt=outfile,
        num_samples_valid=num_samples_valid,
        num_samples_test=num_samples_test,
    )




if __name__ == "__main__":
    # Command-line entry point: parse the input/output locations and the
    # split sizes, then run the full preparation pipeline.
    cli = argparse.ArgumentParser()
    argument_specs = (
        (("-i", "--text-path"), {"type": str, "required": True}),
        (("-o", "--output-path"), {"type": str, "required": True}),
        (("-w", "--wav-path"), {"type": str, "required": True}),
        (("-v", "--valid-samples"), {"type": int, "default": 100}),
        (("-t", "--test-samples"), {"type": int, "default": 10}),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)
    opts = cli.parse_args()

    save_txts_from_txt_done_data(
        text_path=opts.text_path,
        wav_path_for_annotations_txt=opts.wav_path,
        out_path_for_txts=opts.output_path,
        num_samples_valid=opts.valid_samples,
        num_samples_test=opts.test_samples,
    )