|
""" |
|
I didn't extract features from the test set of LibriSpeech, the features extracted |
|
from train-100 was split into train and test set into two separate folders. |
|
This was again done to read them easily using torch vision's Dataset Folder |
|
""" |
|
|
|
import os |
|
import shutil |
|
from pathlib import Path |
|
|
|
import numpy as np |
|
|
|
|
|
def assert_out_dir_exists(root, index):
    """Create the directory root/index if it does not already exist and return its path."""
    dir_ = os.path.join(root, str(index))

    if not os.path.exists(dir_):
        os.makedirs(dir_)
        print('created dir {}'.format(dir_))
    else:
        print('dir {} already exists'.format(dir_))

    return dir_
|
|
|
|
|
def train_test_split(root, test_size=0.05):
    """Randomly copy the .npy files under root into root_train and root_test,
    keeping the per-label subfolder structure intact."""
    train_dir = root + '_train'
    test_dir = root + '_test'

    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    for label in os.listdir(root):
        files_iter = Path(os.path.join(root, label)).glob('**/*.npy')
        files_ = np.array([str(f) for f in files_iter])

        assert_out_dir_exists(train_dir, label)
        assert_out_dir_exists(test_dir, label)

        # Assign each file to train (0) or test (1), so that roughly test_size
        # of the files land in the test split. Seed np.random beforehand if a
        # reproducible split is needed.
        choices = np.random.choice([0, 1], size=files_.shape[0], p=(1 - test_size, test_size))
        train_files = files_[choices == 0]
        test_files = files_[choices == 1]

        for train_sample in train_files:
            dest = os.path.join(train_dir, label, os.path.basename(train_sample))
            print('copying file {} to {}'.format(train_sample, dest))
            shutil.copyfile(train_sample, dest)

        for test_sample in test_files:
            dest = os.path.join(test_dir, label, os.path.basename(test_sample))
            print('copying file {} to {}'.format(test_sample, dest))
            shutil.copyfile(test_sample, dest)

        print('done for label: {}'.format(label))

    print('All done')
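

# The module docstring mentions reading these folders with torchvision's
# DatasetFolder. Below is a minimal sketch of that loading step; the function
# name and default paths are illustrative assumptions, and it presumes
# torchvision is installed and the feature arrays were saved with np.save.
def load_split_datasets(train_root='fbanks_train', test_root='fbanks_test'):
    from torchvision.datasets import DatasetFolder

    train_data = DatasetFolder(train_root, loader=np.load, extensions=('.npy',))
    test_data = DatasetFolder(test_root, loader=np.load, extensions=('.npy',))
    return train_data, test_data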
|
|
|
|
|
if __name__ == '__main__':
    train_test_split('fbanks')