# speaker_identify / data_utils / test_train_folder_split.py
# (Hugging Face Hub listing residue, preserved as a comment:
#  "DuyTa's picture — Upload folder using huggingface_hub — f831146 verified")
"""
I didn't extract features from the test set of LibriSpeech, the features extracted
from train-100 was split into train and test set into two separate folders.
This was again done to read them easily using torch vision's Dataset Folder
"""
import os
import shutil
from pathlib import Path
import numpy as np
def assert_out_dir_exists(root, index):
    """Ensure that the directory ``<root>/<index>`` exists and return its path.

    Creates the directory (including intermediate parents) if it is missing;
    otherwise just reports that it already exists.

    Args:
        root: Parent directory path.
        index: Subdirectory name (converted to ``str``), e.g. a class label.

    Returns:
        The path of the (now existing) subdirectory.
    """
    # os.path.join instead of '/' concatenation keeps the path portable.
    dir_ = os.path.join(root, str(index))
    if not os.path.exists(dir_):
        os.makedirs(dir_)
        print('created dir {}'.format(dir_))  # fixed typo: was 'crated'
    else:
        print('dir {} already exists'.format(dir_))
    return dir_
def train_test_split(root, test_size=0.05):
    """Randomly split per-label ``.npy`` feature files into two folder trees.

    For every label subfolder under *root*, each ``.npy`` file found
    (recursively) is copied into either ``<root>_train/<label>/`` or
    ``<root>_test/<label>/``, chosen independently per file with probability
    ``test_size`` of landing in the test set. The source tree is left intact.

    Args:
        root: Directory whose immediate children are label folders.
        test_size: Probability that a given file goes to the test set.

    Note:
        The split is not seeded, so repeated runs produce different splits.
    """
    train_dir = root + '_train'
    test_dir = root + '_test'
    # exist_ok=True so a re-run does not crash on the already-created roots,
    # matching the tolerant behavior of assert_out_dir_exists below.
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)
    for label in os.listdir(root):
        files_iter = Path(os.path.join(root, label)).glob('**/*.npy')
        files_ = np.array([str(f) for f in files_iter])
        assert_out_dir_exists(train_dir, label)
        assert_out_dir_exists(test_dir, label)
        # Per-file Bernoulli draw: 0 -> train, 1 -> test.
        choices = np.random.choice([0, 1], size=files_.shape[0], p=(1 - test_size, test_size))
        for out_dir, subset in ((train_dir, files_[choices == 0]),
                                (test_dir, files_[choices == 1])):
            for sample in subset:
                # os.path.basename instead of split('/') for portability.
                dest = os.path.join(out_dir, label, os.path.basename(sample))
                print('copying file {} to {}'.format(sample, dest))
                shutil.copyfile(sample, dest)
        print('done for label: {}'.format(label))
    print('All done')
# Script entry point: split the extracted filter-bank features stored under
# the 'fbanks' folder (relative to the current working directory) into
# 'fbanks_train' and 'fbanks_test' sibling folders.
if __name__ == '__main__':
    train_test_split('fbanks')