torchnet / scripts /generate_lsr2_train.py
milselarch's picture
push to main
df07554
raw
history blame
885 Bytes
import os
# Build filtered copies of the LRS2 split lists (train/test/val), keeping
# only the entries that also appear in the "valid pairs" file for the
# configured CTC scale. The results are written to ../data/.

CTC_SCALE = 2
lrs2_dirpath = '/media/milselarch/47FC4BC577667AAD/LRS2'
valid_lrs2_filepath = f'../data/LRS2-CTC{CTC_SCALE}-valid-pairs.txt'
filenames = ['train.txt', 'test.txt', 'val.txt']


def first_field(line):
    """Return the part of *line* before its first space, stripped.

    Only the space character acts as a separator. If the line contains
    no space, the whole line (minus surrounding whitespace such as the
    trailing newline) is returned.
    """
    return line.partition(' ')[0].strip()


def filter_valid_pairs(lines, valid_pairs):
    """Return the sorted first fields of *lines* found in *valid_pairs*.

    Duplicates are kept: a field that occurs on several lines appears
    that many times in the result.
    """
    fields = (first_field(line) for line in lines)
    return sorted(field for field in fields if field in valid_pairs)


def read_lines(filepath):
    """Read *filepath* as text and return its lines (newlines included)."""
    # The context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open(filepath, 'r') as file_obj:
        return file_obj.readlines()


def main():
    """Filter every split list and write the result next to the data."""
    valid_lrs2_pairs = {
        line.strip() for line in read_lines(valid_lrs2_filepath)
    }

    for filename in filenames:
        filepath = os.path.join(lrs2_dirpath, filename)
        lines = read_lines(filepath)
        valid_lines = filter_valid_pairs(lines, valid_lrs2_pairs)

        export_filename = f'../data/LRS2_CTC{CTC_SCALE}_{filename}'
        # Closing the export file explicitly guarantees the data is
        # flushed to disk before the script moves on.
        with open(export_filename, 'w') as export_file:
            export_file.write('\n'.join(valid_lines))

        print(f'<<< {filename} >>>')
        print(f'VALID: {len(valid_lines)}')
        print(f'TOTAL: {len(lines)}')


if __name__ == '__main__':
    main()