# coding=utf-8
# Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Only support eager mode and TF>=2.0.0
# pylint: disable=no-member, invalid-name, relative-beyond-top-level
# pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes
''' voxceleb 1 & 2 '''

import os
import sys
import zipfile
import subprocess
import hashlib
import shlex
import shutil
import pandas
from absl import logging
import tensorflow as tf
import soundfile as sf

gfile = tf.compat.v1.gfile

# Download urls for every subset. Subsets whose urls do not end in ".zip" are
# shipped as several parts that must be concatenated, in list order, into one
# zip archive.
SUBSETS = {
    "vox1_dev_wav":
        ["http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa",
         "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab",
         "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac",
         "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad"],
    "vox1_test_wav":
        ["http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip"],
    "vox2_dev_aac":
        ["http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaa",
         "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partab",
         "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partac",
         "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partad",
         "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partae",
         "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaf",
         "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partag",
         "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partah"],
    "vox2_test_aac":
        ["http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_test_aac.zip"]
}

# Expected md5 digest of the (concatenated) zip archive of every subset.
MD5SUM = {
    "vox1_dev_wav": "ae63e55b951748cc486645f532ba230b",
    "vox2_dev_aac": "bbc063c46078a602ca71605645c2a402",
    "vox1_test_wav": "185fdc63c3c739954633d50379a3d102",
    "vox2_test_aac": "0d2b3ea430a821c33263b5ea37ede312"
}

# Credentials passed to wget; filled in from the command line in __main__.
USER = {
    "user": "",
    "password": ""
}

# Maps speaker name -> integer speaker id. It is module-level, so ids keep
# growing across all subsets processed within one run.
speaker_id_dict = {}

_ZIP_SUFFIX = ".zip"


def _md5_of_file(path, chunk_size=1 << 20):
    """Return the hex md5 digest of a file, reading it in fixed-size chunks."""
    digest = hashlib.md5()
    with open(path, "rb") as fin:
        for chunk in iter(lambda: fin.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _concatenate_parts(part_paths, target_path):
    """Write the given files, in order, back to back into target_path."""
    with open(target_path, "wb") as fout:
        for part_path in part_paths:
            with open(part_path, "rb") as fin:
                shutil.copyfileobj(fin, fout)


def download_and_extract(directory, subset, urls):
    """Download and extract the given split of dataset.

    Files that already exist in `directory` are not downloaded again.
    Multi-part subsets are concatenated into `<prefix>.zip`, the archive is
    checked against MD5SUM[subset], extracted into `directory`, and the path
    named by the first archive entry is moved to the archive path without its
    ".zip" suffix.

    Args:
        directory: the directory where to put the downloaded data.
        subset: subset name of the corpus.
        urls: the list of urls to download the data file.

    Raises:
        ValueError: if `urls` is empty or the archive md5sum mismatches.
        RuntimeError: if wget exits with a non-zero status.
    """
    if not urls:
        raise ValueError("no download urls given for subset %s" % subset)
    if not gfile.Exists(directory):
        gfile.MakeDirs(directory)

    part_paths = []
    for url in urls:
        part_path = os.path.join(directory, url.split("/")[-1])
        part_paths.append(part_path)
        if os.path.exists(part_path):
            continue
        logging.info("Downloading %s to %s", url, part_path)
        # Argument list instead of a shell string: credentials and paths
        # containing spaces or shell metacharacters are passed through intact.
        wget_cmd = ["wget", url]
        if USER["user"]:
            wget_cmd += ["--user", USER["user"], "--password", USER["password"]]
        wget_cmd += ["-O", part_path]
        retcode = subprocess.call(wget_cmd)
        if retcode != 0:
            # "wget -O" creates the output file even when the download fails;
            # drop it so a later run does not mistake it for a finished file.
            if os.path.exists(part_path):
                os.remove(part_path)
            raise RuntimeError(
                "Failed to download %s (wget exit code %d)" % (url, retcode))
        statinfo = os.stat(part_path)
        logging.info(
            "Successfully downloaded %s, size(bytes): %d", url, statinfo.st_size)

    # concatenate all parts into zip files
    last_path = part_paths[-1]
    if last_path.endswith(_ZIP_SUFFIX):
        zip_filepath = last_path
    else:
        # "<dir>/vox1_dev_wav_partaa" -> "<dir>/vox1_dev_wav.zip"
        zip_filepath = "_".join(last_path.split("_")[:-1]) + _ZIP_SUFFIX
        _concatenate_parts(part_paths, zip_filepath)
    # Slice the suffix off; str.strip(".zip") would strip any of the
    # characters ".", "z", "i", "p" from both ends of the path.
    extract_path = zip_filepath[:-len(_ZIP_SUFFIX)]

    # check zip file md5sum
    if _md5_of_file(zip_filepath) != MD5SUM[subset]:
        raise ValueError("md5sum of %s mismatch" % zip_filepath)

    with zipfile.ZipFile(zip_filepath, "r") as zfile:
        zfile.extractall(directory)
        first_entry = zfile.infolist()[0].filename
    extract_path_ori = os.path.join(directory, first_entry)
    retcode = subprocess.call(["mv", extract_path_ori, extract_path])
    if retcode != 0:
        logging.warning(
            "Moving %s to %s failed with retcode %d",
            extract_path_ori, extract_path, retcode)


def exec_cmd(cmd):
    """Run a command in a subprocess.

    Args:
        cmd: command line to be executed (interpreted by the shell).

    Return:
        int, the return code; -999 if the command could not be started.
    """
    try:
        retcode = subprocess.call(cmd, shell=True)
        if retcode < 0:
            logging.info(f"Child was terminated by signal {retcode}")
    except OSError as e:
        logging.info(f"Execution failed: {e}")
        retcode = -999
    return retcode


def decode_aac_with_ffmpeg(aac_file, wav_file):
    """Decode a given AAC file into WAV using ffmpeg.

    Args:
        aac_file: file path to input AAC file.
        wav_file: file path to output WAV file.

    Return:
        bool, True if success.
    """
    # Quote both paths because exec_cmd hands the command line to the shell.
    cmd = f"ffmpeg -i {shlex.quote(aac_file)} {shlex.quote(wav_file)}"
    logging.info(f"Decoding aac file using command line: {cmd}")
    ret = exec_cmd(cmd)
    if ret != 0:
        logging.error(f"Failed to decode aac file with retcode {ret}")
        logging.error("Please check your ffmpeg installation.")
        # Remove a partially written output so that a later run, which skips
        # decoding when the WAV file exists, does not pick it up.
        if os.path.exists(wav_file):
            os.remove(wav_file)
        return False
    return True


def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
    """Optionally convert AAC to WAV and make speaker labels.

    Walks `<input_dir>/<subset>`, decodes every ".m4a" file to
    "<name>.m4a.wav" unless that file already exists, and writes one
    tab-separated row per utterance.

    Args:
        input_dir: the directory which holds the input dataset.
        subset: the name of the specified subset. e.g. vox1_dev_wav
        output_dir: the directory to place the newly generated csv files.
        output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv

    Raises:
        RuntimeError: if ffmpeg fails to decode an AAC file.
    """
    logging.info("Preprocessing audio and label for subset %s", subset)
    source_dir = os.path.join(input_dir, subset)

    files = []
    # Convert all AAC file into WAV format. At the same time, generate the csv
    for root, _, filenames in gfile.Walk(source_dir):
        for filename in filenames:
            name, ext = os.path.splitext(filename)
            ext = ext.lower()
            if ext == ".wav":
                # A double extension (e.g. "x.m4a.wav") marks a file decoded
                # from AAC below; it is listed through its ".m4a" source.
                if os.path.splitext(name)[1]:
                    continue
                wav_file = os.path.join(root, filename)
            elif ext == ".m4a":
                # Convert AAC to WAV.
                aac_file = os.path.join(root, filename)
                wav_file = aac_file + ".wav"
                if not gfile.Exists(wav_file):
                    if not decode_aac_with_ffmpeg(aac_file, wav_file):
                        raise RuntimeError("Audio decoding failed.")
            else:
                continue

            # The speaker name is the grandparent directory of the audio file.
            speaker_name = root.split(os.path.sep)[-2]
            if speaker_name not in speaker_id_dict:
                speaker_id_dict[speaker_name] = len(speaker_id_dict)

            # Frame count taken from the file header, so the audio itself does
            # not have to be decoded into memory.
            wav_length = sf.info(wav_file).frames
            files.append(
                (os.path.abspath(wav_file), wav_length,
                 speaker_id_dict[speaker_name], speaker_name)
            )

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name".
    # NOTE: despite its name, "wav_length_ms" holds the number of audio frames;
    # the column name is kept unchanged for readers of the csv.
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
    df.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))


def processor(directory, subset, force_process):
    """Download and process one subset of voxceleb.

    Args:
        directory: the directory holding the downloads, the extracted data
            and the generated csv file.
        subset: the subset name, one of the keys of SUBSETS.
        force_process: if False and `<directory>/<subset>.csv` already
            exists, nothing is done.

    Return:
        str, path to the csv file of the subset.

    Raises:
        ValueError: if `subset` is not a key of SUBSETS.
    """
    urls = SUBSETS
    if subset not in urls:
        raise ValueError("%s is not in voxceleb" % subset)

    subset_csv = os.path.join(directory, subset + '.csv')
    if not force_process and os.path.exists(subset_csv):
        return subset_csv

    logging.info("Downloading and process the voxceleb in %s", directory)
    logging.info("Preparing subset %s", subset)
    download_and_extract(directory, subset, urls[subset])
    convert_audio_and_make_label(
        directory,
        subset,
        directory,
        subset + ".csv"
    )
    logging.info("Finished downloading and processing")
    return subset_csv


if __name__ == "__main__":
    logging.set_verbosity(logging.INFO)
    if len(sys.argv) != 4:
        print("Usage: python prepare_data.py save_directory user password")
        # Non-zero status so calling scripts can detect the usage error.
        sys.exit(1)

    DIR, USER["user"], USER["password"] = sys.argv[1], sys.argv[2], sys.argv[3]
    for SUBSET in SUBSETS:
        processor(DIR, SUBSET, False)