# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Download and preprocess LibriSpeech dataset for DeepSpeech model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import codecs
import fnmatch
import os
import sys
import tarfile
import tempfile
import unicodedata

from absl import app as absl_app
from absl import flags as absl_flags
import pandas
from six.moves import urllib
from sox import Transformer
import tensorflow as tf

LIBRI_SPEECH_URLS = {
    "train-clean-100":
        "http://www.openslr.org/resources/12/train-clean-100.tar.gz",
    "train-clean-360":
        "http://www.openslr.org/resources/12/train-clean-360.tar.gz",
    "train-other-500":
        "http://www.openslr.org/resources/12/train-other-500.tar.gz",
    "dev-clean":
        "http://www.openslr.org/resources/12/dev-clean.tar.gz",
    "dev-other":
        "http://www.openslr.org/resources/12/dev-other.tar.gz",
    "test-clean":
        "http://www.openslr.org/resources/12/test-clean.tar.gz",
    "test-other":
        "http://www.openslr.org/resources/12/test-other.tar.gz"
}


def download_and_extract(directory, url):
  """Download and extract the given split of the dataset.

  Args:
    directory: the directory where to extract the tarball.
    url: the url to download the data file.
  """
  if not tf.gfile.Exists(directory):
    tf.gfile.MakeDirs(directory)

  _, tar_filepath = tempfile.mkstemp(suffix=".tar.gz")

  try:
    tf.logging.info("Downloading %s to %s" % (url, tar_filepath))

    def _progress(count, block_size, total_size):
      sys.stdout.write("\r>> Downloading {} {:.1f}%".format(
          tar_filepath, 100.0 * count * block_size / total_size))
      sys.stdout.flush()

    urllib.request.urlretrieve(url, tar_filepath, _progress)
    print()
    statinfo = os.stat(tar_filepath)
    tf.logging.info(
        "Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size))
    with tarfile.open(tar_filepath, "r") as tar:
      tar.extractall(directory)
  finally:
    tf.gfile.Remove(tar_filepath)
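

# A minimal usage sketch (not part of the original script): fetching a single
# split by hand with the helper above. The destination directory here is an
# arbitrary placeholder.
#
#   download_and_extract("/tmp/librispeech_data/dev-clean",
#                        LIBRI_SPEECH_URLS["dev-clean"])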


def convert_audio_and_split_transcript(input_dir, source_name, target_name,
                                       output_dir, output_file):
  """Convert FLAC to WAV and split the transcript.

  For each audio file, convert the format from FLAC to WAV using the
  sox.Transformer library.
  For transcripts, each line contains the sequence id and the corresponding
  transcript (separated by space):
  Input data format: seq-id transcript_of_seq-id
  For example:
    1-2-0 transcript_of_1-2-0.flac
    1-2-1 transcript_of_1-2-1.flac
    ...
  Each sequence id has a corresponding .flac file.

  Parse the transcript file and generate a new csv file which has three columns:
    "wav_filename": the absolute path to a wav file.
    "wav_filesize": the size of the corresponding wav file.
    "transcript": the transcript for this audio segment.

  Args:
    input_dir: the directory which holds the input dataset.
    source_name: the name of the specified dataset. e.g. test-clean
    target_name: the directory name for the newly generated audio files.
      e.g. test-clean-wav
    output_dir: the directory to place the newly generated csv files.
    output_file: the name of the newly generated csv file. e.g. test-clean.csv
  """
  tf.logging.info("Preprocessing audio and transcript for %s" % source_name)
  source_dir = os.path.join(input_dir, source_name)
  target_dir = os.path.join(input_dir, target_name)

  if not tf.gfile.Exists(target_dir):
    tf.gfile.MakeDirs(target_dir)

  files = []
  tfm = Transformer()
  # Convert all FLAC files into WAV format. At the same time, generate the csv
  # file.
  for root, _, filenames in tf.gfile.Walk(source_dir):
    for filename in fnmatch.filter(filenames, "*.trans.txt"):
      trans_file = os.path.join(root, filename)
      with codecs.open(trans_file, "r", "utf-8") as fin:
        for line in fin:
          seqid, transcript = line.split(" ", 1)
          # We do an encode-decode transformation here because the output type
          # of encode is a bytes object; we need to convert it to a string.
          transcript = unicodedata.normalize("NFKD", transcript).encode(
              "ascii", "ignore").decode("ascii", "ignore").strip().lower()

          # Convert FLAC to WAV.
          flac_file = os.path.join(root, seqid + ".flac")
          wav_file = os.path.join(target_dir, seqid + ".wav")
          if not tf.gfile.Exists(wav_file):
            tfm.build(flac_file, wav_file)
          wav_filesize = os.path.getsize(wav_file)

          files.append((os.path.abspath(wav_file), wav_filesize, transcript))

  # Write to CSV file which contains three columns:
  # "wav_filename", "wav_filesize", "transcript".
  csv_file_path = os.path.join(output_dir, output_file)
  df = pandas.DataFrame(
      data=files, columns=["wav_filename", "wav_filesize", "transcript"])
  df.to_csv(csv_file_path, index=False, sep="\t")
  tf.logging.info("Successfully generated csv file {}".format(csv_file_path))
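

# A hedged sketch (not part of the original script): reading one of the
# generated CSV files back for inspection. The path assumes the default
# --data_dir and the test-clean split; both are placeholders for whatever
# your run actually produced. The tab separator matches df.to_csv() above.
#
#   import pandas
#   df = pandas.read_csv(
#       "/tmp/librispeech_data/test-clean/LibriSpeech/test-clean.csv", sep="\t")
#   print(df[["wav_filename", "wav_filesize", "transcript"]].head())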


def download_and_process_datasets(directory, datasets):
  """Download and pre-process the specified list of LibriSpeech datasets.

  Args:
    directory: the directory to put all the downloaded and preprocessed data.
    datasets: list of dataset names that will be downloaded and processed.
  """
  tf.logging.info("Preparing LibriSpeech dataset: {}".format(
      ",".join(datasets)))
  for dataset in datasets:
    tf.logging.info("Preparing dataset %s", dataset)
    dataset_dir = os.path.join(directory, dataset)
    download_and_extract(dataset_dir, LIBRI_SPEECH_URLS[dataset])
    convert_audio_and_split_transcript(
        dataset_dir + "/LibriSpeech", dataset, dataset + "-wav",
        dataset_dir + "/LibriSpeech", dataset + ".csv")
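

# An illustrative call (not part of the original script): processing a single
# split programmatically, bypassing the absl flags. The directory is an
# arbitrary placeholder.
#
#   download_and_process_datasets("/tmp/librispeech_data", ["dev-clean"])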


def define_data_download_flags():
  """Define flags for data downloading."""
  absl_flags.DEFINE_string(
      "data_dir", "/tmp/librispeech_data",
      "Directory to download data and extract the tarball")
  absl_flags.DEFINE_bool("train_only", False,
                         "If true, only download the training set")
  absl_flags.DEFINE_bool("dev_only", False,
                         "If true, only download the dev set")
  absl_flags.DEFINE_bool("test_only", False,
                         "If true, only download the test set")


def main(_):
  if not tf.gfile.Exists(FLAGS.data_dir):
    tf.gfile.MakeDirs(FLAGS.data_dir)

  if FLAGS.train_only:
    download_and_process_datasets(
        FLAGS.data_dir,
        ["train-clean-100", "train-clean-360", "train-other-500"])
  elif FLAGS.dev_only:
    download_and_process_datasets(FLAGS.data_dir, ["dev-clean", "dev-other"])
  elif FLAGS.test_only:
    download_and_process_datasets(FLAGS.data_dir, ["test-clean", "test-other"])
  else:
    # By default we download the entire dataset.
    download_and_process_datasets(FLAGS.data_dir, LIBRI_SPEECH_URLS.keys())


if __name__ == "__main__":
  tf.logging.set_verbosity(tf.logging.INFO)
  define_data_download_flags()
  FLAGS = absl_flags.FLAGS
  absl_app.run(main)
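
# Example invocation (a sketch; the script filename is not given here, so
# "download.py" is only a placeholder):
#
#   python download.py --data_dir=/tmp/librispeech_data --test_only
#
# Without any of the *_only flags, every split listed in LIBRI_SPEECH_URLS is
# downloaded and preprocessed.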