# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Download and preprocess LibriSpeech dataset for DeepSpeech model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import codecs
import fnmatch
import os
import sys
import tarfile
import tempfile
import unicodedata

from absl import app as absl_app
from absl import flags as absl_flags
import pandas
from six.moves import urllib
from sox import Transformer
import tensorflow as tf

LIBRI_SPEECH_URLS = {
    "train-clean-100":
        "http://www.openslr.org/resources/12/train-clean-100.tar.gz",
    "train-clean-360":
        "http://www.openslr.org/resources/12/train-clean-360.tar.gz",
    "train-other-500":
        "http://www.openslr.org/resources/12/train-other-500.tar.gz",
    "dev-clean":
        "http://www.openslr.org/resources/12/dev-clean.tar.gz",
    "dev-other":
        "http://www.openslr.org/resources/12/dev-other.tar.gz",
    "test-clean":
        "http://www.openslr.org/resources/12/test-clean.tar.gz",
    "test-other":
        "http://www.openslr.org/resources/12/test-other.tar.gz"
}


def download_and_extract(directory, url):
  """Download and extract the given split of the dataset.

  Args:
    directory: the directory where to extract the tarball.
    url: the url to download the data file.
  """
  if not tf.gfile.Exists(directory):
    tf.gfile.MakeDirs(directory)

  _, tar_filepath = tempfile.mkstemp(suffix=".tar.gz")

  try:
    tf.logging.info("Downloading %s to %s" % (url, tar_filepath))

    def _progress(count, block_size, total_size):
      sys.stdout.write("\r>> Downloading {} {:.1f}%".format(
          tar_filepath, 100.0 * count * block_size / total_size))
      sys.stdout.flush()

    urllib.request.urlretrieve(url, tar_filepath, _progress)
    print()
    statinfo = os.stat(tar_filepath)
    tf.logging.info(
        "Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size))
    with tarfile.open(tar_filepath, "r") as tar:
      tar.extractall(directory)
  finally:
    tf.gfile.Remove(tar_filepath)
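

# A minimal usage sketch (not part of the original script): fetching a single
# split by hand with the helper above. The destination directory here is an
# arbitrary placeholder.
#
#   download_and_extract("/tmp/librispeech_data/dev-clean",
#                        LIBRI_SPEECH_URLS["dev-clean"])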


def convert_audio_and_split_transcript(input_dir, source_name, target_name,
                                       output_dir, output_file):
  """Convert FLAC to WAV and split the transcript.

  For each audio file, convert the format from FLAC to WAV using the
  sox.Transformer library.
  For transcripts, each line contains the sequence id and the corresponding
  transcript (separated by space):
  Input data format: seq-id transcript_of_seq-id
  For example:
    1-2-0 transcript_of_1-2-0.flac
    1-2-1 transcript_of_1-2-1.flac
    ...
  Each sequence id has a corresponding .flac file.

  Parse the transcript file and generate a new csv file which has three columns:
    "wav_filename": the absolute path to a wav file.
    "wav_filesize": the size of the corresponding wav file.
    "transcript": the transcript for this audio segment.

  Args:
    input_dir: the directory which holds the input dataset.
    source_name: the name of the specified dataset. e.g. test-clean
    target_name: the directory name for the newly generated audio files.
      e.g. test-clean-wav
    output_dir: the directory to place the newly generated csv files.
    output_file: the name of the newly generated csv file. e.g. test-clean.csv
  """
  tf.logging.info("Preprocessing audio and transcript for %s" % source_name)
  source_dir = os.path.join(input_dir, source_name)
  target_dir = os.path.join(input_dir, target_name)

  if not tf.gfile.Exists(target_dir):
    tf.gfile.MakeDirs(target_dir)

  files = []
  tfm = Transformer()
  # Convert all FLAC files into WAV format. At the same time, generate the csv
  # file.
  for root, _, filenames in tf.gfile.Walk(source_dir):
    for filename in fnmatch.filter(filenames, "*.trans.txt"):
      trans_file = os.path.join(root, filename)
      with codecs.open(trans_file, "r", "utf-8") as fin:
        for line in fin:
          seqid, transcript = line.split(" ", 1)
          # We do an encode-decode transformation here because the output type
          # of encode is a bytes object; we need to convert it to a string.
          transcript = unicodedata.normalize("NFKD", transcript).encode(
              "ascii", "ignore").decode("ascii", "ignore").strip().lower()

          # Convert FLAC to WAV.
          flac_file = os.path.join(root, seqid + ".flac")
          wav_file = os.path.join(target_dir, seqid + ".wav")
          if not tf.gfile.Exists(wav_file):
            tfm.build(flac_file, wav_file)
          wav_filesize = os.path.getsize(wav_file)

          files.append((os.path.abspath(wav_file), wav_filesize, transcript))

  # Write to CSV file which contains three columns:
  # "wav_filename", "wav_filesize", "transcript".
  csv_file_path = os.path.join(output_dir, output_file)
  df = pandas.DataFrame(
      data=files, columns=["wav_filename", "wav_filesize", "transcript"])
  df.to_csv(csv_file_path, index=False, sep="\t")
  tf.logging.info("Successfully generated csv file {}".format(csv_file_path))
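

# A hedged sketch (not part of the original script): reading one of the
# generated CSV files back for inspection. The path assumes the default
# --data_dir and the test-clean split; both are placeholders for whatever
# your run actually produced. The tab separator matches df.to_csv() above.
#
#   import pandas
#   df = pandas.read_csv(
#       "/tmp/librispeech_data/test-clean/LibriSpeech/test-clean.csv", sep="\t")
#   print(df[["wav_filename", "wav_filesize", "transcript"]].head())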


def download_and_process_datasets(directory, datasets):
  """Download and pre-process the specified list of LibriSpeech datasets.

  Args:
    directory: the directory to put all the downloaded and preprocessed data.
    datasets: list of dataset names that will be downloaded and processed.
  """
  tf.logging.info("Preparing LibriSpeech dataset: {}".format(
      ",".join(datasets)))
  for dataset in datasets:
    tf.logging.info("Preparing dataset %s", dataset)
    dataset_dir = os.path.join(directory, dataset)
    download_and_extract(dataset_dir, LIBRI_SPEECH_URLS[dataset])
    convert_audio_and_split_transcript(
        dataset_dir + "/LibriSpeech", dataset, dataset + "-wav",
        dataset_dir + "/LibriSpeech", dataset + ".csv")
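

# An illustrative call (not part of the original script): processing a single
# split programmatically, bypassing the absl flags. The directory is an
# arbitrary placeholder.
#
#   download_and_process_datasets("/tmp/librispeech_data", ["dev-clean"])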


def define_data_download_flags():
  """Define flags for data downloading."""
  absl_flags.DEFINE_string(
      "data_dir", "/tmp/librispeech_data",
      "Directory to download data and extract the tarball")
  absl_flags.DEFINE_bool("train_only", False,
                         "If true, only download the training set")
  absl_flags.DEFINE_bool("dev_only", False,
                         "If true, only download the dev set")
  absl_flags.DEFINE_bool("test_only", False,
                         "If true, only download the test set")


def main(_):
  if not tf.gfile.Exists(FLAGS.data_dir):
    tf.gfile.MakeDirs(FLAGS.data_dir)

  if FLAGS.train_only:
    download_and_process_datasets(
        FLAGS.data_dir,
        ["train-clean-100", "train-clean-360", "train-other-500"])
  elif FLAGS.dev_only:
    download_and_process_datasets(FLAGS.data_dir, ["dev-clean", "dev-other"])
  elif FLAGS.test_only:
    download_and_process_datasets(FLAGS.data_dir, ["test-clean", "test-other"])
  else:
    # By default we download the entire dataset.
    download_and_process_datasets(FLAGS.data_dir, LIBRI_SPEECH_URLS.keys())


if __name__ == "__main__":
  tf.logging.set_verbosity(tf.logging.INFO)
  define_data_download_flags()
  FLAGS = absl_flags.FLAGS
  absl_app.run(main)
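
# Example invocation (a sketch; the script filename is not given here, so
# "download.py" is only a placeholder):
#
#   python download.py --data_dir=/tmp/librispeech_data --test_only
#
# Without any of the *_only flags, every split listed in LIBRI_SPEECH_URLS is
# downloaded and preprocessed.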