Spaces:
Sleeping
Sleeping
File size: 4,224 Bytes
bc3753a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
"""
Script for extracting DeepSpeech features from audio file.
"""
import os
import argparse
import numpy as np
import pandas as pd
from deepspeech_store import get_deepspeech_model_file
from deepspeech_features import conv_audios_to_deepspeech
def parse_args(argv=None):
    """
    Create python script parameters.

    Parameters
    ----------
    argv : list of str, default None
        Command-line arguments to parse. When None, argparse reads them from
        `sys.argv[1:]`, which is what a plain `parse_args()` call does.

    Returns
    -------
    argparse.Namespace
        Parsed arguments with the attributes `input`, `output`, `deepspeech`
        and `metainfo`. Only `input` is required; the others are None when
        they are not given on the command line.
    """
    parser = argparse.ArgumentParser(
        description="Extract DeepSpeech features from audio file",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="path to input audio file or directory")
    parser.add_argument(
        "--output",
        type=str,
        help="path to output file with DeepSpeech features")
    parser.add_argument(
        "--deepspeech",
        type=str,
        help="path to DeepSpeech 0.1.0 frozen model")
    parser.add_argument(
        "--metainfo",
        type=str,
        help="path to file with meta-information")
    # Passing `argv` through keeps the CLI behaviour unchanged (None means
    # sys.argv) while letting callers supply arguments programmatically.
    return parser.parse_args(argv)
def extract_features(in_audios,
                     out_files,
                     deepspeech_pb_path,
                     metainfo_file_path=None):
    """
    Extract DeepSpeech features from audio files.

    Parameters
    ----------
    in_audios : list of str
        Paths to input audio files.
    out_files : list of str
        Paths to output files with DeepSpeech features. Empty (or None)
        entries are replaced in place with the corresponding input path whose
        extension is changed to `.npy`.
    deepspeech_pb_path : str
        Path to DeepSpeech 0.1.0 frozen model.
    metainfo_file_path : str, default None
        Path to file with meta-information. It is read as a tab-separated
        table and its `Count` column is forwarded as the per-audio frame
        information. When None, no frame information is forwarded.

    Raises
    ------
    ValueError
        If the number of rows in the meta-information file differs from the
        number of input audio files.
    """
    if metainfo_file_path is None:
        num_frames_info = [None] * len(in_audios)
    else:
        # Builtin `int`/`str` are used instead of the `np.int`/`np.unicode`
        # aliases, which were removed from NumPy and raise AttributeError.
        train_df = pd.read_csv(
            metainfo_file_path,
            sep="\t",
            index_col=False,
            dtype={"Id": int, "File": str, "Count": int})
        num_frames_info = train_df["Count"].values
        # Explicit check instead of `assert`, which is stripped under `-O`.
        if len(num_frames_info) != len(in_audios):
            raise ValueError(
                "Meta-information file has {} rows but {} audio files were given".format(
                    len(num_frames_info), len(in_audios)))

    # Fill in missing output paths in place so the caller's list reflects the
    # files that are actually going to be written.
    for i, in_audio in enumerate(in_audios):
        if not out_files[i]:
            file_stem, _ = os.path.splitext(in_audio)
            out_files[i] = file_stem + ".npy"

    conv_audios_to_deepspeech(
        audios=in_audios,
        out_files=out_files,
        num_frames_info=num_frames_info,
        deepspeech_pb_path=deepspeech_pb_path)
# Local model file that is used when `--deepspeech` is not given (or does not
# point to an existing file).
_DEFAULT_DEEPSPEECH_PB_PATH = "~/.tensorflow/models/deepspeech-0_1_0-b90017e8.pb"


def _resolve_deepspeech_pb_path(user_path):
    """
    Choose the DeepSpeech model file: the user path, then the default local
    path, and finally whatever `get_deepspeech_model_file()` returns.
    """
    for candidate in (user_path, _DEFAULT_DEEPSPEECH_PB_PATH):
        if not candidate:
            continue
        expanded_path = os.path.expanduser(candidate)
        if os.path.exists(expanded_path):
            return expanded_path
    # NOTE(review): presumably this fetches the model and returns its local
    # path - confirm against deepspeech_store.
    return get_deepspeech_model_file()


def _list_wav_files(dir_path):
    """
    Return the sorted paths of the regular `.wav` files (case-insensitive
    extension) located directly inside `dir_path`.
    """
    wav_paths = []
    for file_name in os.listdir(dir_path):
        file_path = os.path.join(dir_path, file_name)
        if not os.path.isfile(file_path):
            continue
        _, file_ext = os.path.splitext(file_name)
        if file_ext.lower() == ".wav":
            wav_paths.append(file_path)
    return sorted(wav_paths)


def main():
    """
    Main body of script.

    Parses the command line, resolves the DeepSpeech model file and runs the
    feature extraction either for a single audio file or for every `.wav`
    file directly inside the input directory. For a directory, `--output` is
    not used: each output path is derived from the input file name.

    Raises
    ------
    FileNotFoundError
        If the input file/directory does not exist.
    """
    args = parse_args()
    in_audio = os.path.expanduser(args.input)
    if not os.path.exists(in_audio):
        raise FileNotFoundError(
            "Input file/directory doesn't exist: {}".format(in_audio))

    # Honour `--deepspeech` instead of unconditionally overriding it with the
    # hard-coded default path.
    deepspeech_pb_path = _resolve_deepspeech_pb_path(args.deepspeech)

    if os.path.isfile(in_audio):
        in_audios = [in_audio]
        out_files = [args.output]
    else:
        in_audios = _list_wav_files(in_audio)
        # Empty entries make extract_features derive `<stem>.npy` paths.
        out_files = [""] * len(in_audios)

    extract_features(
        in_audios=in_audios,
        out_files=out_files,
        deepspeech_pb_path=deepspeech_pb_path,
        metainfo_file_path=args.metainfo)
# Run the extraction only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|