|
import argparse |
|
import glob |
|
import multiprocessing |
|
import os |
|
import pathlib |
|
|
|
import torch |
|
from tqdm import tqdm |
|
|
|
from TTS.utils.vad import get_vad_model_and_utils, remove_silence |
|
|
|
# Set torch's CPU thread count to 1 at import time.
# NOTE(review): presumably to avoid CPU oversubscription when several worker
# processes (--num_processes) each run the VAD model — confirm.
torch.set_num_threads(1) |
|
|
|
|
|
def adjust_path_and_remove_silence(audio_path):
    """Map ``audio_path`` into the output directory and remove its silence.

    Relies on the module-level ``args`` (parsed CLI options) and
    ``model_and_utils`` (the loaded VAD model) created in the ``__main__``
    section of this script.

    Args:
        audio_path: Path of an input audio file located under ``args.input_dir``.

    Returns:
        A ``(output_path, is_speech)`` tuple. If the output file already exists
        and ``args.force`` is not set, nothing is processed and ``is_speech`` is
        reported as ``False``; otherwise both values come from ``remove_silence``.
    """
    # os.path.join(x, "") appends a trailing separator when x lacks one.
    input_prefix = os.path.join(args.input_dir, "")
    output_prefix = os.path.join(args.output_dir, "")

    # Replace only the first occurrence (the leading input directory). An
    # unbounded replace also rewrites later matches inside the path, e.g. with
    # input_dir "in" and output_dir "out", "in/main/a.wav" would become
    # "out/maout/a.wav".
    output_path = audio_path.replace(input_prefix, output_prefix, 1)

    if os.path.exists(output_path) and not args.force:
        return output_path, False

    # Make sure the mirrored sub-directory exists before writing into it.
    pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    output_path, is_speech = remove_silence(
        model_and_utils,
        audio_path,
        output_path,
        trim_just_beginning_and_end=args.trim_just_beginning_and_end,
        use_cuda=args.use_cuda,
    )
    return output_path, is_speech
|
|
|
|
|
def preprocess_audios():
    """Run silence removal over every file matched by the CLI glob.

    Uses the module-level ``args``. Matching files are handled either through a
    ``multiprocessing.Pool`` (when ``args.num_processes > 1``) or one by one.
    Every output path for which ``adjust_path_and_remove_silence`` returned a
    false ``is_speech`` value is then written, one per line, to
    ``filtered_files.txt`` inside ``args.output_dir``. Because that function
    also returns ``False`` for files it skipped as already existing, those
    paths end up in the list as well.
    """
    pattern = os.path.join(args.input_dir, args.glob)
    files = sorted(glob.glob(pattern, recursive=True))
    print("> Number of files: ", len(files))
    if not args.force:
        print("> Ignoring files that already exist in the output idrectory.")

    print(
        "> Trimming just the beginning and the end with nonspeech parts."
        if args.trim_just_beginning_and_end
        else "> Trimming all nonspeech parts."
    )

    # Nothing matched: report it and stop before touching the output directory.
    if not files:
        print("> No files Found !")
        return

    if args.num_processes > 1:
        with multiprocessing.Pool(processes=args.num_processes) as pool:
            progress = tqdm(
                pool.imap_unordered(adjust_path_and_remove_silence, files),
                total=len(files),
                desc="Processing audio files",
            )
            # Drain the iterator while the pool is still alive.
            outcomes = list(progress)
    else:
        outcomes = [adjust_path_and_remove_silence(path) for path in tqdm(files)]

    filtered_files = [out_path for out_path, is_speech in outcomes if not is_speech]

    report_path = os.path.join(args.output_dir, "filtered_files.txt")
    with open(report_path, "w", encoding="utf-8") as report:
        report.writelines(str(entry) + "\n" for entry in filtered_files)
|
|
|
|
|
def str_to_bool(value):
    """Convert a command-line string into a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value — including ``"False"`` — becomes ``True``. This converter
    interprets the text instead.

    Args:
        value: The raw argument string, or an actual ``bool`` (returned as is).

    Returns:
        ``True`` for "true", "t", "yes", "y", "1"; ``False`` for "false", "f",
        "no", "n", "0" and the empty string (case-insensitive).

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    # "" is kept as False because it was the only value bool() mapped to False.
    if text in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
    )
    parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
    parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
    parser.add_argument(
        "-g",
        "--glob",
        type=str,
        default="**/*.wav",
        help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav",
    )
    # The boolean options take an explicit value (e.g. "-t False"); str_to_bool
    # makes "False" actually parse as False.
    parser.add_argument(
        "-t",
        "--trim_just_beginning_and_end",
        type=str_to_bool,
        default=True,
        help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
    )
    parser.add_argument(
        "-c",
        "--use_cuda",
        type=str_to_bool,
        default=False,
        help="If True use cuda",
    )
    parser.add_argument(
        "--use_onnx",
        type=str_to_bool,
        default=False,
        help="If True use onnx",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=1,
        help="Number of processes to use",
    )
    args = parser.parse_args()

    # Without an explicit output directory, results go next to the inputs.
    if args.output_dir == "":
        args.output_dir = args.input_dir

    # Loaded once at module level; adjust_path_and_remove_silence reads it as a global.
    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
    preprocess_audios()
|
|