Spaces:

MBZUAI
/

artst-demo-asr

Runtime error

App Files Files Community

artst-demo-asr / SpeechT5 /SpeechLM /speechlm /data_process /get_t2u_manifest_textonly.py

amupd

SpeechT5 upload

62e9ca6 about 1 year ago

raw

history blame

2.45 kB

	# ----------------------------------------------------------------------------
	# SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data (https://arxiv.org/abs/2209.15329)
	# Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechLM
	# Code based on fairseq: https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4
	#
	# Copyright (c) 2022 Microsoft
	# Licensed under The MIT License [see LICENSE for details]
	# ----------------------------------------------------------------------------

	import argparse
	import logging
	from pathlib import Path
	from collections import defaultdict

	import pandas as pd
	from tqdm import tqdm
	import numpy as np
	from examples.speech_to_text.data_utils import save_df_to_tsv


	# Module-level logger named after this module; no code visible in this file
	# logs through it (progress is reported with print/tqdm instead).
	log = logging.getLogger(__name__)

	def get_duration(fa_phone):
	    """Return the length of each run of identical consecutive phones.

	    Args:
	        fa_phone: force-aligned phone sequence; a 1-D numpy array or any
	            1-D sequence accepted by ``np.asarray``.

	    Returns:
	        A 1-D integer numpy array with one entry per run, in order of
	        appearance. The entries sum to ``len(fa_phone)``. An empty input
	        yields an empty array.
	    """
	    fa_phone = np.asarray(fa_phone)
	    if fa_phone.size == 0:
	        # Without this guard the two sentinel ``True`` values below would be
	        # adjacent and a single bogus run of length 1 would be reported.
	        return np.zeros(0, dtype=np.intp)
	    # True wherever a new run starts, plus one sentinel past the end so the
	    # final run is closed as well.
	    boundary = np.concatenate(([True], fa_phone[:-1] != fa_phone[1:], [True]))
	    index = np.where(boundary)[0]
	    # Distance between consecutive run starts == run length.
	    return np.diff(index)

	def process(args):
	    """Build one text-only manifest TSV per split from ``<split>.phn`` files.

	    For every split name in ``args.splits`` the file
	    ``<args.audio_manifest_root>/<split>.phn`` is read line by line and one
	    row is recorded per line with the columns ``id``, ``speaker``,
	    ``n_frames``, ``tgt_text`` and ``unit``. The rows are then written to
	    ``<args.output_root>/<split>.tsv`` through ``save_df_to_tsv``.

	    Args:
	        args: namespace providing ``output_root``, ``audio_manifest_root``
	            and ``splits`` (an iterable of split names).
	    """
	    # assert "train" in args.splits
	    out_root = Path(args.output_root).absolute()
	    # parents=True so a nested, not-yet-existing output path does not fail.
	    out_root.mkdir(parents=True, exist_ok=True)

	    print("Fetching data...")
	    audio_manifest_root = Path(args.audio_manifest_root).absolute()
	    # Every utterance id is "librilm-<line index>", so the speaker field
	    # (the part of the id before the first "-") is the same for all rows.
	    corpus = "librilm"
	    for s in args.splits:
	        manifest = defaultdict(list)
	        # Explicit encoding: do not depend on the platform's locale default.
	        with open(audio_manifest_root / f"{s}.phn", encoding="utf-8") as f1:
	            for i, reduced_phone in tqdm(enumerate(f1)):
	                reduced_phone = reduced_phone.strip()

	                manifest["id"].append(f"{corpus}-{i}")
	                manifest["speaker"].append(corpus)
	                # NOTE(review): this is the character length of the stripped
	                # line, not a count of space-separated symbols -- confirm
	                # that is what consumers of ``n_frames`` expect.
	                manifest["n_frames"].append(len(reduced_phone))
	                manifest["tgt_text"].append(reduced_phone)
	                # Text-only data: the unit column is filled with 0.
	                manifest["unit"].append(0)
	        save_df_to_tsv(
	            pd.DataFrame.from_dict(manifest),
	            out_root / f"{s}.tsv",
	        )

	def main(argv=None):
	    """Parse command-line options and run ``process`` with them.

	    Args:
	        argv: optional list of argument strings. When ``None`` (the
	            default) argparse reads ``sys.argv[1:]``.

	    Raises:
	        SystemExit: raised by argparse when the arguments are invalid or a
	            required option is missing.
	    """
	    parser = argparse.ArgumentParser()
	    # Required: process() builds a Path from this value unconditionally, so
	    # omitting it previously ended in an opaque TypeError instead of a
	    # usage message.
	    parser.add_argument(
	        "--audio-manifest-root", "-m", required=True, type=str
	    )
	    parser.add_argument("--output-root", "-o", required=True, type=str)
	    parser.add_argument("--splits", "-s", type=str, nargs="+",
	                        default=["train", "dev", "test"])
	    # Accepted on the command line; process() in this file does not read it.
	    parser.add_argument("--add-fastspeech-targets", action="store_true")
	    args = parser.parse_args(argv)

	    process(args)

	# Run the command-line entry point only when executed as a script, not
	# when the module is imported.
	if __name__ == "__main__":
	    main()