artst-demo-asr / SpeechT5 /SpeechLM /speechlm /data_process /get_t2u_manifest_textonly.py
amupd's picture
SpeechT5 upload
62e9ca6
raw
history blame
2.45 kB
# ----------------------------------------------------------------------------
# SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data (https://arxiv.org/abs/2209.15329)
# Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechLM
# Code based on fairseq: https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4
#
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# ----------------------------------------------------------------------------
import argparse
import logging
from pathlib import Path
from collections import defaultdict
import pandas as pd
from tqdm import tqdm
import numpy as np
from examples.speech_to_text.data_utils import save_df_to_tsv
log = logging.getLogger(__name__)
def get_duration(fa_phone):
"""fa_phone: force-aligned phone, 1-D numpy"""
same = np.concatenate(([True], fa_phone[:-1] != fa_phone[1:], [True]))
index = np.where(same)[0]
count = np.diff(index)
return count
def process(args):
# assert "train" in args.splits
out_root = Path(args.output_root).absolute()
out_root.mkdir(exist_ok=True)
print("Fetching data...")
audio_manifest_root = Path(args.audio_manifest_root).absolute()
for s in args.splits:
manifest = defaultdict(list)
with open(audio_manifest_root / f"{s}.phn") as f1:
for i, reduced_phone in tqdm(enumerate(f1)):
reduced_phone = reduced_phone.strip()
uttid = f"librilm-{i}"
speaker = uttid.split("-")[0]
manifest["id"].append(uttid)
manifest["speaker"].append(speaker)
manifest["n_frames"].append(len(reduced_phone))
manifest["tgt_text"].append(reduced_phone)
manifest["unit"].append(0)
save_df_to_tsv(
pd.DataFrame.from_dict(manifest),
out_root / f"{s}.tsv"
)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--audio-manifest-root", "-m", type=str)
parser.add_argument("--output-root", "-o", required=True, type=str)
parser.add_argument("--splits", "-s", type=str, nargs="+",
default=["train", "dev", "test"])
parser.add_argument("--add-fastspeech-targets", action="store_true")
args = parser.parse_args()
process(args)
if __name__ == "__main__":
main()