"""Merge every indexed dataset found in a directory into one dataset.

Each dataset is a pair of files sharing a prefix: ``<prefix>.bin`` and
``<prefix>.idx``.  All pairs found in ``--input`` are merged, in sorted
prefix order, into ``<output-prefix>.bin`` / ``<output-prefix>.idx``.
"""

import os
import sys
import json
import argparse

# Add the directory two levels above this script to the import path;
# presumably this is what lets ``megatron`` resolve when run as a script.
sys.path.append(
    os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)
    )
)

from megatron.data import indexed_dataset


def _collect_prefixes(input_dir):
    """Return the set of dataset prefixes (file names minus extension) in *input_dir*.

    Every regular file in the directory is inspected.  A ``.idx`` file must
    have a sibling ``.bin`` file; any other file must have a sibling ``.idx``
    file.  Sub-directories are ignored.

    Raises:
        AssertionError: if a file's counterpart is missing.
    """
    prefixes = set()
    for basename in os.listdir(input_dir):
        prefix, ext = os.path.splitext(basename)
        # The second file of an already-validated pair needs no re-check.
        if prefix in prefixes:
            continue
        if not os.path.isfile(os.path.join(input_dir, basename)):
            continue

        ext_pair = ".bin" if ext == ".idx" else ".idx"
        pair_base = os.path.join(input_dir, prefix)
        # Raised explicitly (instead of ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        if not os.path.isfile(pair_base + ext_pair):
            raise AssertionError(
                f"ERROR: {ext_pair} file not provided for {pair_base}"
            )

        prefixes.add(prefix)
    return prefixes


def main(args):
    """Merge all datasets under ``args.input`` into ``args.output_prefix``.

    Args:
        args: namespace with ``input`` (directory holding the ``.bin``/``.idx``
            pairs) and ``output_prefix`` (output path without extension).

    Raises:
        AssertionError: if a dataset file has no counterpart, or if the input
            directory contains no dataset files at all.
    """
    prefixes = _collect_prefixes(args.input)
    if not prefixes:
        # Without this guard no builder is ever created and the final
        # ``finalize`` call fails with an opaque AttributeError on None.
        raise AssertionError(f"ERROR: no dataset files found in {args.input}")

    builder = None
    for prefix in sorted(prefixes):
        dataset_path = os.path.join(args.input, prefix)
        if builder is None:
            # Open the first dataset only to decide which builder class
            # (and, for mmap datasets, which dtype) the output should use.
            dataset = indexed_dataset.make_dataset(dataset_path, "infer")

            if isinstance(dataset, indexed_dataset.MMapIndexedDataset):
                builder = indexed_dataset.MMapIndexedDatasetBuilder(
                    args.output_prefix + ".bin", dtype=dataset._index.dtype
                )
            else:
                builder = indexed_dataset.IndexedDatasetBuilder(
                    args.output_prefix + ".bin"
                )

            # The probe dataset is no longer needed; drop the reference.
            del dataset

        builder.merge_file_(dataset_path)

    builder.finalize(args.output_prefix + ".idx")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    group = parser.add_argument_group(title="input data")
    group.add_argument(
        "--input",
        type=str,
        required=True,
        help="Path to directory containing all document files to merge",
    )

    group = parser.add_argument_group(title="output data")
    group.add_argument(
        "--output-prefix",
        type=str,
        required=True,
        help="Path to binary output file without suffix",
    )

    args = parser.parse_args()

    if not os.path.isdir(args.input):
        raise AssertionError(
            f"ERROR: {args.input} is not a directory or does not exist"
        )

    # ``dirname`` is "" for a bare prefix such as "merged"; treat that as the
    # current directory rather than rejecting it as a missing directory.
    output_dir = os.path.dirname(args.output_prefix) or os.curdir
    if not os.path.isdir(output_dir):
        raise AssertionError(
            f"ERROR: {output_dir} is not a directory or does not exist"
        )

    main(args)