# Script counts tokens in a pretokenized dataset from preprocess_data.py
# Necessary for setting batch size, train_iters, etc

import sys
import os

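# Put the project root (two levels above this script's directory) at the front
# of sys.path so the `megatron` package can be imported when this file is run
# directly as a script.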
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, project_root)

from megatron.data import indexed_dataset
import numpy as np

def count_tokens(sizes):
    """Return the total number of tokens given per-document token counts.

    Args:
        sizes: Array-like of per-document lengths (the ``sizes`` attribute of
            an indexed dataset).

    Returns:
        The sum of ``sizes`` as a plain Python ``int``.
    """
    # Accumulate explicitly in 64 bits. Without ``dtype`` NumPy sums in the
    # platform default integer, which can be 32-bit on some platforms and
    # would then silently wrap for corpora larger than ~2.1B tokens.
    return int(np.sum(sizes, dtype=np.int64))


def main(argv=None):
    """Print the token count of every dataset named on the command line.

    Args:
        argv: Dataset path prefixes to inspect. Defaults to ``sys.argv[1:]``.

    Returns:
        Process exit status: 1 when no dataset path was supplied, else 0.
        A dataset that fails to load is reported and skipped; it does not
        change the exit status.
    """
    paths = sys.argv[1:] if argv is None else list(argv)

    if not paths:
        print(
            "Usage: python dataset_token_count.py /absolute/file/path/to/dataset1 /absolute/file/path/to/dataset2 ...",
            file=sys.stderr,
        )
        return 1

    for path in paths:
        print("Checking file", path)
        # Keep the guarded region limited to loading the dataset, so that an
        # AttributeError from an unrelated bug is not reported as a load failure.
        # NOTE(review): make_dataset presumably returns None when the dataset
        # cannot be opened, which surfaces here as AttributeError on `.sizes`
        # -- confirm against megatron.data.indexed_dataset.
        try:
            sizes = indexed_dataset.make_dataset(path, "mmap").sizes
        except AttributeError:
            print("Dataset could not be loaded", path)
            continue
        print("Dataset size in tokens is", count_tokens(sizes))

    return 0


if __name__ == "__main__":
    sys.exit(main())