"""Print the total token count of one or more indexed datasets.

Each command-line argument is passed to
``megatron.data.indexed_dataset.make_dataset`` with the ``"mmap"``
implementation, and the sum of the resulting dataset's ``sizes`` is printed.

Usage:
    python dataset_token_count.py /absolute/file/path/to/dataset1 /absolute/file/path/to/dataset2 ...
"""

import os
import sys

import numpy as np

# Put the directory two levels above this file at the front of sys.path so
# that the `megatron` package can be imported when this script is run directly.
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, project_root)

USAGE = (
    "Usage: python dataset_token_count.py "
    "/absolute/file/path/to/dataset1 /absolute/file/path/to/dataset2 ..."
)


def count_tokens(sizes):
    """Return the sum of *sizes* accumulated as a 64-bit integer.

    Args:
        sizes: Array-like of per-entry sizes (anything ``np.sum`` accepts).

    Returns:
        The total as ``np.int64``; ``0`` for an empty input.
    """
    # Without an explicit dtype, NumPy accumulates narrow signed integers in
    # the platform default integer, which is not guaranteed to be 64-bit, so a
    # large total could silently wrap around.
    return np.sum(sizes, dtype=np.int64)


def main(argv=None):
    """Report the token count of every dataset path given.

    Args:
        argv: Dataset paths to check. Defaults to ``sys.argv[1:]``.

    Returns:
        Process exit status: ``1`` when no path was supplied, else ``0``
        (a dataset that fails to load is reported but does not change it).
    """
    paths = sys.argv[1:] if argv is None else list(argv)
    if not paths:
        print(USAGE)
        return 1

    # Imported here rather than at module top because it relies on the
    # sys.path entry inserted above.
    from megatron.data import indexed_dataset

    for path in paths:
        print("Checking file", path)
        try:
            dataset = indexed_dataset.make_dataset(path, "mmap")
            # NOTE(review): presumably make_dataset returns an object without
            # `.sizes` (e.g. None) when the dataset cannot be opened, which is
            # what the AttributeError handler below relies on -- confirm.
            sizes = dataset.sizes
        except AttributeError:
            print("Dataset could not be loaded", path)
            continue
        print("Dataset size in tokens is", count_tokens(sizes))
    return 0


if __name__ == "__main__":
    sys.exit(main())