import sys
from pathlib import Path
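
# Make the repository root importable so the local `lit_llama` package
# resolves when this script is run directly.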
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

import numpy as np
import requests


def prepare(destination_path: Path = Path("data/shakespeare")) -> None:
    """Prepare the "Tiny Shakespeare" dataset."""
    destination_path.mkdir(parents=True, exist_ok=True)
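
    # Download the raw Tiny Shakespeare text once; later runs reuse the cached file.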
    input_file_path = destination_path / "input.txt"
    if not input_file_path.exists():
        data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
        with open(input_file_path, "w") as f:
            f.write(requests.get(data_url).text)
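
    # Read the text back and split it 90/10 into train and validation partitions.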
    with open(input_file_path) as f:
        data = f.read()
    n = len(data)
    train_data = data[: int(n * 0.9)]
    val_data = data[int(n * 0.9) :]

    from lit_llama import Tokenizer
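
    # Train a small tokenizer (vocab size 100) on the raw text, then encode
    # both splits into token ids.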
    Tokenizer.train(input=input_file_path, destination=destination_path, vocab_size=100)
    tokenizer = Tokenizer(destination_path / "tokenizer.model")
    train_ids = tokenizer.encode(train_data)
    val_ids = tokenizer.encode(val_data)
    print(f"train has {len(train_ids):,} tokens")
    print(f"val has {len(val_ids):,} tokens")
    train_ids = np.array(train_ids, dtype=np.uint16)
    val_ids = np.array(val_ids, dtype=np.uint16)
    train_ids.tofile(destination_path / "train.bin")
    val_ids.tofile(destination_path / "val.bin")


if __name__ == "__main__":
    from jsonargparse import CLI
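
    # jsonargparse exposes `prepare`'s parameters as command-line flags,
    # e.g. `--destination_path data/shakespeare` (flag name assumed from
    # the function signature).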
    CLI(prepare)