khoicrtp
/

test_model

Model card Files Files and versions Community

test_model / scripts /prepare_shakespeare.py

khoicrtp's picture

init

12001a9 over 1 year ago

2.66 kB

	# MIT License

	# Copyright (c) 2022 Andrej Karpathy

	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:

	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.
	import sys
	from pathlib import Path

	# Support running without installing as a package: `wd` is the directory two
	# levels above this file (the parent of the directory containing this
	# script). It is appended to sys.path before the remaining imports --
	# presumably so that the project-local `lit_llama` package imported inside
	# `prepare` can be found there; confirm against the repository layout.
	wd = Path(__file__).parent.parent.resolve()
	sys.path.append(str(wd))

	import numpy as np
	import requests


	def prepare(destination_path: Path = Path("data/shakespeare")) -> None:
	    """Prepare the "Tiny Shakespeare" dataset.

	    Downloads the corpus to ``destination_path / "input.txt"`` unless that file
	    already exists, trains a tokenizer on it, encodes a 90% / 10%
	    train / validation split of the text and writes the token ids to
	    ``train.bin`` and ``val.bin`` as ``uint16`` arrays.

	    Args:
	        destination_path: Directory that receives ``input.txt``, the tokenizer
	            output and the two ``.bin`` files. It is created, including
	            parents, when missing.

	    Raises:
	        requests.RequestException: If the download times out, fails, or the
	            server answers with an HTTP error status.
	    """
	    destination_path.mkdir(parents=True, exist_ok=True)

	    # download the tiny shakespeare dataset
	    input_file_path = destination_path / "input.txt"
	    if not input_file_path.exists():
	        data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
	        # Fetch and validate the response BEFORE opening the output file:
	        # otherwise a failed request leaves an empty input.txt behind, and the
	        # exists() check above would skip the download on every later run.
	        # The timeout keeps a stalled connection from hanging forever.
	        response = requests.get(data_url, timeout=60)
	        # Refuse to store an HTTP error page as if it were the corpus.
	        response.raise_for_status()
	        with open(input_file_path, "w", encoding="utf-8") as f:
	            f.write(response.text)

	    # Explicit encoding so reading does not depend on the platform locale.
	    with open(input_file_path, encoding="utf-8") as f:
	        data = f.read()

	    # First 90% of the characters are training data, the rest validation.
	    split_index = int(len(data) * 0.9)
	    train_data = data[:split_index]
	    val_data = data[split_index:]

	    # Imported here rather than at module top, after the module-level
	    # sys.path tweak has run.
	    from lit_llama import Tokenizer

	    # Train on the raw text file, then load the model from
	    # destination_path / "tokenizer.model" (assumes Tokenizer.train writes
	    # it there -- confirm against lit_llama).
	    Tokenizer.train(input=input_file_path, destination=destination_path, vocab_size=100)
	    tokenizer = Tokenizer(destination_path / "tokenizer.model")
	    train_ids = tokenizer.encode(train_data)
	    val_ids = tokenizer.encode(val_data)
	    print(f"train has {len(train_ids):,} tokens")
	    print(f"val has {len(val_ids):,} tokens")

	    # export to bin files
	    # uint16 holds ids up to 65535, ample for the vocab_size=100 used above.
	    train_ids = np.array(train_ids, dtype=np.uint16)
	    val_ids = np.array(val_ids, dtype=np.uint16)
	    train_ids.tofile(destination_path / "train.bin")
	    val_ids.tofile(destination_path / "val.bin")


	if __name__ == "__main__":
	    # Script entry point: hand `prepare` to jsonargparse's CLI helper, which
	    # builds the command-line interface and invokes the function.
	    import jsonargparse

	    jsonargparse.CLI(prepare)