"""Build the enwik8 train/val/test byte splits from an enwik9 dump."""

import os


def prepare_enwik8(
    input_file: str,
    output_dir: str,
    *,
    total_bytes: int = 100_000_000,
    train_bytes: int = 90_000_000,
    val_bytes: int = 5_000_000,
) -> None:
    """Write train/val/test splits of the first ``total_bytes`` of a file.

    The first ``total_bytes`` bytes of ``input_file`` are read and cut into
    three consecutive pieces which are written to ``output_dir`` as
    ``train.bin``, ``val.bin`` and ``test.bin``. With the default sizes this
    is the enwik8 layout: 90M bytes of training data followed by 5M bytes
    each for validation and test.

    Args:
        input_file: Path of the source file (e.g. enwik9), read as raw bytes.
        output_dir: Directory that receives the three ``.bin`` files. It is
            created, including missing parents, if it does not exist.
        total_bytes: Number of leading bytes of ``input_file`` to use.
        train_bytes: Size of the training split.
        val_bytes: Size of the validation split. The test split receives
            the remaining ``total_bytes - train_bytes - val_bytes`` bytes.

    Raises:
        ValueError: If the split sizes are negative or exceed
            ``total_bytes``, or if ``input_file`` holds fewer than
            ``total_bytes`` bytes.
        OSError: If the input cannot be read or the outputs cannot be
            written.
    """
    if train_bytes < 0 or val_bytes < 0 or train_bytes + val_bytes > total_bytes:
        raise ValueError(
            "train_bytes and val_bytes must be non-negative and sum to at "
            f"most total_bytes (got train_bytes={train_bytes}, "
            f"val_bytes={val_bytes}, total_bytes={total_bytes})"
        )

    with open(input_file, 'rb') as f:
        data = f.read(total_bytes)

    # read() returns fewer bytes at end-of-file; without this check a short
    # input would silently yield truncated or empty splits.
    if len(data) < total_bytes:
        raise ValueError(
            f"{input_file} contains only {len(data):,} bytes; "
            f"{total_bytes:,} are required"
        )

    # Created only after validation so a failed run leaves nothing behind.
    os.makedirs(output_dir, exist_ok=True)

    # Slicing a memoryview avoids copying the buffer a second time.
    view = memoryview(data)
    val_end = train_bytes + val_bytes
    splits = {
        'train.bin': view[:train_bytes],
        'val.bin': view[train_bytes:val_end],
        'test.bin': view[val_end:],
    }

    for name, split_data in splits.items():
        with open(os.path.join(output_dir, name), 'wb') as f:
            f.write(split_data)
        print(f"Saved {name} ({len(split_data):,} bytes)")


if __name__ == "__main__":
    input_file = "enwik9/enwik9"
    output_dir = "data"
    prepare_enwik8(input_file, output_dir)
    print("Dataset preparation completed!")