File size: 1,146 Bytes
d278d9d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
import os
def prepare_enwik8(
    input_file,
    output_dir,
    *,
    total_bytes=100_000_000,
    train_bytes=90_000_000,
    val_bytes=5_000_000,
):
    """Cut the enwik8 train/val/test splits out of an enwik9 dump.

    The first ``total_bytes`` bytes of ``input_file`` are taken as the
    dataset and divided, in order, into three contiguous pieces written to
    ``output_dir`` as ``train.bin``, ``val.bin`` and ``test.bin``. With the
    default sizes that is 90M / 5M / 5M bytes out of the first 100M.

    Args:
        input_file: Path to the raw source file (opened in binary mode).
        output_dir: Directory that receives the three ``.bin`` files; it is
            created if it does not exist.
        total_bytes: Number of leading bytes of ``input_file`` to use.
        train_bytes: Size of the training split.
        val_bytes: Size of the validation split. The test split receives
            whatever remains of ``total_bytes``.

    Raises:
        ValueError: If the requested sizes are negative or inconsistent, or
            if ``input_file`` holds fewer than ``total_bytes`` bytes.
        OSError: If the input cannot be read or the outputs cannot be
            written.
    """
    if (
        min(total_bytes, train_bytes, val_bytes) < 0
        or train_bytes + val_bytes > total_bytes
    ):
        raise ValueError(
            "split sizes must be non-negative and train_bytes + val_bytes "
            f"must not exceed total_bytes (got total={total_bytes:,}, "
            f"train={train_bytes:,}, val={val_bytes:,})"
        )

    with open(input_file, 'rb') as f:
        data = f.read(total_bytes)

    # read() returns fewer bytes when the file ends early; without this
    # check a truncated input would silently produce short or empty splits.
    if len(data) < total_bytes:
        raise ValueError(
            f"{input_file!r} holds only {len(data):,} bytes; "
            f"{total_bytes:,} are required"
        )

    # Create the output directory only once the input is known to be
    # usable, so a failed run leaves nothing behind.
    os.makedirs(output_dir, exist_ok=True)

    # Slice through a memoryview so the splits are windows onto `data`
    # rather than copies of it.
    view = memoryview(data)
    val_end = train_bytes + val_bytes
    splits = {
        'train.bin': view[:train_bytes],
        'val.bin': view[train_bytes:val_end],
        'test.bin': view[val_end:],
    }

    for name, split_data in splits.items():
        with open(os.path.join(output_dir, name), 'wb') as f:
            f.write(split_data)
        print(f"Saved {name} ({len(split_data):,} bytes)")
if __name__ == "__main__":
    # Script entry point: build the splits from the local enwik9 dump and
    # drop them into ./data, then report completion.
    source_path, target_dir = "enwik9/enwik9", "data"
    prepare_enwik8(input_file=source_path, output_dir=target_dir)
    print("Dataset preparation completed!")
|