eyad-silx
/

llm

Model card Files Files and versions Community

llm

File size: 1,146 Bytes

d278d9d

import os

def prepare_enwik8(
    input_file,
    output_dir,
    *,
    train_bytes=90_000_000,
    val_bytes=5_000_000,
    test_bytes=5_000_000,
):
    """
    Prepare enwik8 dataset splits from enwik9.

    Reads the first ``train_bytes + val_bytes + test_bytes`` bytes of
    ``input_file`` (100M with the defaults, i.e. enwik8) and writes them
    into ``output_dir`` as three consecutive, non-overlapping files:
    - train.bin: the first ``train_bytes`` bytes (default 90M)
    - val.bin: the next ``val_bytes`` bytes (default 5M)
    - test.bin: the final ``test_bytes`` bytes (default 5M)

    Args:
        input_file: Path of the file to read; it is opened in binary mode.
        output_dir: Directory that receives the split files. It is created
            (including parents) when it does not exist.
        train_bytes: Size of the training split in bytes.
        val_bytes: Size of the validation split in bytes.
        test_bytes: Size of the test split in bytes.

    Raises:
        ValueError: If a split size is negative, or if ``input_file`` holds
            fewer bytes than the three splits need. No split file is
            written in either case.
        OSError: If the input cannot be read or an output cannot be written.
    """
    # A tuple of pairs keeps the split order explicit: each split starts
    # where the previous one ended.
    split_sizes = (
        ('train.bin', train_bytes),
        ('val.bin', val_bytes),
        ('test.bin', test_bytes),
    )
    for name, size in split_sizes:
        if size < 0:
            # A negative count would make f.read() return the whole file.
            raise ValueError(f"Size for {name} must be non-negative, got {size}")
    total_bytes = sum(size for _, size in split_sizes)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Read only the prefix that the splits need
    with open(input_file, 'rb') as f:
        data = f.read(total_bytes)

    # Fail loudly instead of silently writing truncated or empty splits.
    if len(data) < total_bytes:
        raise ValueError(
            f"{input_file} has only {len(data):,} bytes; "
            f"{total_bytes:,} bytes are required"
        )

    # Slicing a memoryview does not copy, so the payload is held in memory
    # only once even though it is carved into three pieces.
    view = memoryview(data)
    offset = 0
    for name, size in split_sizes:
        with open(os.path.join(output_dir, name), 'wb') as f:
            f.write(view[offset:offset + size])
        print(f"Saved {name} ({size:,} bytes)")
        offset += size

if __name__ == "__main__":
    # Script entry point: read the enwik9 dump from its default location
    # and write the train/val/test splits into ./data.
    input_file, output_dir = "enwik9/enwik9", "data"
    prepare_enwik8(input_file, output_dir)
    print("Dataset preparation completed!")