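"""End-to-end test harness for the Hindi BPE tokenizer.

Loads a Hindi text dataset, trains a BPE tokenizer, encodes the corpus,
reports timing and compression statistics, and writes Plotly visualizations
to an output directory.
"""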
from pathlib import Path
from hindi_bpe import HindiBPE, preprocess_hindi_text
from data_loader import load_hindi_dataset
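# External interfaces assumed from usage in this file (defined elsewhere in
# the repo): HindiBPE(vocab_size).train(text), .encode(text) -> List[str],
# .decode(tokens) -> str; preprocess_hindi_text(text) -> str;
# load_hindi_dataset(split=..., num_files=...) -> str.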
import pandas as pd
import plotly.express as px
from typing import List, Dict
import time
class HindiBPETest:
    def __init__(self, vocab_size: int = 4500, num_articles: int = 1000):
        self.vocab_size = vocab_size
        self.num_articles = num_articles
        self.bpe = None
        self.text = None
        self.valid_text = None
        self.encoded = None
        self.stats = {}
    def load_data(self) -> str:
        """Load and preprocess the dataset."""
        print("\nStep 1: Loading dataset...")
        start_time = time.time()
        # Load the train split
        self.text = load_hindi_dataset(
            split="train",
            num_files=self.num_articles
        )
        self.text = preprocess_hindi_text(self.text)
        # Validation text for held-out testing: 20% of the train size, capped at 100
        self.valid_text = load_hindi_dataset(
            split="valid",
            num_files=min(self.num_articles // 5, 100)
        )
        self.stats['load_time'] = time.time() - start_time
        self.stats['original_length'] = len(self.text)
        self.stats['valid_length'] = len(self.valid_text)
        print(f"Loading completed in {self.stats['load_time']:.2f} seconds")
        return self.text
    def train_tokenizer(self) -> HindiBPE:
        """Train the BPE tokenizer."""
        print("\nStep 2: Training BPE tokenizer...")
        start_time = time.time()
        self.bpe = HindiBPE(vocab_size=self.vocab_size)
        self.bpe.train(self.text)
        self.stats['train_time'] = time.time() - start_time
        self.stats['vocab_size'] = len(self.bpe.vocab)
        print(f"Training completed in {self.stats['train_time']:.2f} seconds")
        return self.bpe
    def encode_text(self) -> List[str]:
        """Encode the text using the trained tokenizer."""
        print("\nStep 3: Encoding text...")
        start_time = time.time()
        self.encoded = self.bpe.encode(self.text)
        self.stats['encode_time'] = time.time() - start_time
        self.stats['encoded_length'] = len(self.encoded)  # number of tokens
        # Compression ratio = input characters per output token; summing token
        # lengths would just reproduce the character count and always give ~1.
        self.stats['compression_ratio'] = self.stats['original_length'] / self.stats['encoded_length']
        print(f"Encoding completed in {self.stats['encode_time']:.2f} seconds")
        return self.encoded
    def save_visualizations(self, output_dir: str = "output"):
        """Generate and save visualizations."""
        print("\nStep 4: Generating visualizations...")
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True)
        # Token length distribution (assumes self.bpe.vocab iterates over token strings)
        token_lengths = [len(token) for token in self.bpe.vocab]
        df = pd.DataFrame({'Length': token_lengths})
        fig = px.histogram(df, x='Length',
                           title='Token Length Distribution',
                           labels={'Length': 'Token Length'})
        fig.update_yaxes(title_text='Frequency')
        fig.write_html(output_dir / "token_distribution.html")
        # Compression comparison: characters in vs. tokens out
        comp_df = pd.DataFrame({
            'Stage': ['Original (chars)', 'Encoded (tokens)'],
            'Size': [self.stats['original_length'], self.stats['encoded_length']]
        })
        fig = px.bar(comp_df, x='Stage', y='Size',
                     title='Text Compression Comparison')
        fig.write_html(output_dir / "compression.html")
        # Save statistics to CSV
        pd.DataFrame([self.stats]).to_csv(output_dir / "stats.csv", index=False)
        print(f"Visualizations saved to {output_dir}")
    def print_summary(self):
        """Print a summary of the tokenization process."""
        print("\nTokenization Summary:")
        print("-" * 50)
        print(f"Dataset size: {self.stats['original_length']:,} characters")
        print(f"Vocabulary size: {self.stats['vocab_size']:,} tokens")
        print(f"Compression ratio: {self.stats['compression_ratio']:.2f} chars/token")
        print("\nProcessing times:")
        print(f"Loading: {self.stats['load_time']:.2f} seconds")
        print(f"Training: {self.stats['train_time']:.2f} seconds")
        print(f"Encoding: {self.stats['encode_time']:.2f} seconds")
    def run_full_pipeline(self) -> Dict:
        """Run the complete tokenization pipeline."""
        self.load_data()
        self.train_tokenizer()
        self.encode_text()
        self.save_visualizations()
        self.print_summary()
        return self.stats
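# Example driver: runs the full pipeline on the train split, spot-checks
# encode/decode round-tripping on a small Hindi sample, and verifies the two
# acceptance targets (compression ratio >= 3.2, vocabulary size < 5000).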
def main():
    test = HindiBPETest(vocab_size=4500, num_articles=1000)
    stats = test.run_full_pipeline()
    # Test tokenization on a sample text
    sample_text = """
    भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।
    हिंदी भारत की प्रमुख भाषाओं में से एक है।
    """
    print("\nTesting tokenization on sample text:")
    tokens = test.bpe.encode(sample_text)
    print(f"Original text: {sample_text}")
    print(f"Tokens: {tokens}")
    decoded = test.bpe.decode(tokens)
    print(f"Decoded text: {decoded}")
    # Verify compression ratio requirement
    if stats['compression_ratio'] >= 3.2:
        print("\nSuccess: Achieved required compression ratio ≥ 3.2")
    else:
        print("\nWarning: Compression ratio below target 3.2")
    # Verify vocabulary size requirement
    if stats['vocab_size'] < 5000:
        print("Success: Vocabulary size within limit < 5000")
    else:
        print("Warning: Vocabulary size exceeds limit")
if __name__ == "__main__":
    main()