|
import unittest |
|
from pathlib import Path |
|
from hindi_bpe import HindiBPE, preprocess_hindi_text |
|
from data_loader import load_hindi_dataset |
|
import pandas as pd |
|
import plotly.express as px |
|
from typing import List, Dict |
|
import time |
|
|
|
class HindiBPETest: |
|
def __init__(self, vocab_size: int = 4500, num_articles: int = 1000): |
|
self.vocab_size = vocab_size |
|
self.num_articles = num_articles |
|
self.bpe = None |
|
self.text = None |
|
self.encoded = None |
|
self.stats = {} |
|
|
|
def load_data(self) -> str: |
|
"""Load and preprocess the dataset.""" |
|
print("\nStep 1: Loading dataset...") |
|
start_time = time.time() |
|
|
|
|
|
self.text = load_hindi_dataset( |
|
split="train", |
|
num_files=self.num_articles |
|
) |
|
self.text = preprocess_hindi_text(self.text) |
|
|
|
|
|
self.valid_text = load_hindi_dataset( |
|
split="valid", |
|
num_files=min(self.num_articles // 5, 100) |
|
) |
|
|
|
self.stats['load_time'] = time.time() - start_time |
|
self.stats['original_length'] = len(self.text) |
|
self.stats['valid_length'] = len(self.valid_text) |
|
print(f"Loading completed in {self.stats['load_time']:.2f} seconds") |
|
return self.text |
|
|
|
def train_tokenizer(self) -> HindiBPE: |
|
"""Train the BPE tokenizer.""" |
|
print("\nStep 2: Training BPE tokenizer...") |
|
start_time = time.time() |
|
|
|
self.bpe = HindiBPE(vocab_size=self.vocab_size) |
|
self.bpe.train(self.text) |
|
|
|
self.stats['train_time'] = time.time() - start_time |
|
self.stats['vocab_size'] = len(self.bpe.vocab) |
|
print(f"Training completed in {self.stats['train_time']:.2f} seconds") |
|
return self.bpe |
|
|
|
def encode_text(self) -> List[str]: |
|
"""Encode the text using trained tokenizer.""" |
|
print("\nStep 3: Encoding text...") |
|
start_time = time.time() |
|
|
|
self.encoded = self.bpe.encode(self.text) |
|
|
|
self.stats['encode_time'] = time.time() - start_time |
|
self.stats['encoded_length'] = sum(len(token) for token in self.encoded) |
|
self.stats['compression_ratio'] = self.stats['original_length'] / self.stats['encoded_length'] |
|
print(f"Encoding completed in {self.stats['encode_time']:.2f} seconds") |
|
return self.encoded |
|
|
|
def save_visualizations(self, output_dir: str = "output"): |
|
"""Generate and save visualizations.""" |
|
print("\nStep 4: Generating visualizations...") |
|
output_dir = Path(output_dir) |
|
output_dir.mkdir(exist_ok=True) |
|
|
|
|
|
token_lengths = [len(token) for token in self.bpe.vocab] |
|
df = pd.DataFrame({'Length': token_lengths}) |
|
fig = px.histogram(df, x='Length', |
|
title='Token Length Distribution', |
|
labels={'Length': 'Token Length', 'count': 'Frequency'}) |
|
fig.write_html(output_dir / "token_distribution.html") |
|
|
|
|
|
comp_df = pd.DataFrame({ |
|
'Stage': ['Original', 'Encoded'], |
|
'Size': [self.stats['original_length'], self.stats['encoded_length']] |
|
}) |
|
fig = px.bar(comp_df, x='Stage', y='Size', |
|
title='Text Compression Comparison') |
|
fig.write_html(output_dir / "compression.html") |
|
|
|
|
|
pd.DataFrame([self.stats]).to_csv(output_dir / "stats.csv") |
|
print(f"Visualizations saved to {output_dir}") |
|
|
|
def print_summary(self): |
|
"""Print summary of the tokenization process.""" |
|
print("\nTokenization Summary:") |
|
print("-" * 50) |
|
print(f"Dataset size: {self.stats['original_length']:,} characters") |
|
print(f"Vocabulary size: {self.stats['vocab_size']:,} tokens") |
|
print(f"Compression ratio: {self.stats['compression_ratio']:.2f}") |
|
print(f"\nProcessing times:") |
|
print(f"Loading: {self.stats['load_time']:.2f} seconds") |
|
print(f"Training: {self.stats['train_time']:.2f} seconds") |
|
print(f"Encoding: {self.stats['encode_time']:.2f} seconds") |
|
|
|
def run_full_pipeline(self) -> Dict: |
|
"""Run the complete tokenization pipeline.""" |
|
self.load_data() |
|
self.train_tokenizer() |
|
self.encode_text() |
|
self.save_visualizations() |
|
self.print_summary() |
|
return self.stats |
|
|
|
def main():
    """Run the full tokenization pipeline and sanity-check the results."""
    runner = HindiBPETest(vocab_size=4500, num_articles=1000)
    pipeline_stats = runner.run_full_pipeline()

    # Round-trip a short Hindi sample through the trained tokenizer.
    sample_text = """
    भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।
    हिंदी भारत की प्रमुख भाषाओं में से एक है।
    """

    print("\nTesting tokenization on sample text:")
    tokens = runner.bpe.encode(sample_text)
    print(f"Original text: {sample_text}")
    print(f"Tokens: {tokens}")
    decoded = runner.bpe.decode(tokens)
    print(f"Decoded text: {decoded}")

    # Report whether the compression target was met.
    met_ratio = pipeline_stats['compression_ratio'] >= 3.2
    print("\nSuccess: Achieved required compression ratio ≥ 3.2" if met_ratio
          else "\nWarning: Compression ratio below target 3.2")

    # Report whether the vocabulary stayed under the limit.
    if pipeline_stats['vocab_size'] < 5000:
        print("Success: Vocabulary size within limit < 5000")
    else:
        print("Warning: Vocabulary size exceeds limit")
|
|
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":

    main()