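"""End-to-end test harness for the Hindi BPE tokenizer: load and
preprocess a Hindi corpus, train the tokenizer, encode the text, and
report compression statistics and visualizations."""
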
import time
from pathlib import Path
from typing import Dict, List

import pandas as pd
import plotly.express as px

from data_loader import load_hindi_dataset
from hindi_bpe import HindiBPE, preprocess_hindi_text

class HindiBPETest:
    """Pipeline harness: load data, train BPE, encode, and report stats."""

    def __init__(self, vocab_size: int = 4500, num_articles: int = 1000):
        self.vocab_size = vocab_size
        self.num_articles = num_articles
        self.bpe = None
        self.text = None
        self.valid_text = None
        self.encoded = None
        self.stats = {}
        
    def load_data(self) -> str:
        """Load and preprocess the dataset."""
        print("\nStep 1: Loading dataset...")
        start_time = time.time()
        
        # Load train split
        self.text = load_hindi_dataset(
            split="train",
            num_files=self.num_articles
        )
        self.text = preprocess_hindi_text(self.text)
        
        # Hold out validation text, preprocessed the same way as train
        # (20% of the train article count, capped at 100 files)
        self.valid_text = load_hindi_dataset(
            split="valid",
            num_files=min(self.num_articles // 5, 100)
        )
        self.valid_text = preprocess_hindi_text(self.valid_text)
        
        self.stats['load_time'] = time.time() - start_time
        self.stats['original_length'] = len(self.text)
        self.stats['valid_length'] = len(self.valid_text)
        print(f"Loading completed in {self.stats['load_time']:.2f} seconds")
        return self.text
    
    def train_tokenizer(self) -> HindiBPE:
        """Train the BPE tokenizer."""
        print("\nStep 2: Training BPE tokenizer...")
        start_time = time.time()
        
        self.bpe = HindiBPE(vocab_size=self.vocab_size)
        self.bpe.train(self.text)
        
        self.stats['train_time'] = time.time() - start_time
        self.stats['vocab_size'] = len(self.bpe.vocab)
        print(f"Training completed in {self.stats['train_time']:.2f} seconds")
        return self.bpe
    
    def encode_text(self) -> List[str]:
        """Encode the training text with the trained tokenizer."""
        print("\nStep 3: Encoding text...")
        start_time = time.time()
        
        self.encoded = self.bpe.encode(self.text)
        
        self.stats['encode_time'] = time.time() - start_time
        # Use the token count, not summed token lengths: concatenated token
        # lengths simply reproduce the original character count, so
        # characters per token is the meaningful compression measure.
        self.stats['encoded_length'] = len(self.encoded)
        self.stats['compression_ratio'] = self.stats['original_length'] / self.stats['encoded_length']
        print(f"Encoding completed in {self.stats['encode_time']:.2f} seconds")
        return self.encoded
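
    def evaluate_validation(self) -> float:
        """Sketch: measure compression on the held-out validation split.

        The valid split is loaded in load_data() but otherwise unused; this
        assumes HindiBPE.encode accepts any preprocessed Hindi text. Not
        part of run_full_pipeline(); call it separately if wanted.
        """
        valid_tokens = self.bpe.encode(self.valid_text)
        ratio = self.stats['valid_length'] / max(len(valid_tokens), 1)
        self.stats['valid_compression_ratio'] = ratio
        print(f"Validation compression ratio: {ratio:.2f} chars/token")
        return ratio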
    
    def save_visualizations(self, output_dir: str = "output"):
        """Generate and save visualizations."""
        print("\nStep 4: Generating visualizations...")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        
        # Token length distribution across the learned vocabulary
        token_lengths = [len(token) for token in self.bpe.vocab]
        df = pd.DataFrame({'Length': token_lengths})
        fig = px.histogram(df, x='Length',
                           title='Token Length Distribution',
                           labels={'Length': 'Token Length', 'count': 'Frequency'})
        fig.write_html(output_dir / "token_distribution.html")
        
        # Compression: original character count vs. encoded token count
        comp_df = pd.DataFrame({
            'Stage': ['Original (chars)', 'Encoded (tokens)'],
            'Size': [self.stats['original_length'], self.stats['encoded_length']]
        })
        fig = px.bar(comp_df, x='Stage', y='Size',
                     title='Text Compression Comparison')
        fig.write_html(output_dir / "compression.html")
        
        # Save run statistics alongside the plots
        pd.DataFrame([self.stats]).to_csv(output_dir / "stats.csv", index=False)
        print(f"Visualizations saved to {output_dir}")
    
    def print_summary(self):
        """Print summary of the tokenization process."""
        print("\nTokenization Summary:")
        print("-" * 50)
        print(f"Dataset size: {self.stats['original_length']:,} characters")
        print(f"Vocabulary size: {self.stats['vocab_size']:,} tokens")
        print(f"Compression ratio: {self.stats['compression_ratio']:.2f}")
        print(f"\nProcessing times:")
        print(f"Loading: {self.stats['load_time']:.2f} seconds")
        print(f"Training: {self.stats['train_time']:.2f} seconds")
        print(f"Encoding: {self.stats['encode_time']:.2f} seconds")
        
    def run_full_pipeline(self) -> Dict:
        """Run the complete tokenization pipeline."""
        self.load_data()
        self.train_tokenizer()
        self.encode_text()
        self.save_visualizations()
        self.print_summary()
        return self.stats

def main():
    # Example usage
    test = HindiBPETest(vocab_size=4500, num_articles=1000)
    stats = test.run_full_pipeline()
    
    # Test tokenization on a sample text.
    # English gloss: "India is a vast country. The culture here is very old.
    # Hindi is one of India's major languages."
    sample_text = """
    भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।
    हिंदी भारत की प्रमुख भाषाओं में से एक है।
    """
    # Match the preprocessing applied to the training corpus
    sample_text = preprocess_hindi_text(sample_text)
    
    print("\nTesting tokenization on sample text:")
    tokens = test.bpe.encode(sample_text)
    print(f"Original text: {sample_text}")
    print(f"Tokens: {tokens}")
    decoded = test.bpe.decode(tokens)
    print(f"Decoded text: {decoded}")
    
    # Verify compression ratio requirement
    if stats['compression_ratio'] >= 3.2:
        print("\nSuccess: Achieved required compression ratio ≥ 3.2")
    else:
        print("\nWarning: Compression ratio below target 3.2")
    
    # Verify vocabulary size requirement
    if stats['vocab_size'] < 5000:
        print("Success: Vocabulary size within limit < 5000")
    else:
        print("Warning: Vocabulary size exceeds limit")

if __name__ == "__main__":
    main()