# bpe-hindi / byte_pair_encoder.py
import json
from collections import Counter
from pathlib import Path
from typing import List, Optional

from matplotlib import pyplot as plt
from tqdm import tqdm


class TrieNode:
    """Node in the prefix tree (trie) for fast token matching"""

    def __init__(self):
        self.children = {}
        self.is_token = False
        self.token = None
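    # Illustrative sketch (toy tokens, not from the original file): inserting
    # "a" and then "ab" yields root -> 'a' (is_token, token="a") -> 'b'
    # (is_token, token="ab"); lookups walk the children dicts one character
    # at a time.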


class BytePairEncoder:
    def __init__(self, text: str):
        # Initialize vocabulary from characters
        self.chars = sorted(list(set(text)))
        self.stoi = {ch: i for i, ch in enumerate(self.chars)}
        self.itos = {i: ch for i, ch in enumerate(self.chars)}
        # Initial encoding of text
        self.data = [self.stoi[c] for c in text]
        # Statistics tracking
        self.stats = {
            "vocab_sizes": [len(self.chars)],
            "data_sizes": [len(self.data)],
            "compression_ratios": [1.0],
            "merge_counts": [],
            "tokens_created": [],
            "max_token_lengths": [1],
        }
        # Store original length for compression ratio
        self.original_length = len(self.data)
        self.max_token_length = 1

    def get_digram_stats(self) -> Counter:
        """Get digram counts"""
        counts = Counter()
        for pair in zip(self.data, self.data[1:]):
            pair = (int(pair[0]), int(pair[1]))
            counts[pair] += 1
        return counts
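    # Illustrative sketch (toy example, not from the original file): for the
    # input "abab", chars are ['a', 'b'], data is [0, 1, 0, 1], and
    # get_digram_stats() returns Counter({(0, 1): 2, (1, 0): 1}).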
    def encode_to_vocab_size(self, target_vocab_size: int, plot_interval: Optional[int] = None,
                             print_interval: int = 100) -> None:
        """Train until reaching target vocabulary size"""
        pbar = tqdm(total=target_vocab_size, desc="Training BPE", initial=len(self.chars))
        iteration = 0
        while len(self.itos) < target_vocab_size:
            result = self._merge_step()
            if result is None:
                break
            iteration += 1
            pbar.update(1)
            if print_interval and iteration % print_interval == 0:
                self._print_progress(iteration)
            if plot_interval and iteration % plot_interval == 0:
                self.plot_statistics(iteration=iteration)
        pbar.close()
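    # Illustrative usage sketch (the corpus variable and target size below are
    # placeholders, not from the original file):
    #
    #   bpe = BytePairEncoder(hindi_corpus_text)
    #   bpe.encode_to_vocab_size(target_vocab_size=5000, print_interval=500)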
    def _merge_step(self):
        """Perform one merge operation"""
        stats = self.get_digram_stats()
        if not stats:
            return None
        top_pair, count = max(stats.items(), key=lambda x: x[1])
        new_token = self._add_token(top_pair)
        self.data = self._replace_pairs(top_pair, new_token)
        self._update_stats(count)
        return new_token, count

    def _add_token(self, pair: tuple) -> int:
        """Add new token to vocabulary"""
        token_str = self.itos[pair[0]] + self.itos[pair[1]]
        token_id = len(self.itos)
        self.stoi[token_str] = token_id
        self.itos[token_id] = token_str
        self.max_token_length = max(self.max_token_length, len(token_str))
        return token_id

    def _replace_pairs(self, pair: tuple, new_token: int) -> List[int]:
        """Replace all occurrences of pair with new token"""
        result = []
        i = 0
        while i < len(self.data):
            # Scan left to right, consuming both elements when the pair matches
            if i < len(self.data) - 1 and self.data[i] == pair[0] and self.data[i + 1] == pair[1]:
                result.append(new_token)
                i += 2
            else:
                result.append(self.data[i])
                i += 1
        return result
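    # Illustrative sketch (toy values, not from the original file): with
    # data = [0, 1, 0, 1], pair = (0, 1) and new_token = 2, _replace_pairs
    # returns [2, 2].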
    def _update_stats(self, merge_count: int):
        """Update training statistics"""
        self.stats["vocab_sizes"].append(len(self.itos))
        self.stats["data_sizes"].append(len(self.data))
        compression = self.original_length / len(self.data)
        self.stats["compression_ratios"].append(compression)
        self.stats["merge_counts"].append(merge_count)
        self.stats["tokens_created"].append(self.itos[len(self.itos) - 1])
        self.stats["max_token_lengths"].append(self.max_token_length)
    def plot_statistics(self, iteration: Optional[int] = None):
        """Plot training statistics"""
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
        if iteration is not None:
            fig.suptitle(f"BPE training statistics (iteration {iteration})")
        # Plot training metrics
        ax1.plot(self.stats["vocab_sizes"], self.stats["data_sizes"])
        ax1.set_title("Vocabulary vs Dataset Size")
        ax1.set_xlabel("Vocabulary size")
        ax1.set_ylabel("Data size (tokens)")
        ax2.plot(self.stats["vocab_sizes"], self.stats["compression_ratios"])
        ax2.set_title("Compression Ratio Progress")
        ax2.set_xlabel("Vocabulary size")
        ax2.set_ylabel("Compression ratio")
        if self.stats["merge_counts"]:
            ax3.hist(self.stats["merge_counts"], bins=30)
            ax3.set_title("Merge Counts Distribution")
        if self.stats["tokens_created"]:
            lengths = [len(t) for t in self.stats["tokens_created"]]
            ax4.plot(range(len(lengths)), lengths)
            ax4.set_title("Token Length Evolution")
        plt.tight_layout()
        plt.show()
    def save_to_file(self, filepath: Path):
        """Save encoder state"""
        state = {
            "chars": self.chars,
            "stoi": self.stoi,
            "max_token_length": self.max_token_length,
            "stats": self.stats,
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(state, f, ensure_ascii=False, indent=2)

    @classmethod
    def load_from_file(cls, filepath: Path):
        """Load encoder state"""
        with open(filepath, 'r', encoding='utf-8') as f:
            state = json.load(f)
        instance = cls("")  # Create empty instance
        instance.chars = state["chars"]
        instance.stoi = state["stoi"]
        instance.itos = {int(i): s for s, i in state["stoi"].items()}
        instance.max_token_length = state["max_token_length"]
        instance.stats = state["stats"]
        return instance
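    # Illustrative round trip (the file name is a placeholder, not from the
    # original file):
    #
    #   bpe.save_to_file(Path("bpe_hindi.json"))
    #   restored = BytePairEncoder.load_from_file(Path("bpe_hindi.json"))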
    def _print_progress(self, iteration: int):
        """Print training progress"""
        print(f"\nIteration {iteration}:")
        print(f"Vocabulary size: {len(self.itos):,}")
        print(f"Data size: {len(self.data):,}")
        print(f"Compression ratio: {self.stats['compression_ratios'][-1]:.2f}")
        if self.stats["merge_counts"]:
            last_merge = self.stats["merge_counts"][-1]
            last_token = self.stats["tokens_created"][-1]
            print(f"Last merge count: {last_merge:,}")
            print(f"Last token created: '{last_token}'")
        print(f"Max token length: {self.max_token_length}")


class TokenizerInternal:
    """Tokenizer using trained BPE model"""

    def __init__(self, encoder: BytePairEncoder):
        self.stoi = encoder.stoi
        self.max_token_length = encoder.max_token_length
        self._trie = self._build_trie()

    def _build_trie(self) -> TrieNode:
        """Build trie for efficient tokenization"""
        root = TrieNode()
        for token in self.stoi:
            node = root
            for char in token:
                if char not in node.children:
                    node.children[char] = TrieNode()
                node = node.children[char]
            node.is_token = True
            node.token = token
        return root
    def tokenize(self, text: str) -> List[str]:
        """Tokenize text using trie-based matching"""
        tokens = []
        pos = 0
        while pos < len(text):
            token = self._find_longest_token(text[pos:])
            tokens.append(token)
            pos += len(token)
        return tokens
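    # Illustrative sketch (toy vocabulary, not from the original file): with
    # tokens {"a", "b", "ab", "abc"} in stoi, tokenize("abcab") greedily
    # matches the longest prefix at each step and returns ["abc", "ab"].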
    def _find_longest_token(self, text: str) -> str:
        """Find longest matching token starting at current position"""
        node = self._trie
        # Fall back to the single leading character if no vocabulary token matches
        longest = text[0]
        for char in text[:self.max_token_length]:
            if char not in node.children:
                break
            node = node.children[char]
            if node.is_token:
                longest = node.token
        return longest
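

# ---------------------------------------------------------------------------
# Illustrative usage sketch, not part of the original upload. The sample text,
# target vocabulary size, and output path below are placeholders; real
# training would use a full Hindi corpus and a much larger vocabulary.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_text = "नमस्ते दुनिया नमस्ते भारत " * 50  # placeholder corpus
    encoder = BytePairEncoder(sample_text)
    encoder.encode_to_vocab_size(target_vocab_size=50, print_interval=0)

    tokenizer = TokenizerInternal(encoder)
    print(tokenizer.tokenize("नमस्ते भारत"))

    encoder.save_to_file(Path("bpe_hindi_demo.json"))
    restored = BytePairEncoder.load_from_file(Path("bpe_hindi_demo.json"))
    print(f"Restored vocabulary size: {len(restored.stoi)}")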