# bpe-hindi / byte_pair_encoder.py
import json
from collections import Counter
from pathlib import Path
from typing import List, Optional

from matplotlib import pyplot as plt
from tqdm import tqdm


class TrieNode:
    """Node in the prefix tree (trie) for fast token matching"""

    def __init__(self):
        self.children = {}
        self.is_token = False
        self.token = None
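    # Illustrative sketch (toy tokens, not from the original file): inserting
    # "a" and then "ab" yields root -> 'a' (is_token, token="a") -> 'b'
    # (is_token, token="ab"); lookups walk the children dicts one character
    # at a time.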


class BytePairEncoder:
    def __init__(self, text: str):
        # Initialize vocabulary from characters
        self.chars = sorted(list(set(text)))
        self.stoi = {ch: i for i, ch in enumerate(self.chars)}
        self.itos = {i: ch for i, ch in enumerate(self.chars)}
        # Initial encoding of text
        self.data = [self.stoi[c] for c in text]
        # Statistics tracking
        self.stats = {
            "vocab_sizes": [len(self.chars)],
            "data_sizes": [len(self.data)],
            "compression_ratios": [1.0],
            "merge_counts": [],
            "tokens_created": [],
            "max_token_lengths": [1],
        }
        # Store original length for compression ratio
        self.original_length = len(self.data)
        self.max_token_length = 1

    def get_digram_stats(self) -> Counter:
        """Get digram counts"""
        counts = Counter()
        for pair in zip(self.data, self.data[1:]):
            pair = (int(pair[0]), int(pair[1]))
            counts[pair] += 1
        return counts
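    # Illustrative sketch (toy example, not from the original file): for the
    # input "abab", chars are ['a', 'b'], data is [0, 1, 0, 1], and
    # get_digram_stats() returns Counter({(0, 1): 2, (1, 0): 1}).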
    def encode_to_vocab_size(self, target_vocab_size: int, plot_interval: Optional[int] = None,
                             print_interval: int = 100) -> None:
        """Train until reaching target vocabulary size"""
        pbar = tqdm(total=target_vocab_size, desc="Training BPE", initial=len(self.chars))
        iteration = 0
        while len(self.itos) < target_vocab_size:
            result = self._merge_step()
            if result is None:
                break
            iteration += 1
            pbar.update(1)
            if print_interval and iteration % print_interval == 0:
                self._print_progress(iteration)
            if plot_interval and iteration % plot_interval == 0:
                self.plot_statistics(iteration=iteration)
        pbar.close()
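    # Illustrative usage sketch (the corpus variable and target size below are
    # placeholders, not from the original file):
    #
    #   bpe = BytePairEncoder(hindi_corpus_text)
    #   bpe.encode_to_vocab_size(target_vocab_size=5000, print_interval=500)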
    def _merge_step(self):
        """Perform one merge operation"""
        stats = self.get_digram_stats()
        if not stats:
            return None
        top_pair, count = max(stats.items(), key=lambda x: x[1])
        new_token = self._add_token(top_pair)
        self.data = self._replace_pairs(top_pair, new_token)
        self._update_stats(count)
        return new_token, count

    def _add_token(self, pair: tuple) -> int:
        """Add new token to vocabulary"""
        token_str = self.itos[pair[0]] + self.itos[pair[1]]
        token_id = len(self.itos)
        self.stoi[token_str] = token_id
        self.itos[token_id] = token_str
        self.max_token_length = max(self.max_token_length, len(token_str))
        return token_id

    def _replace_pairs(self, pair: tuple, new_token: int) -> List[int]:
        """Replace all occurrences of pair with new token"""
        result = []
        i = 0
        while i < len(self.data):
            # Scan left to right, consuming both elements when the pair matches
            if i < len(self.data) - 1 and self.data[i] == pair[0] and self.data[i + 1] == pair[1]:
                result.append(new_token)
                i += 2
            else:
                result.append(self.data[i])
                i += 1
        return result
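    # Illustrative sketch (toy values, not from the original file): with
    # data = [0, 1, 0, 1], pair = (0, 1) and new_token = 2, _replace_pairs
    # returns [2, 2].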
    def _update_stats(self, merge_count: int):
        """Update training statistics"""
        self.stats["vocab_sizes"].append(len(self.itos))
        self.stats["data_sizes"].append(len(self.data))
        compression = self.original_length / len(self.data)
        self.stats["compression_ratios"].append(compression)
        self.stats["merge_counts"].append(merge_count)
        self.stats["tokens_created"].append(self.itos[len(self.itos) - 1])
        self.stats["max_token_lengths"].append(self.max_token_length)
    def plot_statistics(self, iteration: Optional[int] = None):
        """Plot training statistics"""
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
        if iteration is not None:
            fig.suptitle(f"BPE training statistics (iteration {iteration})")
        # Plot training metrics
        ax1.plot(self.stats["vocab_sizes"], self.stats["data_sizes"])
        ax1.set_title("Vocabulary vs Dataset Size")
        ax1.set_xlabel("Vocabulary size")
        ax1.set_ylabel("Data size (tokens)")
        ax2.plot(self.stats["vocab_sizes"], self.stats["compression_ratios"])
        ax2.set_title("Compression Ratio Progress")
        ax2.set_xlabel("Vocabulary size")
        ax2.set_ylabel("Compression ratio")
        if self.stats["merge_counts"]:
            ax3.hist(self.stats["merge_counts"], bins=30)
            ax3.set_title("Merge Counts Distribution")
        if self.stats["tokens_created"]:
            lengths = [len(t) for t in self.stats["tokens_created"]]
            ax4.plot(range(len(lengths)), lengths)
            ax4.set_title("Token Length Evolution")
        plt.tight_layout()
        plt.show()
    def save_to_file(self, filepath: Path):
        """Save encoder state"""
        state = {
            "chars": self.chars,
            "stoi": self.stoi,
            "max_token_length": self.max_token_length,
            "stats": self.stats,
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(state, f, ensure_ascii=False, indent=2)

    @classmethod
    def load_from_file(cls, filepath: Path):
        """Load encoder state"""
        with open(filepath, 'r', encoding='utf-8') as f:
            state = json.load(f)
        instance = cls("")  # Create empty instance
        instance.chars = state["chars"]
        instance.stoi = state["stoi"]
        instance.itos = {int(i): s for s, i in state["stoi"].items()}
        instance.max_token_length = state["max_token_length"]
        instance.stats = state["stats"]
        return instance
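    # Illustrative round trip (the file name is a placeholder, not from the
    # original file):
    #
    #   bpe.save_to_file(Path("bpe_hindi.json"))
    #   restored = BytePairEncoder.load_from_file(Path("bpe_hindi.json"))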
    def _print_progress(self, iteration: int):
        """Print training progress"""
        print(f"\nIteration {iteration}:")
        print(f"Vocabulary size: {len(self.itos):,}")
        print(f"Data size: {len(self.data):,}")
        print(f"Compression ratio: {self.stats['compression_ratios'][-1]:.2f}")
        if self.stats["merge_counts"]:
            last_merge = self.stats["merge_counts"][-1]
            last_token = self.stats["tokens_created"][-1]
            print(f"Last merge count: {last_merge:,}")
            print(f"Last token created: '{last_token}'")
        print(f"Max token length: {self.max_token_length}")


class TokenizerInternal:
    """Tokenizer using trained BPE model"""

    def __init__(self, encoder: BytePairEncoder):
        self.stoi = encoder.stoi
        self.max_token_length = encoder.max_token_length
        self._trie = self._build_trie()

    def _build_trie(self) -> TrieNode:
        """Build trie for efficient tokenization"""
        root = TrieNode()
        for token in self.stoi:
            node = root
            for char in token:
                if char not in node.children:
                    node.children[char] = TrieNode()
                node = node.children[char]
            node.is_token = True
            node.token = token
        return root
    def tokenize(self, text: str) -> List[str]:
        """Tokenize text using trie-based matching"""
        tokens = []
        pos = 0
        while pos < len(text):
            token = self._find_longest_token(text[pos:])
            tokens.append(token)
            pos += len(token)
        return tokens
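    # Illustrative sketch (toy vocabulary, not from the original file): with
    # tokens {"a", "b", "ab", "abc"} in stoi, tokenize("abcab") greedily
    # matches the longest prefix at each step and returns ["abc", "ab"].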
    def _find_longest_token(self, text: str) -> str:
        """Find longest matching token starting at current position"""
        node = self._trie
        # Fall back to the single leading character if no vocabulary token matches
        longest = text[0]
        for char in text[:self.max_token_length]:
            if char not in node.children:
                break
            node = node.children[char]
            if node.is_token:
                longest = node.token
        return longest
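

# ---------------------------------------------------------------------------
# Illustrative usage sketch, not part of the original upload. The sample text,
# target vocabulary size, and output path below are placeholders; real
# training would use a full Hindi corpus and a much larger vocabulary.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_text = "नमस्ते दुनिया नमस्ते भारत " * 50  # placeholder corpus
    encoder = BytePairEncoder(sample_text)
    encoder.encode_to_vocab_size(target_vocab_size=50, print_interval=0)

    tokenizer = TokenizerInternal(encoder)
    print(tokenizer.tokenize("नमस्ते भारत"))

    encoder.save_to_file(Path("bpe_hindi_demo.json"))
    restored = BytePairEncoder.load_from_file(Path("bpe_hindi_demo.json"))
    print(f"Restored vocabulary size: {len(restored.stoi)}")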