# utils/tokenizer.py
class CharTokenizer:
    """Character-level tokenizer mapping characters to integer indices and back."""

    def __init__(self):
        self.chars = set()
        self.char2idx = {}
        self.idx2char = {}

    def fit(self, texts):
        # Build the vocabulary from every character seen in the corpus.
        for text in texts:
            self.chars.update(set(text))
        self.chars = sorted(self.chars)
        self.char2idx = {char: idx for idx, char in enumerate(self.chars)}
        self.idx2char = {idx: char for char, idx in self.char2idx.items()}

    def encode(self, text, max_length=None):
        # Characters not seen during fit() are silently dropped.
        encoded = [self.char2idx[char] for char in text if char in self.char2idx]
        if max_length is not None:
            # Truncate to max_length, then right-pad with 0. Note that 0 is
            # also the index of the first vocabulary character.
            encoded = encoded[:max_length] + [0] * (max_length - len(encoded))
        return encoded

    def decode(self, tokens):
        # Indices outside the vocabulary are skipped.
        return ''.join([self.idx2char[token] for token in tokens if token in self.idx2char])
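

# A minimal usage sketch (illustrative only, not part of the original file):
# fit the tokenizer on a tiny corpus, then round-trip a string through
# encode/decode. The corpus strings below are made up for demonstration.
if __name__ == "__main__":
    tokenizer = CharTokenizer()
    tokenizer.fit(["hello world", "hi there"])

    ids = tokenizer.encode("hello", max_length=8)  # truncated/padded to length 8
    text = tokenizer.decode(ids)                   # padding index 0 decodes to the
                                                   # first vocabulary character
    print(ids)
    print(text)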