from .base import get_freq_pairs, merge, Tokenizer
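# As used below: `get_freq_pairs` counts adjacent token pairs in a list of
# token ids, and `merge` replaces every occurrence of a given pair with a
# single new token id.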

class BPE(Tokenizer):
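    """Byte-level Byte Pair Encoding (BPE) tokenizer.

    Starts from the 256 raw byte values and learns additional merge
    rules from a training corpus.
    """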
    def __init__(self) -> None:
        super().__init__()
    
    def train(self, vocab_size, text):
        # The vocabulary must at least cover all 256 raw byte values.
        assert vocab_size >= 256

        num_merges = vocab_size - 256
        tokens = list(text.encode('utf-8'))
        merges = {}  # (int, int) pair -> merged token id
        vocab = {idx: bytes([idx]) for idx in range(256)}  # token id -> bytes

        for i in range(num_merges):
            # Find the most frequent adjacent pair and replace every
            # occurrence of it with a fresh token id.
            stats = get_freq_pairs(tokens)
            max_pair = max(stats, key=stats.get)
            idx = 256 + i
            tokens = merge(tokens, max_pair, idx)
            merges[max_pair] = idx
            vocab[idx] = vocab[max_pair[0]] + vocab[max_pair[1]]

        self.merges = merges
        self.vocab = vocab

        # Persist the learned merges and vocabulary.
        self.save()

    def encode(self, text):
        ids = list(text.encode('utf-8'))
        # Apply the learned merges in training order until no mergeable
        # pair remains. Fewer than two ids means nothing left to merge.
        while len(ids) >= 2:
            pair_counts = get_freq_pairs(ids)
            # Pick the pair that was merged earliest during training.
            min_index_pair = min(pair_counts, key=lambda x: self.merges.get(x, float('inf')))
            if min_index_pair not in self.merges:
                break
            idx = self.merges[min_index_pair]
            ids = merge(ids, min_index_pair, idx)
        return ids

    def decode(self, ids):
        # Given ids (a list of integers), return the decoded Python string.
        text_bytes = b"".join(self.vocab[idx] for idx in ids)
        text = text_bytes.decode("utf-8", errors="replace")
        return text


if __name__ == "__main__":

    tokenizer = BPE()

    with open('cindrella_stories.txt', 'r') as f:
        text = f.read()

    tokenizer.train(500, text)

    s = "😁"
    print("String is", s)

    ids = tokenizer.encode(s)
    print("Encoded string", ids)
    decoded_string = tokenizer.decode(ids)
    print("Decoded string", decoded_string)