In [1]:
from transformers import AutoTokenizer

# Checkpoint identifier passed to from_pretrained below.
model_name = "naver-clova-ix/donut-base"

# The original tokenizer; later cells trim its vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [2]:
# Write the tokenizer files to disk; the sentencepiece model file is read back
# from ./old_tokenizer/ in a later cell.
tokenizer.save_pretrained("old_tokenizer")

# Print the total token count, then show the tokenizer's repr as the cell output.
print(len(tokenizer))
tokenizer

57525


XLMRobertaTokenizerFast(name_or_path='naver-clova-ix/donut-base', vocab_size=57522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'sep_token': '', 'pad_token': '', 'cls_token': '', 'mask_token': AddedToken("", rstrip=False, lstrip=True, single_word=False, normalized=True), 'additional_special_tokens': ['', '']}, clean_up_tokenization_spaces=True)

# Modifying the sentencepiece file


Reference: https://blog.ceshine.net/post/trim-down-sentencepiece-vocabulary/

In [3]:
from transformers.convert_slow_tokenizer import import_protobuf

# Protobuf module that defines the sentencepiece ModelProto message.
model_pb2 = import_protobuf()

# Parse the sentencepiece model saved with the original tokenizer.
# The file is opened in a `with` block so the handle is closed right after
# the bytes have been read.
m = model_pb2.ModelProto()
with open("./old_tokenizer/sentencepiece.bpe.model", "rb") as sp_model_file:
    m.ParseFromString(sp_model_file.read())

# Number of pieces in the original model.
len(m.pieces)

57520

Because m.pieces is a Protocol Buffers repeated field, we cannot simply assign a new list to it. Instead, we need to use the field’s own methods to modify its contents:

In [4]:
# WRITE YOUR OWN RULE FOR WHAT TOKENS TO KEEP
# Rule used here: keep a piece when its text, with any leading "▁" markers
# stripped, consists of ASCII characters only.
kept_pieces = [
    candidate
    for candidate in m.pieces
    if candidate.piece.lstrip("▁").isascii()
]

In [5]:
# Texts of the pieces that survive the trim, stored as a set for fast lookups.
kept_tokens = {kept.piece for kept in kept_pieces}

# Walk the indexes from last to first, so removing an entry never shifts the
# position of an entry that has not been examined yet.
for idx in reversed(range(len(m.pieces))):
    if m.pieces[idx].piece not in kept_tokens:
        m.pieces.pop(idx)


In [6]:
# Number of pieces left in the model after the removal loop.
len(m.pieces)

27510

# The Donut tokenizer doesn't have the "1" token

It has tokens for "▁1" (a "1" preceded by a space), "10", and "1.1", but a bare "1" that directly follows another character is often encoded as the UNK token.

In [7]:
# Show the id the tokenizer uses for unknown tokens.
print(tokenizer.unk_token_id)

# The "1" that directly follows ">" is encoded as the unknown token (id 3).
tokenizer(">1").input_ids

3


[0, 56881, 3, 2]

In [8]:
# When another character comes directly before the digit 1, there is a decent
# chance the 1 is encoded as UNK (id = 3).
tokenizer("10.1 )1 a1").input_ids

[0, 39772, 3, 9447, 3, 54915, 3, 2]

## Adding 1 into the sentencepiece model

In [9]:
from copy import deepcopy

# Text and score of the piece to add. The score is a hand-picked value.
new_piece_text = "1"
new_piece_score = -10

# Only add the piece when it is missing, so re-running this cell cannot leave
# duplicate "1" entries in the model.
if all(existing.piece != new_piece_text for existing in m.pieces):
    # Copy the last piece to start from a fully populated message, then
    # overwrite the fields that should differ.
    piece1 = deepcopy(m.pieces[-1])
    piece1.piece = new_piece_text
    piece1.score = new_piece_score

    # m.pieces is a protobuf repeated field, so it is grown with extend().
    m.pieces.extend([piece1])

In [10]:
# Serialize the edited model and write it to a temporary sentencepiece file.
serialized_model = m.SerializeToString()

with open("temp_sentencepiece.bpe.model", "wb") as sp_file:
    sp_file.write(serialized_model)

In [11]:
from transformers import XLMRobertaTokenizer

# Build a tokenizer directly from the temporary sentencepiece file.
new_tokenizer = XLMRobertaTokenizer(vocab_file="temp_sentencepiece.bpe.model")

In [12]:
# Token counts of the trimmed tokenizer and the original one, side by side.
len(new_tokenizer), len(tokenizer)

(27513, 57525)

In [13]:
# The special tokens are in the model, but due to a quirk, they need to be added again.

new_tokenizer.add_special_tokens(new_tokenizer.special_tokens_map)

# Save the tokenizer files into the output directory.
new_tokenizer.save_pretrained('donut-base-ascii')

('donut-base-ascii/tokenizer_config.json',
 'donut-base-ascii/special_tokens_map.json',
 'donut-base-ascii/sentencepiece.bpe.model',
 'donut-base-ascii/added_tokens.json')

In [14]:
# Check the token counts again after re-adding the special tokens.
len(new_tokenizer), len(tokenizer)

(27513, 57525)

In [15]:
# Reload the saved tokenizer through AutoTokenizer to get all features.

new_tokenizer = AutoTokenizer.from_pretrained("donut-base-ascii")

In [16]:
# Token -> id tables of the original and the trimmed tokenizer.
old_mapping = tokenizer.vocab
new_mapping = new_tokenizer.vocab

# (token, id) pairs of the new vocabulary in ascending id order.
sorted_new_mapping = sorted(new_mapping.items(), key=lambda pair: pair[1])

# embed_indexes[i] = j means that new id i reuses the embedding stored at
# old id j. The final two entries of the new vocabulary are left out here.
embed_indexes = [
    old_mapping[token]
    for token, _new_id in sorted_new_mapping[:-2]
]

In [17]:
# embed_indexes leaves out the last two entries of the new vocabulary.
# The second to last one ("1") is brand new, so the old tokenizer has no id for it.

# The embeddings for these two tokens get added in a later cell.
sorted_new_mapping[-2:]

[('1', 27511), ('<mask>', 27512)]

In [26]:
from transformers import VisionEncoderDecoderModel

# Load the pretrained Donut checkpoint.
model_name = "naver-clova-ix/donut-base"
model = VisionEncoderDecoderModel.from_pretrained(model_name)

# Token embedding table of the decoder, indexed by old token id.
old_embeds = model.decoder.model.decoder.embed_tokens.weight.data

# Pick the old rows in new-id order and keep them as a separate tensor.
new_embeds = old_embeds[embed_indexes].clone()

print(new_embeds.shape)

torch.Size([27511, 1024])


In [19]:
import torch

# Start the new "1" token from the embedding of "▁1" (" 1");
# during training, the two will differentiate.
embed_1 = old_embeds[old_mapping["▁1"]].clone()
print(embed_1.shape)

# Reuse the original mask-token embedding. The token text is read from the
# tokenizer instead of being spelled out as a string literal, so the lookup
# key always matches an entry of the old vocabulary.
embed_mask = old_embeds[old_mapping[tokenizer.mask_token]].clone()
print(embed_mask.shape)

# Append the two rows in the id order of the new vocabulary: "1", then the
# mask token. Slicing to the copied rows first keeps this cell idempotent:
# re-running it does not stack the extra rows a second time.
copied_rows = new_embeds[:len(embed_indexes)]
new_embeds = torch.vstack([copied_rows, embed_1.unsqueeze(0), embed_mask.unsqueeze(0)])

# The table needs exactly one row per token of the new tokenizer.
assert new_embeds.shape[0] == len(new_tokenizer), (new_embeds.shape, len(new_tokenizer))

new_embeds.shape

torch.Size([1024])
torch.Size([1024])


torch.Size([27513, 1024])

## Put embeddings back into model

In [20]:
# Swap the trimmed embedding table into the decoder's token embedding layer.
decoder_embedding = model.decoder.model.decoder.embed_tokens
decoder_embedding.weight.data = new_embeds

# Record the new vocabulary size on the decoder config.
new_vocab_size = new_embeds.shape[0]
model.decoder.config.update({"vocab_size": new_vocab_size})

# Save the modified model.
model.save_pretrained("donut-base-ascii")

# Making sure the embeddings are correct

In [21]:
# Encode the same text with the original tokenizer...
old_ids = tokenizer("hello there").input_ids
print(old_ids)

# ...and with the trimmed tokenizer; these ids are used to compare embeddings.
new_ids = new_tokenizer("hello there").input_ids
print(new_ids)

[0, 37199, 35816, 34554, 2]
[0, 14026, 13045, 12147, 2]


In [22]:
import torch

# Embedding rows for the same text under the old and the new token ids.
old_embeddings = old_embeds[old_ids]
new_embeddings = new_embeds[new_ids]

# Every element matches only if the rows were copied to the right positions.
(old_embeddings == new_embeddings).all()

tensor(True)

## Add image processor so that all files are together

In [27]:
from transformers import AutoImageProcessor

# Load the image processor of the original checkpoint and save it next to the
# new tokenizer and model files.
proc = AutoImageProcessor.from_pretrained(model_name)
proc.save_pretrained("donut-base-ascii")

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


['donut-base-ascii/preprocessor_config.json']

## Check that the new token for 1 works


The UNK token id is 3, so 3 should no longer appear in the output. Instead, each bare "1" should be encoded as 27511, the id of the newly added "1" token.

In [24]:
# Same text that produced UNK ids with the original tokenizer.
new_tokenizer("10.1 )1 a1").input_ids

[0, 15793, 27511, 4056, 27511, 26020, 27511, 2]