In [3]:
from transformers import BertTokenizer
from pprint import pprint
# Load the pretrained tokenizer that belongs to the `bert-base-cased` checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-cased",
)



In [10]:
# Calling the tokenizer directly runs the whole pipeline in one step; the
# result holds `input_ids`, `token_type_ids` and `attention_mask`.
encoding = tokenizer("The HuggingFace Course is quite intuitive")
pprint(encoding)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101,
 1109,
 20164,
 10932,
 2271,
 7954,
 10176,
 1110,
 2385,
 1107,
 7926,
 8588,
 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [11]:
# Write the tokenizer files to ./artifacts/; the returned tuple of written
# paths is displayed as the cell output.
tokenizer.save_pretrained("./artifacts/")

('./artifacts/tokenizer_config.json',
 './artifacts/special_tokens_map.json',
 './artifacts/vocab.txt',
 './artifacts/added_tokens.json')

# Breaking it down

In [12]:
# Example sentence used by the step-by-step cells below.
sequence = "The HuggingFace Course is quite intuitive"

In [14]:
# Step 1: split the sentence into sub-word tokens. Continuation pieces are
# prefixed with "##", and no special tokens are added at this stage.
tokens = tokenizer.tokenize(sequence)
print(tokens)

['The', 'Hu', '##gging', '##F', '##ace', 'Course', 'is', 'quite', 'in', '##tu', '##itive']


In [15]:
# Step 2: map each token to its integer id in the vocabulary.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# Bare name as the last expression so the list is shown as the cell output.
token_ids

[1109, 20164, 10932, 2271, 7954, 10176, 1110, 2385, 1107, 7926, 8588]

Tokenize the same sentences in two ways, first with the tokenizer's `__call__` method and then with `tokenize` followed by `convert_tokens_to_ids`, and compare the outputs.

In [24]:
sentences = ["I’ve been waiting for a HuggingFace course my whole life.", "I hate this so much!"]

for sentence in sentences:
    # 1: Encode in a single step with the tokenizer's __call__ method.
    #    This route wraps the sequence in the model's special tokens.
    encoded_ids = tokenizer(sentence)["input_ids"]
    print(encoded_ids)
    print(tokenizer.decode(encoded_ids))
    print()

    # 2: Tokenize first, then convert the tokens to ids.
    #    This route does not add any special tokens.
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    print(token_ids)
    print(tokenizer.decode(token_ids))

    # Confirm programmatically (not just by eye) that the two routes agree
    # once the leading and trailing special-token ids are stripped.
    assert encoded_ids[1:-1] == token_ids, (
        "__call__ and tokenize/convert_tokens_to_ids disagree beyond the special tokens"
    )

    print("="*100)

[101, 146, 787, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119, 102]
[CLS] I ’ ve been waiting for a HuggingFace course my whole life. [SEP]

[146, 787, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119]
I ’ ve been waiting for a HuggingFace course my whole life.
[101, 146, 4819, 1142, 1177, 1277, 106, 102]
[CLS] I hate this so much! [SEP]

[146, 4819, 1142, 1177, 1277, 106]
I hate this so much!


In [22]:
# 101 and 102 are the extra first and last ids that appear only in the
# __call__ output above; decode them to see which tokens they stand for.
tokenizer.decode([101, 102])

'[CLS] [SEP]'

The two outputs differ only in the first and last ids: the `__call__` method adds the special tokens `[CLS]` (id 101) and `[SEP]` (id 102), as proposed in the BERT paper. Otherwise, all the token ids are exactly the same.