from tokenizers import Tokenizer

def bpe_tokenizer(smiles_string):
    """Tokenize a SMILES string with the trained BPE tokenizer."""
    # Load the tokenizer from the saved file
    tokenizer = Tokenizer.from_file("chembl_bpe_tokenizer.json")

    # Tokenize the SMILES string
    encoded_output = tokenizer.encode(smiles_string)

    # To get the tokenized output as text
    tokens_text = encoded_output.tokens

    # To get the corresponding token IDs
    token_ids = encoded_output.ids
    
    return tokens_text, token_ids
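

# A minimal usage sketch, assuming "chembl_bpe_tokenizer.json" is present in the
# working directory; the SMILES string below is only an illustrative example.
if __name__ == "__main__":
    tokens, ids = bpe_tokenizer("CC(=O)Oc1ccccc1C(=O)O")  # aspirin
    print(tokens)
    print(ids)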