from tokenizers import Tokenizer

def bpe_tokenizer(smiles_string):
    # Load the tokenizer from the saved file
    tokenizer = Tokenizer.from_file("chembl_bpe_tokenizer.json")

    # Tokenize the SMILES string
    encoded_output = tokenizer.encode(smiles_string)

    # Token strings produced by the BPE encoding
    tokens_text = encoded_output.tokens

    # Integer vocabulary IDs corresponding to each token
    token_ids = encoded_output.ids

    return tokens_text, token_ids
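

if __name__ == "__main__":
    # Minimal usage sketch, assuming a trained BPE tokenizer has already been
    # saved as "chembl_bpe_tokenizer.json" in the working directory (e.g. after
    # training on ChEMBL SMILES). The example SMILES string below is aspirin;
    # both the file name and the example input are illustrative assumptions.
    example_smiles = "CC(=O)Oc1ccccc1C(=O)O"
    tokens, ids = bpe_tokenizer(example_smiles)
    print("Tokens:", tokens)
    print("Token IDs:", ids)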