from functools import lru_cache

from tokenizers import Tokenizer

# Path of the serialized tokenizer. It is relative, so it is resolved against
# the process's current working directory at the time of the first load.
_TOKENIZER_PATH = "chembl_bpe_tokenizer.json"


@lru_cache(maxsize=1)
def _load_tokenizer():
    """Load the tokenizer from ``_TOKENIZER_PATH`` once and reuse it.

    The load does not depend on the string being tokenized, so the result is
    cached instead of re-reading the file on every call. ``lru_cache`` does
    not cache exceptions, so a failed load is retried on the next call.
    """
    return Tokenizer.from_file(_TOKENIZER_PATH)


def bpe_tokenizer(smiles_string: str):
    """Tokenize a SMILES string with the saved tokenizer.

    Args:
        smiles_string: The SMILES string to encode.

    Returns:
        A ``(tokens_text, token_ids)`` tuple: the ``tokens`` and ``ids``
        attributes of the encoding produced by ``Tokenizer.encode``.

    Raises:
        Whatever ``Tokenizer.from_file`` or ``Tokenizer.encode`` raises
        (for example when the tokenizer file cannot be loaded); nothing is
        caught here.
    """
    encoded_output = _load_tokenizer().encode(smiles_string)

    # Token strings and their integer IDs, in the order the encoding
    # reports them.
    tokens_text = encoded_output.tokens
    token_ids = encoded_output.ids

    return tokens_text, token_ids