import codecs

from SmilesPE.tokenizer import SPE_Tokenizer


def load_vocabulary_to_dict(vocabulary_path):
    """Map each vocabulary token to an integer ID (its line index in the file)."""
    vocab_dict = {}
    with codecs.open(vocabulary_path, 'r', 'utf-8') as file:
        for index, line in enumerate(file):
            token = line.strip().split()[0]  # Assuming the first item on each line is the token
            vocab_dict[token] = index  # Or use the token itself as the ID, if preferable
    return vocab_dict


def smilespe_tokenizer(smiles_string, vocab_dict):
    # Initialize SPE_Tokenizer with the vocabulary (merges) file
    with codecs.open('chembl_smiles_tokenizer30000.txt', 'r', 'utf-8') as spe_vob:
        spe = SPE_Tokenizer(spe_vob)
    # SPE_Tokenizer.tokenize returns a single space-separated string,
    # so split it into a token list before looking up IDs
    tokens = spe.tokenize(smiles_string).split(' ')
    # Convert tokens to IDs using vocab_dict, skipping tokens that are not in it
    token_ids = [vocab_dict[token] for token in tokens if token in vocab_dict]
    return tokens, token_ids
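# A minimal sketch (not part of the original file) of how to avoid re-reading the
# merges file on every call: build the SPE_Tokenizer once and reuse it. The
# lru_cache approach and the helper names (_get_spe, cached_smilespe_tokenizer)
# are assumptions for illustration.
from functools import lru_cache


@lru_cache(maxsize=1)
def _get_spe(merges_path='chembl_smiles_tokenizer30000.txt'):
    # Build the tokenizer on first use; subsequent calls return the cached instance
    with codecs.open(merges_path, 'r', 'utf-8') as handle:
        return SPE_Tokenizer(handle)


def cached_smilespe_tokenizer(smiles_string, vocab_dict):
    tokens = _get_spe().tokenize(smiles_string).split(' ')
    token_ids = [vocab_dict[token] for token in tokens if token in vocab_dict]
    return tokens, token_ids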
if __name__ == '__main__':
    # Load the vocabulary into a dictionary
    vocab_path = 'chembl_smiles_tokenizer30000.txt'
    vocab_dict = load_vocabulary_to_dict(vocab_path)

    # Example usage
    smiles_string = 'Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1'
    tokens, token_ids = smilespe_tokenizer(smiles_string, vocab_dict)
    print("Tokens:", tokens)
    print("Token IDs:", token_ids)