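"""Tokenize SMILES strings with SMILES Pair Encoding (SmilesPE) and map the
resulting tokens to integer IDs using a vocabulary file."""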
import codecs

from SmilesPE.tokenizer import SPE_Tokenizer


def load_vocabulary_to_dict(vocabulary_path):
    """Read a SmilesPE vocabulary file and map each token to an integer ID."""
    vocab_dict = {}
    with codecs.open(vocabulary_path, 'r', 'utf-8') as file:
        for index, line in enumerate(file):
            token = line.strip().split()[0]  # The first item on each line is the token
            vocab_dict[token] = index  # Use the line index as the token ID
    return vocab_dict


def smilespe_tokenizer(smiles_string, vocab_dict, spe_vocab_path='chembl_smiles_tokenizer30000.txt'):
    """Tokenize a SMILES string with SPE and convert the tokens to vocabulary IDs."""
    # Initialize SPE_Tokenizer with the vocabulary; the file is closed once the codes are read
    with codecs.open(spe_vocab_path, 'r', 'utf-8') as spe_vob:
        spe = SPE_Tokenizer(spe_vob)

    # SPE_Tokenizer.tokenize returns a single space-separated string of tokens
    tokens = spe.tokenize(smiles_string).split(' ')

    # Convert tokens to IDs using the vocab_dict, skipping tokens not in the vocabulary
    token_ids = [vocab_dict[token] for token in tokens if token in vocab_dict]

    return tokens, token_ids

if __name__ == '__main__':
    # Load the vocabulary into a dictionary
    vocab_path = 'chembl_smiles_tokenizer30000.txt'
    vocab_dict = load_vocabulary_to_dict(vocab_path)

    # Example usage
    smiles_string = 'Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1'
    tokens, token_ids = smilespe_tokenizer(smiles_string, vocab_dict, spe_vocab_path=vocab_path)
    print("Tokens:", tokens)
    print("Token IDs:", token_ids)