# Spaces:
# Sleeping
# Sleeping
def atomwise_tokenizer(smi, exclusive_tokens=None):
    """Tokenize a SMILES molecule at atom level and return tokens with IDs.

    - 'Br' and 'Cl' are two-character tokens.
    - Bracketed symbols (e.g. '[C@@H]', '[nH]') are single tokens.
    - If `exclusive_tokens` is provided, bracketed tokens not in it are
      replaced by '[UNK]'.

    Parameters:
    - smi (str): SMILES string to tokenize.
    - exclusive_tokens (list of str, optional): Bracketed symbols to keep
      verbatim; all other bracketed tokens become '[UNK]'.

    Returns:
    - tuple: (tokens, token_ids), where tokens is a list of atom-level
      tokens and token_ids is a list of IDs assigned in order of first
      appearance of each unique token.
    """
    import re
    # Raw string: the original non-raw pattern relied on invalid escape
    # sequences (e.g. "\[", "\%"), which emit SyntaxWarning on modern
    # Python.  "\\\\" in the non-raw literal denoted a single escaped
    # backslash in the regex, which is r"\\" here — behavior unchanged.
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    # re.findall already returns a list; no wrapping comprehension needed.
    tokens = re.findall(pattern, smi)
    # Replace bracketed tokens that are not whitelisted with '[UNK]'.
    if exclusive_tokens:
        tokens = [
            tok if not tok.startswith('[') or tok in exclusive_tokens else '[UNK]'
            for tok in tokens
        ]
    # Assign IDs in order of first appearance of each unique token.
    token_to_id = {}
    token_ids = []
    for token in tokens:
        if token not in token_to_id:
            token_to_id[token] = len(token_to_id)
        token_ids.append(token_to_id[token])
    return tokens, token_ids
# def atomwise_tokenizer(smi, exclusive_tokens = None):
#     """
#     Tokenize a SMILES molecule at atom-level:
#         (1) 'Br' and 'Cl' are two-character tokens
#         (2) Symbols with bracket are considered as tokens
#     exclusive_tokens: A list of specifical symbols with bracket you want to keep. e.g., ['[C@@H]', '[nH]'].
#     Other symbols with bracket will be replaced by '[UNK]'. default is `None`.
#     """
#     import re
#     pattern = "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
#     regex = re.compile(pattern)
#     tokens = [token for token in regex.findall(smi)]
#     if exclusive_tokens:
#         for i, tok in enumerate(tokens):
#             if tok.startswith('['):
#                 if tok not in exclusive_tokens:
#                     tokens[i] = '[UNK]'
#     return tokens