# AtomwiseTokenizer.py
import re


def atomwise_tokenizer(smi, exclusive_tokens=None):
    """
    Tokenize a SMILES molecule at the atom level and return tokens with their token IDs.
    - 'Br' and 'Cl' are two-character tokens.
    - Bracketed expressions (e.g., '[C@@H]', '[nH]') are treated as single tokens.
    - If `exclusive_tokens` is provided, bracketed tokens not in `exclusive_tokens`
      are replaced by '[UNK]'.

    Parameters:
    - smi (str): SMILES string to tokenize.
    - exclusive_tokens (list of str, optional): Bracketed tokens to keep as-is.

    Returns:
    - tuple: (tokens, token_ids), where `tokens` is a list of atom-level tokens
      and `token_ids` is a list of corresponding integer IDs, assigned in order
      of first appearance within this SMILES string.
    """
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    regex = re.compile(pattern)
    tokens = regex.findall(smi)
    # Handle exclusive tokens, replacing non-exclusive bracketed tokens with '[UNK]'.
    if exclusive_tokens:
        tokens = [tok if tok in exclusive_tokens or not tok.startswith('[') else '[UNK]'
                  for tok in tokens]
    # Generate token IDs based on the order of first appearance of each unique token.
    token_to_id = {}
    token_ids = []
    for token in tokens:
        if token not in token_to_id:
            # Assign a new ID equal to the current size of the dictionary.
            token_to_id[token] = len(token_to_id)
        token_ids.append(token_to_id[token])
    return tokens, token_ids
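

# Minimal usage sketch (the example SMILES strings below are illustrative and
# not part of the original file). Note that token IDs are assigned per call,
# in order of first appearance, so they are only consistent within a single
# SMILES string, not across a corpus.
if __name__ == "__main__":
    # Aspirin: no bracketed atoms, so `exclusive_tokens` is irrelevant here.
    tokens, token_ids = atomwise_tokenizer("CC(=O)Oc1ccccc1C(=O)O")
    print(tokens)
    # ['C', 'C', '(', '=', 'O', ')', 'O', 'c', '1', 'c', 'c', 'c', 'c', 'c',
    #  '1', 'C', '(', '=', 'O', ')', 'O']
    print(token_ids)
    # [0, 0, 1, 2, 3, 4, 3, 5, 6, 5, 5, 5, 5, 5, 6, 0, 1, 2, 3, 4, 3]

    # L-alanine: '[C@@H]' is in `exclusive_tokens`, so it is kept; any other
    # bracketed token would be replaced by '[UNK]'.
    tokens, _ = atomwise_tokenizer("C[C@@H](N)C(=O)O", exclusive_tokens=['[C@@H]'])
    print(tokens)
    # ['C', '[C@@H]', '(', 'N', ')', 'C', '(', '=', 'O', ')', 'O']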