saicharan2804 committed · 5a4be66
1 Parent(s): 1cd9d39

Added token IDs

- AtomwiseTokenizer.py +49 -12
- app.py +1 -1
AtomwiseTokenizer.py
CHANGED
@@ -1,20 +1,57 @@
- def atomwise_tokenizer(smi, exclusive_tokens = None):
      """
-     Tokenize a SMILES molecule at atom-level:
-     (1) 'Br' and 'Cl' are two-character tokens
-     (2) Symbols with bracket are considered as tokens
  
-     exclusive_tokens: A list of specifical symbols with bracket you want to keep. e.g., ['[C@@H]', '[nH]'].
-     Other symbols with bracket will be replaced by '[UNK]'. default is `None`.
      """
      import re
-     pattern = "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
      regex = re.compile(pattern)
      tokens = [token for token in regex.findall(smi)]
  
      if exclusive_tokens:
-         for i, tok in enumerate(tokens):
-             if tok.startswith('['):
-                 if tok not in exclusive_tokens:
-                     tokens[i] = '[UNK]'
-     return tokens
+ def atomwise_tokenizer(smi, exclusive_tokens=None):
      """
+     Tokenize a SMILES molecule at atom-level and return tokens with their token IDs.
+     - 'Br' and 'Cl' are two-character tokens.
+     - Symbols with brackets are considered as tokens.
+     - If `exclusive_tokens` is provided, symbols with brackets not in `exclusive_tokens` will be replaced by '[UNK]'.
  
+     Parameters:
+     - smi (str): SMILES string to tokenize.
+     - exclusive_tokens (list of str, optional): Specific symbols with brackets to keep.
+ 
+     Returns:
+     - tuple: (tokens, token_ids), where tokens is a list of atom-level tokens and token_ids is a list of corresponding token IDs.
      """
      import re
+     pattern = "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
      regex = re.compile(pattern)
      tokens = [token for token in regex.findall(smi)]
  
+     # Handle exclusive tokens, replacing non-exclusive bracketed tokens with '[UNK]'
      if exclusive_tokens:
+         tokens = [tok if tok in exclusive_tokens or not tok.startswith('[') else '[UNK]' for tok in tokens]
+ 
+     # Generating token IDs based on the order of unique token appearance
+     token_to_id = {}
+     token_ids = []
+     for token in tokens:
+         if token not in token_to_id:
+             # Assign a new ID based on the current size of the dictionary
+             token_to_id[token] = len(token_to_id)
+         token_ids.append(token_to_id[token])
+ 
+     return tokens, token_ids
+ 
+ 
+ 
+ # def atomwise_tokenizer(smi, exclusive_tokens = None):
+ #     """
+ #     Tokenize a SMILES molecule at atom-level:
+ #     (1) 'Br' and 'Cl' are two-character tokens
+ #     (2) Symbols with bracket are considered as tokens
+ #
+ #     exclusive_tokens: A list of specifical symbols with bracket you want to keep. e.g., ['[C@@H]', '[nH]'].
+ #     Other symbols with bracket will be replaced by '[UNK]'. default is `None`.
+ #     """
+ #     import re
+ #     pattern = "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
+ #     regex = re.compile(pattern)
+ #     tokens = [token for token in regex.findall(smi)]
+ #
+ #     if exclusive_tokens:
+ #         for i, tok in enumerate(tokens):
+ #             if tok.startswith('['):
+ #                 if tok not in exclusive_tokens:
+ #                     tokens[i] = '[UNK]'
+ #     return tokens
+ 
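For reference, a minimal usage sketch (not part of this commit) of the updated tokenizer; the example SMILES string and the printed values are illustrative only:

from AtomwiseTokenizer import atomwise_tokenizer

tokens, token_ids = atomwise_tokenizer("CC(=O)Oc1ccccc1C(=O)O")  # aspirin, chosen for illustration
print(tokens)     # ['C', 'C', '(', '=', 'O', ')', 'O', 'c', '1', ...]
print(token_ids)  # [0, 0, 1, 2, 3, 4, 3, 5, 6, ...]

Note that IDs are assigned per call in order of first appearance, so the same token can receive a different ID for a different input string.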
app.py
CHANGED
@@ -1,5 +1,5 @@
  import gradio as gr
  from AtomwiseTokenizer import atomwise_tokenizer
  
- iface = gr.Interface(fn=atomwise_tokenizer, inputs=["text", "text"], outputs="text")
+ iface = gr.Interface(fn=atomwise_tokenizer, inputs=["text", "text"], outputs=["text","text"])
  iface.launch()
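Since atomwise_tokenizer now returns a (tokens, token_ids) tuple, the interface declares two output components so that each element of the tuple is rendered in its own text field; the two text inputs map positionally to the smi and exclusive_tokens parameters.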