saicharan2804 committed · 5a4be66
1 Parent(s): 1cd9d39

Added token IDs

- AtomwiseTokenizer.py +49 -12
- app.py +1 -1
AtomwiseTokenizer.py
CHANGED
@@ -1,20 +1,57 @@
- def atomwise_tokenizer(smi, exclusive_tokens = None):
      """
-     Tokenize a SMILES molecule at atom-level:
-     (1) 'Br' and 'Cl' are two-character tokens
-     (2) Symbols with bracket are considered as tokens
  
-     exclusive_tokens: A list of specifical symbols with bracket you want to keep. e.g., ['[C@@H]', '[nH]'].
-     Other symbols with bracket will be replaced by '[UNK]'. default is `None`.
      """
      import re
-     pattern = "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
      regex = re.compile(pattern)
      tokens = [token for token in regex.findall(smi)]
  
      if exclusive_tokens:
-         for i, tok in enumerate(tokens):
-             if tok.startswith('['):
-                 if tok not in exclusive_tokens:
-                     tokens[i] = '[UNK]'
-     return tokens
+ def atomwise_tokenizer(smi, exclusive_tokens=None):
      """
+     Tokenize a SMILES molecule at atom-level and return tokens with their token IDs.
+     - 'Br' and 'Cl' are two-character tokens.
+     - Symbols with brackets are considered as tokens.
+     - If `exclusive_tokens` is provided, symbols with brackets not in `exclusive_tokens` will be replaced by '[UNK]'.
  
+     Parameters:
+     - smi (str): SMILES string to tokenize.
+     - exclusive_tokens (list of str, optional): Specific symbols with brackets to keep.
+ 
+     Returns:
+     - tuple: (tokens, token_ids), where tokens is a list of atom-level tokens and token_ids is a list of corresponding token IDs.
      """
      import re
+     pattern = "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
      regex = re.compile(pattern)
      tokens = [token for token in regex.findall(smi)]
  
+     # Handle exclusive tokens, replacing non-exclusive bracketed tokens with '[UNK]'
      if exclusive_tokens:
+         tokens = [tok if tok in exclusive_tokens or not tok.startswith('[') else '[UNK]' for tok in tokens]
+ 
+     # Generating token IDs based on the order of unique token appearance
+     token_to_id = {}
+     token_ids = []
+     for token in tokens:
+         if token not in token_to_id:
+             # Assign a new ID based on the current size of the dictionary
+             token_to_id[token] = len(token_to_id)
+         token_ids.append(token_to_id[token])
+ 
+     return tokens, token_ids
+ 
+ 
+ 
+ # def atomwise_tokenizer(smi, exclusive_tokens = None):
+ #     """
+ #     Tokenize a SMILES molecule at atom-level:
+ #     (1) 'Br' and 'Cl' are two-character tokens
+ #     (2) Symbols with bracket are considered as tokens
+ #
+ #     exclusive_tokens: A list of specifical symbols with bracket you want to keep. e.g., ['[C@@H]', '[nH]'].
+ #     Other symbols with bracket will be replaced by '[UNK]'. default is `None`.
+ #     """
+ #     import re
+ #     pattern = "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
+ #     regex = re.compile(pattern)
+ #     tokens = [token for token in regex.findall(smi)]
+ #
+ #     if exclusive_tokens:
+ #         for i, tok in enumerate(tokens):
+ #             if tok.startswith('['):
+ #                 if tok not in exclusive_tokens:
+ #                     tokens[i] = '[UNK]'
+ #     return tokens
+ 
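For reference, a minimal usage sketch (not part of this commit) of the updated tokenizer; the example SMILES string and the printed values are illustrative only:

from AtomwiseTokenizer import atomwise_tokenizer

tokens, token_ids = atomwise_tokenizer("CC(=O)Oc1ccccc1C(=O)O")  # aspirin, chosen for illustration
print(tokens)     # ['C', 'C', '(', '=', 'O', ')', 'O', 'c', '1', ...]
print(token_ids)  # [0, 0, 1, 2, 3, 4, 3, 5, 6, ...]

Note that IDs are assigned per call in order of first appearance, so the same token can receive a different ID for a different input string.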
app.py
CHANGED
@@ -1,5 +1,5 @@
  import gradio as gr
  from AtomwiseTokenizer import atomwise_tokenizer
  
- iface = gr.Interface(fn=atomwise_tokenizer, inputs=["text", "text"], outputs="text")
+ iface = gr.Interface(fn=atomwise_tokenizer, inputs=["text", "text"], outputs=["text","text"])
  iface.launch()
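Since atomwise_tokenizer now returns a (tokens, token_ids) tuple, the interface declares two output components so that each element of the tuple is rendered in its own text field; the two text inputs map positionally to the smi and exclusive_tokens parameters.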