saicharan2804 committed
Commit • 4d8cc2b
1 Parent(s): a843a07
Added token IDs

Files changed:
- KmerTokenizer.py (+24 -24)
- app.py (+1 -1)
KmerTokenizer.py
CHANGED
@@ -1,38 +1,38 @@
-def atomwise_tokenizer(smi, exclusive_tokens = None):
+def atomwise_tokenizer(smi, exclusive_tokens=None):
     """
-    Tokenize a SMILES molecule at atom-level:
-        (1) 'Br' and 'Cl' are two-character tokens
-        (2) Symbols with bracket are considered as tokens
-
-    exclusive_tokens: A list of specifical symbols with bracket you want to keep. e.g., ['[C@@H]', '[nH]'].
-    Other symbols with bracket will be replaced by '[UNK]'. default is `None`.
+    Tokenize a SMILES molecule at atom-level.
     """
     import re
-    pattern =  "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
+    pattern = "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
     regex = re.compile(pattern)
     tokens = [token for token in regex.findall(smi)]
 
     if exclusive_tokens:
-        for i, tok in enumerate(tokens):
-            if tok.startswith('['):
-                if tok not in exclusive_tokens:
-                    tokens[i] = '[UNK]'
+        tokens = [tok if tok in exclusive_tokens or not tok.startswith('[') else '[UNK]' for tok in tokens]
     return tokens
-
-def kmer_tokenizer(smiles, ngram=4, stride=1, remove_last = False, exclusive_tokens = None):
-
-
-
-    units = atomwise_tokenizer(smiles, exclusive_tokens = exclusive_tokens) #collect all the atom-wise tokens from the SMILES
+
+def kmer_tokenizer(smiles, ngram=4, stride=1, remove_last=False, exclusive_tokens=None):
+    """
+    Tokenize a SMILES molecule into k-mers and return both the tokens and their token IDs.
+    """
+    units = atomwise_tokenizer(smiles, exclusive_tokens=exclusive_tokens)  # Atom-wise tokens from the SMILES
     if ngram == 1:
         tokens = units
     else:
-        tokens = [units[i:i+ngram] for i in range(0, len(units), stride) if len(units[i:i+ngram]) == ngram]
 
-    if remove_last:
-        if len(tokens[-1]) < ngram:
-            tokens = tokens[:-1]
-    return tokens
+        tokens = [''.join(units[i:i+ngram]) for i in range(0, len(units), stride) if len(units[i:i+ngram]) == ngram]
 
+    if remove_last and tokens and len(tokens[-1]) < ngram:
+        tokens = tokens[:-1]  # Remove the last token if its length is less than ngram
+
+    # Generating token IDs
+    token_to_id = {}
+    token_ids = []
+    for token in tokens:
+        if token not in token_to_id:
+            token_to_id[token] = len(token_to_id)  # Assign a new ID based on the current size of the dictionary
+        token_ids.append(token_to_id[token])
+
+    return tokens, token_ids
 
 
-print(kmer_tokenizer('CC[N+](C)(C)Cc1ccccc1Br'))
+# print(kmer_tokenizer('CC[N+](C)(C)Cc1ccccc1Br'))
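Since kmer_tokenizer now returns a pair, a minimal usage sketch (assuming the module is importable as KmerTokenizer from the Space's working directory):

from KmerTokenizer import kmer_tokenizer

# The function now returns (tokens, token_ids) instead of just tokens.
tokens, token_ids = kmer_tokenizer('CC[N+](C)(C)Cc1ccccc1Br', ngram=4, stride=1)

# IDs are assigned per call, in order of first occurrence: the first
# distinct k-mer gets 0, the next new one gets 1, and so on.
for tok, tid in zip(tokens, token_ids):
    print(tid, tok)

Note that token_to_id is rebuilt on every call, so the IDs are only consistent within a single invocation; a stable vocabulary would have to be persisted outside the function.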
app.py
CHANGED
@@ -16,7 +16,7 @@ iface = gr.Interface(
         gr.Checkbox(label="Remove Last", value=False),
         gr.Textbox(label="Exclusive Tokens (comma-separated)", value="")
     ],
-    outputs="text"
+    outputs=["text", "text"]
 )
 
 iface.launch()
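Only lines 16-22 of app.py appear in this diff; the sketch below is a guess at the surrounding wiring (the tokenize wrapper and the input components above the visible ones are assumptions, only the two-element outputs list is confirmed by the commit):

import gradio as gr
from KmerTokenizer import kmer_tokenizer

# Hypothetical wrapper: its name, parameters, and parsing are assumptions.
def tokenize(smiles, ngram, stride, remove_last, exclusive_tokens):
    exclusive = [t.strip() for t in exclusive_tokens.split(',')] if exclusive_tokens else None
    tokens, token_ids = kmer_tokenizer(smiles, int(ngram), int(stride), remove_last, exclusive)
    return ' '.join(tokens), ' '.join(map(str, token_ids))

iface = gr.Interface(
    fn=tokenize,
    inputs=[
        gr.Textbox(label="SMILES"),
        gr.Number(label="Ngram", value=4),
        gr.Number(label="Stride", value=1),
        gr.Checkbox(label="Remove Last", value=False),
        gr.Textbox(label="Exclusive Tokens (comma-separated)", value="")
    ],
    # kmer_tokenizer now returns (tokens, token_ids), so each element
    # of the tuple gets its own text field.
    outputs=["text", "text"]
)

iface.launch()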