saicharan2804 committed
Commit • 4d8cc2b
1 Parent(s): a843a07
Added token IDs

Files changed:
- KmerTokenizer.py (+24 -24)
- app.py (+1 -1)
KmerTokenizer.py
CHANGED
@@ -1,38 +1,38 @@
-def atomwise_tokenizer(smi, exclusive_tokens = None):
+def atomwise_tokenizer(smi, exclusive_tokens=None):
     """
-    Tokenize a SMILES molecule at atom-level:
-        (1) 'Br' and 'Cl' are two-character tokens
-        (2) Symbols with bracket are considered as tokens
-
-    exclusive_tokens: A list of specifical symbols with bracket you want to keep. e.g., ['[C@@H]', '[nH]'].
-    Other symbols with bracket will be replaced by '[UNK]'. default is `None`.
+    Tokenize a SMILES molecule at atom-level.
     """
     import re
-    pattern =  "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
+    pattern = "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
     regex = re.compile(pattern)
     tokens = [token for token in regex.findall(smi)]
 
     if exclusive_tokens:
-        for i, tok in enumerate(tokens):
-            if tok.startswith('['):
-                if tok not in exclusive_tokens:
-                    tokens[i] = '[UNK]'
+        tokens = [tok if tok in exclusive_tokens or not tok.startswith('[') else '[UNK]' for tok in tokens]
     return tokens
-
-def kmer_tokenizer(smiles, ngram=4, stride=1, remove_last = False, exclusive_tokens = None):
-
-
-
-    units = atomwise_tokenizer(smiles, exclusive_tokens = exclusive_tokens) #collect all the atom-wise tokens from the SMILES
+
+def kmer_tokenizer(smiles, ngram=4, stride=1, remove_last=False, exclusive_tokens=None):
+    """
+    Tokenize a SMILES molecule into k-mers and return both the tokens and their token IDs.
+    """
+    units = atomwise_tokenizer(smiles, exclusive_tokens=exclusive_tokens)  # Atom-wise tokens from the SMILES
     if ngram == 1:
         tokens = units
     else:
-        tokens = [units[i:i+ngram] for i in range(0, len(units), stride) if len(units[i:i+ngram]) == ngram]
 
-    if remove_last:
-        if len(tokens[-1]) < ngram:
-            tokens = tokens[:-1]
-    return tokens
+        tokens = [''.join(units[i:i+ngram]) for i in range(0, len(units), stride) if len(units[i:i+ngram]) == ngram]
 
+    if remove_last and tokens and len(tokens[-1]) < ngram:
+        tokens = tokens[:-1]  # Remove the last token if its length is less than ngram
+
+    # Generating token IDs
+    token_to_id = {}
+    token_ids = []
+    for token in tokens:
+        if token not in token_to_id:
+            token_to_id[token] = len(token_to_id)  # Assign a new ID based on the current size of the dictionary
+        token_ids.append(token_to_id[token])
+
+    return tokens, token_ids
 
 
-print(kmer_tokenizer('CC[N+](C)(C)Cc1ccccc1Br'))
+# print(kmer_tokenizer('CC[N+](C)(C)Cc1ccccc1Br'))
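Since kmer_tokenizer now returns a pair, a minimal usage sketch (assuming the module is importable as KmerTokenizer from the Space's working directory):

from KmerTokenizer import kmer_tokenizer

# The function now returns (tokens, token_ids) instead of just tokens.
tokens, token_ids = kmer_tokenizer('CC[N+](C)(C)Cc1ccccc1Br', ngram=4, stride=1)

# IDs are assigned per call, in order of first occurrence: the first
# distinct k-mer gets 0, the next new one gets 1, and so on.
for tok, tid in zip(tokens, token_ids):
    print(tid, tok)

Note that token_to_id is rebuilt on every call, so the IDs are only consistent within a single invocation; a stable vocabulary would have to be persisted outside the function.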
app.py
CHANGED
@@ -16,7 +16,7 @@ iface = gr.Interface(
         gr.Checkbox(label="Remove Last", value=False),
         gr.Textbox(label="Exclusive Tokens (comma-separated)", value="")
     ],
-    outputs="text"
+    outputs=["text", "text"]
 )
 
 iface.launch()
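Only lines 16-22 of app.py appear in this diff; the sketch below is a guess at the surrounding wiring (the tokenize wrapper and the input components above the visible ones are assumptions, only the two-element outputs list is confirmed by the commit):

import gradio as gr
from KmerTokenizer import kmer_tokenizer

# Hypothetical wrapper: its name, parameters, and parsing are assumptions.
def tokenize(smiles, ngram, stride, remove_last, exclusive_tokens):
    exclusive = [t.strip() for t in exclusive_tokens.split(',')] if exclusive_tokens else None
    tokens, token_ids = kmer_tokenizer(smiles, int(ngram), int(stride), remove_last, exclusive)
    return ' '.join(tokens), ' '.join(map(str, token_ids))

iface = gr.Interface(
    fn=tokenize,
    inputs=[
        gr.Textbox(label="SMILES"),
        gr.Number(label="Ngram", value=4),
        gr.Number(label="Stride", value=1),
        gr.Checkbox(label="Remove Last", value=False),
        gr.Textbox(label="Exclusive Tokens (comma-separated)", value="")
    ],
    # kmer_tokenizer now returns (tokens, token_ids), so each element
    # of the tuple gets its own text field.
    outputs=["text", "text"]
)

iface.launch()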