saicharan2804 committed on
Commit
4d8cc2b
1 Parent(s): a843a07

Added token IDs

Browse files
Files changed (2) hide show
  1. KmerTokenizer.py +24 -24
  2. app.py +1 -1
KmerTokenizer.py CHANGED
@@ -1,38 +1,38 @@
1
def atomwise_tokenizer(smi, exclusive_tokens=None):
    """
    Tokenize a SMILES molecule at atom level.

    (1) 'Br' and 'Cl' are two-character tokens.
    (2) Bracketed symbols (e.g. '[C@@H]', '[nH]') are single tokens.

    Parameters
    ----------
    smi : str
        SMILES string to tokenize.
    exclusive_tokens : list of str, optional
        Bracketed tokens to keep verbatim, e.g. ['[C@@H]', '[nH]'].
        Every other bracketed token is replaced by '[UNK]'.
        Default is None (keep all bracketed tokens unchanged).

    Returns
    -------
    list of str
        Atom-level tokens in order of appearance.
    """
    import re

    # Raw string: the original non-raw pattern relied on invalid escape
    # sequences (e.g. '\[', '\/', '\%') that raise SyntaxWarning on
    # Python 3.12+. '\\' here matches one literal backslash, exactly as
    # the old '\\\\' did in a non-raw string.
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|/|:|~|@|\?|>|\*|\$|%[0-9]{2}|[0-9])"
    regex = re.compile(pattern)
    tokens = regex.findall(smi)

    if exclusive_tokens:
        # Mask any bracketed token that is not explicitly allowed.
        for i, tok in enumerate(tokens):
            if tok.startswith('[') and tok not in exclusive_tokens:
                tokens[i] = '[UNK]'
    return tokens
21
-
22
def tokens_to_mer(toks):
    """Join a sequence of atom-level tokens into one contiguous k-mer string."""
    mer = "".join(toks)
    return mer
24
-
25
def kmer_tokenizer(smiles, ngram=4, stride=1, remove_last=False, exclusive_tokens=None):
    """
    Tokenize a SMILES string into k-mers of atom-level tokens.

    Parameters
    ----------
    smiles : str
        SMILES string to tokenize.
    ngram : int, optional
        Number of atom-level tokens per k-mer; 1 returns the atom-level
        tokens unchanged. Default 4.
    stride : int, optional
        Step between consecutive k-mer start positions. Default 1.
    remove_last : bool, optional
        Drop the trailing token when it is shorter than `ngram` characters.
        Default False.
    exclusive_tokens : list of str, optional
        Forwarded to `atomwise_tokenizer`; bracketed tokens not in this
        list become '[UNK]'.

    Returns
    -------
    list of str
        The k-mer tokens.
    """
    # Collect all the atom-wise tokens from the SMILES.
    units = atomwise_tokenizer(smiles, exclusive_tokens=exclusive_tokens)
    if ngram == 1:
        tokens = units
    else:
        # Keep only complete windows of exactly `ngram` units.
        tokens = [tokens_to_mer(units[i:i + ngram])
                  for i in range(0, len(units), stride)
                  if len(units[i:i + ngram]) == ngram]

    # Guard against an empty token list: the original indexed tokens[-1]
    # unconditionally and raised IndexError on an empty SMILES.
    if remove_last and tokens and len(tokens[-1]) < ngram:
        tokens = tokens[:-1]
    return tokens
 
 
 
 
 
 
 
 
36
 
37
 
38
# Demo guarded by __main__ so importing this module no longer prints as a
# side effect; runs only when the file is executed as a script.
if __name__ == "__main__":
    print(kmer_tokenizer('CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@@H](N)CCSC)[C@@H](C)O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N[C@@H](CC(N)=O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)NCC(=O)N[C@@H](CO)C(=O)N[C@@H](CCCN=C(N)N)C(N)=O'))
 
1
def atomwise_tokenizer(smi, exclusive_tokens=None):
    """
    Tokenize a SMILES molecule at atom level.

    'Br' and 'Cl' are two-character tokens; bracketed symbols such as
    '[C@@H]' are kept as single tokens.

    Parameters
    ----------
    smi : str
        SMILES string to tokenize.
    exclusive_tokens : list of str, optional
        Bracketed tokens to keep verbatim, e.g. ['[C@@H]', '[nH]'].
        Every other bracketed token is replaced by '[UNK]'.
        Default is None (keep all bracketed tokens unchanged).

    Returns
    -------
    list of str
        Atom-level tokens in order of appearance.
    """
    import re

    # Raw string: the original non-raw pattern relied on invalid escape
    # sequences (e.g. '\[', '\/', '\%') that raise SyntaxWarning on
    # Python 3.12+. '\\' here matches one literal backslash, exactly as
    # the old '\\\\' did in a non-raw string.
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|/|:|~|@|\?|>|\*|\$|%[0-9]{2}|[0-9])"
    regex = re.compile(pattern)
    tokens = regex.findall(smi)

    if exclusive_tokens:
        # Mask any bracketed token that is not explicitly allowed.
        tokens = [tok if tok in exclusive_tokens or not tok.startswith('[') else '[UNK]'
                  for tok in tokens]
    return tokens
13
+
14
def kmer_tokenizer(smiles, ngram=4, stride=1, remove_last=False, exclusive_tokens=None):
    """
    Split a SMILES string into k-mers and map each k-mer to an integer ID.

    IDs are assigned per call, in order of first appearance, starting at 0.

    Parameters
    ----------
    smiles : str
        SMILES string to tokenize.
    ngram : int, optional
        Number of atom-level tokens per k-mer; 1 returns atom-level tokens.
    stride : int, optional
        Step between consecutive k-mer start positions.
    remove_last : bool, optional
        Drop the trailing token when it is shorter than `ngram` characters.
    exclusive_tokens : list of str, optional
        Forwarded to `atomwise_tokenizer`.

    Returns
    -------
    tuple of (list of str, list of int)
        The k-mer tokens and their first-appearance vocabulary IDs.
    """
    units = atomwise_tokenizer(smiles, exclusive_tokens=exclusive_tokens)

    if ngram == 1:
        tokens = units
    else:
        # Slide a window of `ngram` units, keeping only complete windows.
        tokens = []
        for start in range(0, len(units), stride):
            window = units[start:start + ngram]
            if len(window) == ngram:
                tokens.append(''.join(window))

    # Optionally drop a trailing token shorter than `ngram` characters.
    if remove_last and tokens and len(tokens[-1]) < ngram:
        tokens = tokens[:-1]

    # First-appearance vocabulary: each unseen token gets the next free ID.
    # len(vocab) is evaluated before insertion, so existing tokens keep
    # their original ID.
    vocab = {}
    token_ids = [vocab.setdefault(tok, len(vocab)) for tok in tokens]

    return tokens, token_ids


# print(kmer_tokenizer('CC[N+](C)(C)Cc1ccccc1Br'))
app.py CHANGED
@@ -16,7 +16,7 @@ iface = gr.Interface(
16
  gr.Checkbox(label="Remove Last", value=False),
17
  gr.Textbox(label="Exclusive Tokens (comma-separated)", value="")
18
  ],
19
- outputs="text"
20
  )
21
 
22
  iface.launch()
 
16
  gr.Checkbox(label="Remove Last", value=False),
17
  gr.Textbox(label="Exclusive Tokens (comma-separated)", value="")
18
  ],
19
+ outputs=["text", "text"]
20
  )
21
 
22
  iface.launch()