saicharan2804 committed on
Commit
5a4be66
·
1 Parent(s): 1cd9d39

Added token IDs

Browse files
Files changed (2) hide show
  1. AtomwiseTokenizer.py +49 -12
  2. app.py +1 -1
AtomwiseTokenizer.py CHANGED
@@ -1,20 +1,57 @@
1
- def atomwise_tokenizer(smi, exclusive_tokens = None):
2
  """
3
- Tokenize a SMILES molecule at atom-level:
4
- (1) 'Br' and 'Cl' are two-character tokens
5
- (2) Symbols with bracket are considered as tokens
 
6
 
7
- exclusive_tokens: A list of specifical symbols with bracket you want to keep. e.g., ['[C@@H]', '[nH]'].
8
- Other symbols with bracket will be replaced by '[UNK]'. default is `None`.
 
 
 
 
9
  """
10
  import re
11
- pattern = "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
12
  regex = re.compile(pattern)
13
  tokens = [token for token in regex.findall(smi)]
14
 
 
15
  if exclusive_tokens:
16
- for i, tok in enumerate(tokens):
17
- if tok.startswith('['):
18
- if tok not in exclusive_tokens:
19
- tokens[i] = '[UNK]'
20
- return tokens
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def atomwise_tokenizer(smi, exclusive_tokens=None):
    """
    Tokenize a SMILES molecule at atom-level and return tokens with their token IDs.
    - 'Br' and 'Cl' are two-character tokens.
    - Symbols with brackets (e.g. '[C@@H]') are considered as single tokens.
    - Ring-closure labels written as '%' followed by two digits are single tokens.
    - Characters that match no alternative of the pattern are skipped.
    - If `exclusive_tokens` is provided, symbols with brackets not in `exclusive_tokens` will be replaced by '[UNK]'.

    Parameters:
    - smi (str): SMILES string to tokenize.
    - exclusive_tokens (list of str or str, optional): Specific symbols with brackets to keep,
      e.g. ['[C@@H]', '[nH]']. A single string (such as the value of a free-text form field)
      is also accepted; every bracketed symbol found in it is kept. Empty / None disables
      the '[UNK]' replacement.

    Returns:
    - tuple: (tokens, token_ids), where tokens is a list of atom-level tokens and token_ids is a
      list of corresponding token IDs. IDs are local to this call: each distinct token gets the
      next integer (starting at 0) in order of first appearance.
    """
    import re

    # Raw strings keep the regex escapes intact without relying on Python's
    # handling of invalid string escapes such as "\[" (a SyntaxWarning on 3.12+).
    bracket_pattern = r"\[[^\]]+]"
    pattern = (
        r"(" + bracket_pattern +
        r"|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    )
    tokens = re.findall(pattern, smi)

    # Handle exclusive tokens, replacing non-exclusive bracketed tokens with '[UNK]'
    if exclusive_tokens:
        if isinstance(exclusive_tokens, str):
            # A plain string would otherwise be matched by substring; make the
            # intent explicit by extracting the bracketed symbols it contains.
            allowed = set(re.findall(bracket_pattern, exclusive_tokens))
        else:
            allowed = set(exclusive_tokens)
        tokens = [
            tok if tok in allowed or not tok.startswith('[') else '[UNK]'
            for tok in tokens
        ]

    # Generating token IDs based on the order of unique token appearance.
    # setdefault evaluates len() before inserting, so a new token receives the
    # current dictionary size as its ID and a known token keeps its existing ID.
    token_to_id = {}
    token_ids = [token_to_id.setdefault(tok, len(token_to_id)) for tok in tokens]

    return tokens, token_ids
34
+
35
+
36
+
37
+ # def atomwise_tokenizer(smi, exclusive_tokens = None):
38
+ # """
39
+ # Tokenize a SMILES molecule at atom-level:
40
+ # (1) 'Br' and 'Cl' are two-character tokens
41
+ # (2) Symbols with bracket are considered as tokens
42
+
43
+ # exclusive_tokens: A list of specific bracketed symbols you want to keep, e.g., ['[C@@H]', '[nH]'].
44
+ # Other symbols with bracket will be replaced by '[UNK]'. default is `None`.
45
+ # """
46
+ # import re
47
+ # pattern = "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
48
+ # regex = re.compile(pattern)
49
+ # tokens = [token for token in regex.findall(smi)]
50
+
51
+ # if exclusive_tokens:
52
+ # for i, tok in enumerate(tokens):
53
+ # if tok.startswith('['):
54
+ # if tok not in exclusive_tokens:
55
+ # tokens[i] = '[UNK]'
56
+ # return tokens
57
+
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
  from AtomwiseTokenizer import atomwise_tokenizer
3
 
4
- iface = gr.Interface(fn=atomwise_tokenizer, inputs=["text", "text"], outputs="text")
5
  iface.launch()
 
1
  import gradio as gr
2
  from AtomwiseTokenizer import atomwise_tokenizer
3
 
4
+ iface = gr.Interface(fn=atomwise_tokenizer, inputs=["text", "text"], outputs=["text","text"])
5
  iface.launch()