saicharan2804 committed on
Commit
6b58f50
·
1 Parent(s): d235aee

Added token IDs

Browse files
Files changed (2) hide show
  1. SmilesPeTokenizer.py +25 -4
  2. app.py +1 -1
SmilesPeTokenizer.py CHANGED
@@ -1,12 +1,33 @@
1
  import codecs
2
  from SmilesPE.tokenizer import *
3
 
4
def smilespe_tokenizer(smiles_string):
    """Tokenize a SMILES string with the pre-trained SmilesPE codes.

    Args:
        smiles_string: The SMILES string to tokenize.

    Returns:
        Whatever ``SPE_Tokenizer.tokenize`` returns for ``smiles_string``.
    """
    # Use a context manager so the codes file is closed instead of leaked on
    # every call. NOTE(review): this assumes SPE_Tokenizer consumes the file
    # inside its constructor - confirm against the SmilesPE source.
    with codecs.open('chembl_smiles_tokenizer30000.txt', 'r', 'utf-8') as spe_vob:
        spe = SPE_Tokenizer(spe_vob)

    tokenized = spe.tokenize(smiles_string)
    return tokenized
 
 
12
 
 
 
 
 
 
 
1
  import codecs
2
  from SmilesPE.tokenizer import *
3
 
4
def load_vocabulary_to_dict(vocabulary_path):
    """Build a token -> integer ID mapping from a vocabulary file.

    The file is read as UTF-8, one entry per line. The first
    whitespace-separated field of each line is taken as the token and the
    zero-based line number as its ID. Blank or whitespace-only lines are
    skipped (they still count towards the line numbering). If the same
    token appears on several lines, the last occurrence wins.

    Args:
        vocabulary_path: Path to the vocabulary text file.

    Returns:
        A dict mapping each token string to its integer ID.
    """
    vocab_dict = {}
    with codecs.open(vocabulary_path, 'r', 'utf-8') as file:
        for index, line in enumerate(file):
            fields = line.split()
            if not fields:
                # A blank line (e.g. a trailing newline at the end of the
                # file) has no token; indexing [0] here would raise.
                continue
            vocab_dict[fields[0]] = index
    return vocab_dict
11
 
12
def smilespe_tokenizer(smiles_string, vocab_dict):
    """Tokenize a SMILES string with SmilesPE and look up the token IDs.

    Args:
        smiles_string: The SMILES string to tokenize.
        vocab_dict: Mapping from token string to integer ID.

    Returns:
        A ``(tokenized, token_ids)`` tuple. ``tokenized`` is the unmodified
        result of ``SPE_Tokenizer.tokenize``. ``token_ids`` is the list of IDs
        for the tokens found in ``vocab_dict``; tokens missing from the
        mapping are skipped, so the list may be shorter than the number of
        tokens.
    """
    # Use a context manager so the codes file is closed instead of leaked on
    # every call. NOTE(review): this assumes SPE_Tokenizer consumes the file
    # inside its constructor - confirm against the SmilesPE source.
    with codecs.open('chembl_smiles_tokenizer30000.txt', 'r', 'utf-8') as spe_vob:
        spe = SPE_Tokenizer(spe_vob)

    # Tokenize the SMILES string
    tokenized = spe.tokenize(smiles_string)

    # SmilesPE documents tokenize() as returning the tokens joined by spaces
    # in a single string. Iterating that string directly would walk over
    # characters, not tokens, so split it first. Non-string results are
    # iterated as they are.
    if isinstance(tokenized, str):
        tokens = tokenized.split()
    else:
        tokens = tokenized

    # Convert tokens to IDs using the vocab_dict
    token_ids = [vocab_dict[token] for token in tokens if token in vocab_dict]

    return tokenized, token_ids
24
 
25
+ # Load the vocabulary into a dictionary
26
+ # vocab_path = 'chembl_smiles_tokenizer30000.txt'
27
+ # vocab_dict = load_vocabulary_to_dict(vocab_path)
28
 
29
+ # # Example usage
30
+ # smiles_string = 'Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1'
31
+ # tokens, token_ids = smilespe_tokenizer(smiles_string, vocab_dict)
32
+ # print("Tokens:", tokens)
33
+ # print("Token IDs:", token_ids)
app.py CHANGED
@@ -6,7 +6,7 @@ iface = gr.Interface(
6
  inputs=[
7
  gr.Textbox(label="SMILES"),
8
  ],
9
- outputs="text"
10
  )
11
 
12
  iface.launch()
 
6
  inputs=[
7
  gr.Textbox(label="SMILES"),
8
  ],
9
+ outputs=["text", "text"]
10
  )
11
 
12
  iface.launch()