Spaces:
Runtime error
Runtime error
saicharan2804
committed on
Commit
•
1fc0c38
1
Parent(s):
ef06a41
SmilesPE tokenizer
Browse files- SmilesPeTokenizer.py +12 -0
- app.py +13 -0
- chembl_smiles_tokenizer30000.txt +148 -0
- requirements.txt +1 -0
- trainSmilesPeTokenizer.py +19 -0
SmilesPeTokenizer.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import codecs
|
2 |
+
from SmilesPE.tokenizer import *
|
3 |
+
|
4 |
+
def smilespe_tokenizer(smiles_string):
    """Tokenize a SMILES string with a pre-trained SMILES Pair Encoding vocabulary.

    Parameters
    ----------
    smiles_string : str
        A molecule in SMILES notation.

    Returns
    -------
    str
        The space-separated SPE tokens produced by ``SPE_Tokenizer.tokenize``.
    """
    # Use a context manager so the vocabulary file handle is always closed;
    # the original left it open (resource leak). SPE_Tokenizer consumes the
    # merge table eagerly in its constructor, so closing afterwards is safe.
    with codecs.open('chembl_smiles_tokenizer30000.txt') as spe_vob:
        spe = SPE_Tokenizer(spe_vob)

    return spe.tokenize(smiles_string)
app.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from SmilesPeTokenizer import smilespe_tokenizer
|
3 |
+
|
4 |
+
# Minimal Gradio UI: a single SMILES text box wired straight to the
# SPE tokenizer; the tokenized string is shown as plain text.
iface = gr.Interface(
    fn=smilespe_tokenizer,
    inputs=[gr.Textbox(label="SMILES")],
    outputs="text",
)

# Start the local Gradio server.
iface.launch()
chembl_smiles_tokenizer30000.txt
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
c c
|
2 |
+
C C
|
3 |
+
O )
|
4 |
+
C (
|
5 |
+
= O)
|
6 |
+
c 1
|
7 |
+
c (
|
8 |
+
C )
|
9 |
+
c 2
|
10 |
+
C( =O)
|
11 |
+
cc cc
|
12 |
+
( C)
|
13 |
+
c 3
|
14 |
+
cc c(
|
15 |
+
) cc
|
16 |
+
CC CC
|
17 |
+
[C@H] (
|
18 |
+
[C@@H] (
|
19 |
+
( =O)
|
20 |
+
N )
|
21 |
+
C(=O) N
|
22 |
+
2 )
|
23 |
+
N C(=O)
|
24 |
+
C 1
|
25 |
+
cc (
|
26 |
+
C N
|
27 |
+
C( C)
|
28 |
+
c1 ccc(
|
29 |
+
F )
|
30 |
+
c1 cccc
|
31 |
+
C O
|
32 |
+
c2 cccc
|
33 |
+
CC N
|
34 |
+
O C)
|
35 |
+
1 )
|
36 |
+
3 )
|
37 |
+
c 4
|
38 |
+
c n
|
39 |
+
c2 )
|
40 |
+
c1 )
|
41 |
+
c2 ccc(
|
42 |
+
= C(
|
43 |
+
C 2
|
44 |
+
n c(
|
45 |
+
c2 c(
|
46 |
+
( CC
|
47 |
+
n 1
|
48 |
+
)cc 1
|
49 |
+
C =
|
50 |
+
(C) C)
|
51 |
+
C( N
|
52 |
+
O CC
|
53 |
+
Cl )
|
54 |
+
c1 c(
|
55 |
+
( O)
|
56 |
+
= O
|
57 |
+
c3 cccc
|
58 |
+
C(=O)N [C@@H](
|
59 |
+
NC(=O) [C@H](
|
60 |
+
c1 cc(
|
61 |
+
c1 cc
|
62 |
+
CC (=O)
|
63 |
+
C /
|
64 |
+
C(=O) O)
|
65 |
+
( C
|
66 |
+
S (=O)
|
67 |
+
c( =O)
|
68 |
+
cc 1
|
69 |
+
CC 1
|
70 |
+
O C
|
71 |
+
CC CN
|
72 |
+
c3cccc c3
|
73 |
+
N C(
|
74 |
+
n 2
|
75 |
+
( F)
|
76 |
+
CC C
|
77 |
+
[C@H] 1
|
78 |
+
c2cccc c2)
|
79 |
+
C( O)
|
80 |
+
Cl )cc
|
81 |
+
[C@@H] 1
|
82 |
+
c1cccc c1)
|
83 |
+
c1cccc c1
|
84 |
+
C(C) C)
|
85 |
+
[C@H]( O)
|
86 |
+
c3 c(
|
87 |
+
c3 ccc(
|
88 |
+
S(=O) (=O)
|
89 |
+
c2cccc c2
|
90 |
+
F )cc
|
91 |
+
O) cc
|
92 |
+
C( F)
|
93 |
+
O =C(
|
94 |
+
C( =
|
95 |
+
c2 n
|
96 |
+
N) =O)
|
97 |
+
4 )
|
98 |
+
CCCC CCCC
|
99 |
+
c2 cc
|
100 |
+
CC (
|
101 |
+
C(F) (F)
|
102 |
+
N 1
|
103 |
+
/ C=
|
104 |
+
C O)
|
105 |
+
[C@@H] (C)
|
106 |
+
[C@@H]( O)
|
107 |
+
c2 cc(
|
108 |
+
c1 n
|
109 |
+
CC (C)
|
110 |
+
[C@H] 2
|
111 |
+
C 3
|
112 |
+
[C@@H] 2
|
113 |
+
C c1ccc(
|
114 |
+
= N
|
115 |
+
NC(=O) [C@@H](
|
116 |
+
CC )
|
117 |
+
c1 (
|
118 |
+
c1 2
|
119 |
+
[O-] )
|
120 |
+
CCN (
|
121 |
+
CC (C)C)
|
122 |
+
[C@H] (C)
|
123 |
+
c 5
|
124 |
+
O C(=O)
|
125 |
+
N (
|
126 |
+
c [nH]
|
127 |
+
C(=O) O
|
128 |
+
=O) =O)
|
129 |
+
CC 2
|
130 |
+
CC C(
|
131 |
+
c( O)
|
132 |
+
O =
|
133 |
+
cc 2
|
134 |
+
c( -
|
135 |
+
c3 )
|
136 |
+
C(=O)N [C@H](
|
137 |
+
C c1
|
138 |
+
C S
|
139 |
+
c( OC)
|
140 |
+
/C= C/
|
141 |
+
CC 2)
|
142 |
+
c3ccccc3 )
|
143 |
+
c1cccc (
|
144 |
+
C(C) (C)
|
145 |
+
c4 cccc
|
146 |
+
N 2
|
147 |
+
cc 2)
|
148 |
+
C c1ccccc1)
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
SmilesPE
|
trainSmilesPeTokenizer.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import codecs
|
2 |
+
from SmilesPE.learner import *
|
3 |
+
import pandas as pd
|
4 |
+
import argparse
|
5 |
+
|
6 |
+
# Command-line driver: learn a SMILES Pair Encoding merge table from a CSV of
# canonical SMILES and write the trained vocabulary to the given output path.
parser = argparse.ArgumentParser(description='Train SmilesPE Tokenizer.')
parser.add_argument('dataset_file_path', type=str, help='Path to the dataset file')
parser.add_argument('output_file_path', type=str, help='Path to file containing trained tokenizer weights')

# Parse the arguments
args = parser.parse_args()

# NOTE(review): assumes the CSV has a 'canonical_smiles' column (as in
# ChEMBL exports) — confirm against the dataset actually used.
df = pd.read_csv(args.dataset_file_path)

# Use a context manager so the learned merges are flushed and the handle is
# closed even if learn_SPE raises; the original leaked the open file.
with codecs.open(args.output_file_path, 'w') as output:
    learn_SPE(df['canonical_smiles'].tolist(), output, 30000, min_frequency=2000, augmentation=1, verbose=True, total_symbols=True)