Spaces:
Running
Running
v2 init
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- Dockerfile +13 -0
- Dockerfile-conda +13 -0
- README.md +1 -1
- app.py +489 -323
- data/lce/test.csv +31 -0
- data/lce/test_data.csv +14 -0
- data/lce/train.csv +121 -0
- data/lce/train_data.csv +148 -0
- models/.gitattributes +3 -0
- models/fm4m.py +366 -74
- models/mhg_model/README.md +1 -1
- models/mhg_model/images/mhg_example.png +0 -0
- models/mhg_model/images/mhg_example1.png +0 -0
- models/mhg_model/images/mhg_example2.png +0 -0
- models/mhg_model/load.py +22 -3
- models/mhg_model/paper/MHG-GNN_Combination of Molecular Hypergraph Grammar with Graph Neural Network.pdf +0 -0
- models/selfies_model/selfies-ted.png +0 -0
- models/selfies_ted/README.md +87 -0
- models/selfies_ted/load.py +92 -0
- models/selfies_ted/requirements.txt +12 -0
- models/selfies_ted/selfies-ted-example.ipynb +136 -0
- models/selfies_ted/selfies-ted.png +3 -0
- models/smi_ted/.gitignore +18 -0
- models/smi_ted/README.md +138 -0
- models/smi_ted/finetune/args.py +337 -0
- models/smi_ted/finetune/finetune_classification.py +68 -0
- models/smi_ted/finetune/finetune_classification_multitask.py +101 -0
- models/smi_ted/finetune/finetune_regression.py +70 -0
- models/smi_ted/finetune/moleculenet/bace/test.csv +3 -0
- models/smi_ted/finetune/moleculenet/bace/train.csv +3 -0
- models/smi_ted/finetune/moleculenet/bace/valid.csv +3 -0
- models/smi_ted/finetune/moleculenet/bbbp/test.csv +3 -0
- models/smi_ted/finetune/moleculenet/bbbp/train.csv +3 -0
- models/smi_ted/finetune/moleculenet/bbbp/valid.csv +3 -0
- models/smi_ted/finetune/moleculenet/biodegradability/biodeg_example.csv +3 -0
- models/smi_ted/finetune/moleculenet/biodegradability/biodegradability.csv +3 -0
- models/smi_ted/finetune/moleculenet/biodegradability/test.csv +3 -0
- models/smi_ted/finetune/moleculenet/biodegradability/train.csv +3 -0
- models/smi_ted/finetune/moleculenet/biodegradability/valid.csv +3 -0
- models/smi_ted/finetune/moleculenet/clintox/test.csv +3 -0
- models/smi_ted/finetune/moleculenet/clintox/train.csv +3 -0
- models/smi_ted/finetune/moleculenet/clintox/valid.csv +3 -0
- models/smi_ted/finetune/moleculenet/esol/test.csv +3 -0
- models/smi_ted/finetune/moleculenet/esol/train.csv +3 -0
- models/smi_ted/finetune/moleculenet/esol/valid.csv +3 -0
- models/smi_ted/finetune/moleculenet/freesolv/test.csv +3 -0
- models/smi_ted/finetune/moleculenet/freesolv/train.csv +3 -0
- models/smi_ted/finetune/moleculenet/freesolv/valid.csv +3 -0
- models/smi_ted/finetune/moleculenet/hiv/test.csv +3 -0
- models/smi_ted/finetune/moleculenet/hiv/train.csv +3 -0
Dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.9.7

WORKDIR /app

# Copy only the requirements first so the dependency layer is rebuilt
# only when requirements.txt changes, not on every source edit.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt

# preload models (runs before `COPY . .`, so source edits do not
# invalidate this layer)
RUN python -c '\
from transformers import BartForConditionalGeneration, AutoTokenizer;\
AutoTokenizer.from_pretrained("ibm/materials.selfies-ted");\
BartForConditionalGeneration.from_pretrained("ibm/materials.selfies-ted")'

COPY . .

CMD ["python", "app.py"]
|
Dockerfile-conda
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM condaforge/miniforge3

WORKDIR /app

# Interactive bash so that every RUN sources ~/.bashrc (where conda is
# initialised and, below, where the fm4m environment gets activated).
SHELL ["/bin/bash", "-i", "-c"]

RUN apt-get update && \
    apt-get install -y build-essential libxrender1 libxext-dev

# -y: never wait for a confirmation prompt during a non-interactive build.
RUN conda create -y --name fm4m python=3.9.7

# A bare `RUN conda activate fm4m` only affects the shell of that single
# layer. Persist the activation in ~/.bashrc so later RUN steps use fm4m.
RUN echo "conda activate fm4m" >> ~/.bashrc

COPY requirements.txt .
RUN pip install -r requirements.txt

COPY . .

# Exec-form CMD does not go through the SHELL above, so start the app
# through `conda run` to execute it inside the fm4m environment.
CMD ["conda", "run", "--no-capture-output", "-n", "fm4m", "python", "app.py"]
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title: Fm4m Kit
|
3 |
emoji: 🐢
|
4 |
colorFrom: indigo
|
5 |
colorTo: blue
|
|
|
1 |
---
|
2 |
+
title: Fix Fm4m Kit
|
3 |
emoji: 🐢
|
4 |
colorFrom: indigo
|
5 |
colorTo: blue
|
app.py
CHANGED
@@ -1,142 +1,103 @@
|
|
1 |
import gradio as gr
|
2 |
-
from huggingface_hub import InferenceClient
|
3 |
import matplotlib.pyplot as plt
|
4 |
-
|
5 |
-
|
6 |
-
from rdkit.Chem.Crippen import MolLogP
|
7 |
import pandas as pd
|
8 |
-
|
9 |
-
from rdkit.Chem import DataStructs, AllChem
|
10 |
-
from transformers import BartForConditionalGeneration, AutoTokenizer, AutoModel
|
11 |
-
from transformers.modeling_outputs import BaseModelOutput
|
12 |
import selfies as sf
|
13 |
-
from rdkit import Chem
|
14 |
import torch
|
15 |
-
import numpy as np
|
16 |
-
import umap
|
17 |
-
import pickle
|
18 |
import xgboost as xgb
|
19 |
-
from
|
20 |
-
from
|
|
|
|
|
|
|
21 |
from sklearn.kernel_ridge import KernelRidge
|
22 |
-
import
|
23 |
-
|
24 |
-
import
|
|
|
25 |
|
26 |
os.environ["OMP_MAX_ACTIVE_LEVELS"] = "1"
|
27 |
|
28 |
-
# my_theme = gr.Theme.from_hub("ysharma/steampunk")
|
29 |
-
# my_theme = gr.themes.Glass()
|
30 |
-
|
31 |
-
"""
|
32 |
-
# カスタムテーマ設定
|
33 |
-
theme = gr.themes.Default().set(
|
34 |
-
body_background_fill="#000000", # 背景色を黒に設定
|
35 |
-
text_color="#FFFFFF", # テキスト色を白に設定
|
36 |
-
)
|
37 |
-
"""
|
38 |
-
"""
|
39 |
-
import sys
|
40 |
-
sys.path.append("models")
|
41 |
-
sys.path.append("../models")
|
42 |
-
sys.path.append("../")"""
|
43 |
-
|
44 |
-
|
45 |
-
# Get the current file's directory
|
46 |
-
base_dir = os.path.dirname(__file__)
|
47 |
-
print("Base Dir : ", base_dir)
|
48 |
-
|
49 |
import models.fm4m as fm4m
|
50 |
|
|
|
|
|
51 |
|
52 |
# Function to display molecule image from SMILES
|
53 |
def smiles_to_image(smiles):
|
54 |
mol = Chem.MolFromSmiles(smiles)
|
55 |
-
if mol
|
56 |
-
img = Draw.MolToImage(mol)
|
57 |
-
return img
|
58 |
-
return None
|
59 |
-
|
60 |
-
|
61 |
-
# Function to get canonical SMILES
|
62 |
-
def get_canonical_smiles(smiles):
|
63 |
-
mol = Chem.MolFromSmiles(smiles)
|
64 |
-
if mol:
|
65 |
-
return Chem.MolToSmiles(mol, canonical=True)
|
66 |
-
return None
|
67 |
|
68 |
|
69 |
# Dictionary for SMILES strings and corresponding images (you can replace with your actual image paths)
|
70 |
smiles_image_mapping = {
|
71 |
-
"Mol 1": {
|
|
|
|
|
|
|
72 |
# Example SMILES for ethanol
|
73 |
-
"Mol 2": {
|
|
|
|
|
|
|
74 |
# Example SMILES for butane
|
75 |
-
"Mol 3": {
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
78 |
# Example SMILES for diethyl ether
|
79 |
-
"Mol 5": {
|
|
|
|
|
|
|
80 |
}
|
81 |
|
82 |
datasets = [" ", "BACE", "ESOL", "Load Custom Dataset"]
|
83 |
|
84 |
-
models_enabled = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
fusion_available = ["Concat"]
|
87 |
|
88 |
-
global log_df
|
89 |
-
log_df = pd.DataFrame(columns=["Selected Models", "Dataset", "Task", "Result"])
|
90 |
-
|
91 |
-
|
92 |
-
def log_selection(models, dataset, task_type, result, log_df):
|
93 |
-
# Append the new entry to the DataFrame
|
94 |
-
new_entry = {"Selected Models": str(models), "Dataset": dataset, "Task": task_type, "Result": result}
|
95 |
-
updated_log_df = log_df.append(new_entry, ignore_index=True)
|
96 |
-
return updated_log_df
|
97 |
-
|
98 |
|
99 |
# Function to handle evaluation and logging
|
100 |
-
def
|
101 |
-
return
|
102 |
-
def evaluate_and_log(models, dataset, task_type, eval_output):
|
103 |
task_dic = {'Classification': 'CLS', 'Regression': 'RGR'}
|
104 |
-
result = f"{eval_output}"
|
105 |
result = result.replace(" Score", "")
|
106 |
|
107 |
-
new_entry = {
|
|
|
|
|
|
|
|
|
|
|
108 |
new_entry_df = pd.DataFrame([new_entry])
|
109 |
|
110 |
-
log_df = pd.
|
111 |
-
|
112 |
-
|
113 |
-
log_df.to_csv('log.csv')
|
114 |
-
|
115 |
-
return log_df
|
116 |
-
|
117 |
-
|
118 |
-
try:
|
119 |
-
log_df = pd.read_csv('log.csv', index_col=0)
|
120 |
-
except:
|
121 |
-
log_df = pd.DataFrame({"":[],
|
122 |
-
'Selected Models': [],
|
123 |
-
'Dataset': [],
|
124 |
-
'Task': [],
|
125 |
-
'Result': []
|
126 |
-
})
|
127 |
-
csv_file_path = 'log.csv'
|
128 |
-
log_df.to_csv(csv_file_path, index=False)
|
129 |
|
130 |
|
131 |
# Load images for selection
|
132 |
def load_image(path):
|
133 |
try:
|
134 |
-
return Image.open(smiles_image_mapping[path]["image"])
|
135 |
except:
|
136 |
pass
|
137 |
|
138 |
|
139 |
-
|
140 |
# Function to handle image selection
|
141 |
def handle_image_selection(image_key):
|
142 |
smiles = smiles_image_mapping[image_key]["smiles"]
|
@@ -160,49 +121,55 @@ def calculate_tanimoto(smiles1, smiles2):
|
|
160 |
mol1 = Chem.MolFromSmiles(smiles1)
|
161 |
mol2 = Chem.MolFromSmiles(smiles2)
|
162 |
if mol1 and mol2:
|
163 |
-
# fp1 = FingerprintMols.FingerprintMol(mol1)
|
164 |
-
# fp2 = FingerprintMols.FingerprintMol(mol2)
|
165 |
fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2)
|
166 |
fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2)
|
167 |
return round(DataStructs.FingerprintSimilarity(fp1, fp2), 2)
|
168 |
return None
|
169 |
|
170 |
|
171 |
-
#with open("models/selfies_model/bart-2908.pickle", "rb") as input_file:
|
172 |
-
# gen_model, gen_tokenizer = pickle.load(input_file)
|
173 |
-
|
174 |
gen_tokenizer = AutoTokenizer.from_pretrained("ibm/materials.selfies-ted")
|
175 |
gen_model = BartForConditionalGeneration.from_pretrained("ibm/materials.selfies-ted")
|
176 |
|
177 |
|
178 |
def generate(latent_vector, mask):
|
179 |
encoder_outputs = BaseModelOutput(latent_vector)
|
180 |
-
decoder_output = gen_model.generate(
|
181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
selfies = gen_tokenizer.batch_decode(decoder_output, skip_special_tokens=True)
|
183 |
-
|
184 |
-
for i in selfies:
|
185 |
-
outs.append(sf.decoder(i.replace("] [", "][")))
|
186 |
-
return outs
|
187 |
|
188 |
|
189 |
def perturb_latent(latent_vecs, noise_scale=0.5):
|
190 |
-
|
191 |
-
|
192 |
-
|
|
|
|
|
|
|
|
|
193 |
|
194 |
|
195 |
def encode(selfies):
|
196 |
-
encoding = gen_tokenizer(
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
input_ids = encoding['input_ids']
|
198 |
attention_mask = encoding['attention_mask']
|
199 |
-
outputs = gen_model.model.encoder(
|
|
|
|
|
200 |
model_output = outputs.last_hidden_state
|
201 |
-
|
202 |
-
"""input_mask_expanded = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
|
203 |
-
sum_embeddings = torch.sum(model_output * input_mask_expanded, 1)
|
204 |
-
sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
205 |
-
model_output = sum_embeddings / sum_mask"""
|
206 |
return model_output, attention_mask
|
207 |
|
208 |
|
@@ -217,8 +184,13 @@ def generate_canonical(smiles):
|
|
217 |
noise = i / 10
|
218 |
perturbed_latent = perturb_latent(latent_vec, noise_scale=noise)
|
219 |
gen = generate(perturbed_latent, mask)
|
220 |
-
|
221 |
-
if
|
|
|
|
|
|
|
|
|
|
|
222 |
|
223 |
if gen_mol:
|
224 |
# Calculate properties for ref and gen molecules
|
@@ -230,9 +202,20 @@ def generate_canonical(smiles):
|
|
230 |
# Prepare the table with ref mol and gen mol
|
231 |
data = {
|
232 |
"Property": ["QED", "SA", "LogP", "Mol Wt", "Tanimoto Similarity"],
|
233 |
-
"Reference Mol": [
|
234 |
-
|
235 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
}
|
237 |
df = pd.DataFrame(data)
|
238 |
|
@@ -245,7 +228,7 @@ def generate_canonical(smiles):
|
|
245 |
|
246 |
|
247 |
# Function to display evaluation score
|
248 |
-
def display_eval(selected_models, dataset, task_type, downstream, fusion_type):
|
249 |
result = None
|
250 |
|
251 |
try:
|
@@ -260,72 +243,87 @@ def display_eval(selected_models, dataset, task_type, downstream, fusion_type):
|
|
260 |
downstream_model = downstream_model.rstrip()
|
261 |
params = None
|
262 |
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
try:
|
267 |
if not selected_models:
|
268 |
return "Please select at least one enabled model."
|
269 |
|
270 |
-
if task_type == "Classification":
|
271 |
-
global roc_auc, fpr, tpr, x_batch, y_batch
|
272 |
-
elif task_type == "Regression":
|
273 |
-
global RMSE, y_batch_test, y_prob
|
274 |
-
|
275 |
if len(selected_models) > 1:
|
276 |
if task_type == "Classification":
|
277 |
-
#result, roc_auc, fpr, tpr, x_batch, y_batch = fm4m.multi_modal(model_list=selected_models,
|
278 |
-
# downstream_model="XGBClassifier",
|
279 |
-
# dataset=dataset.lower())
|
280 |
if downstream_model == "Default Settings":
|
281 |
downstream_model = "DefaultClassifier"
|
282 |
params = None
|
283 |
-
result, roc_auc, fpr, tpr, x_batch, y_batch = fm4m.multi_modal(model_list=selected_models,
|
284 |
-
downstream_model=downstream_model,
|
285 |
-
params = params,
|
286 |
-
dataset=dataset)
|
287 |
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
|
|
|
293 |
if downstream_model == "Default Settings":
|
294 |
downstream_model = "DefaultRegressor"
|
295 |
params = None
|
296 |
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
301 |
|
302 |
else:
|
303 |
if task_type == "Classification":
|
304 |
-
#result, roc_auc, fpr, tpr, x_batch, y_batch = fm4m.single_modal(model=selected_models[0],
|
305 |
-
# downstream_model="XGBClassifier",
|
306 |
-
# dataset=dataset.lower())
|
307 |
if downstream_model == "Default Settings":
|
308 |
downstream_model = "DefaultClassifier"
|
309 |
params = None
|
310 |
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
315 |
|
316 |
elif task_type == "Regression":
|
317 |
-
#result, RMSE, y_batch_test, y_prob = fm4m.single_modal(model=selected_models[0],
|
318 |
-
# downstream_model="XGBRegressor",
|
319 |
-
# dataset=dataset.lower())
|
320 |
-
|
321 |
if downstream_model == "Default Settings":
|
322 |
downstream_model = "DefaultRegressor"
|
323 |
params = None
|
324 |
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
329 |
|
330 |
if result == None:
|
331 |
result = "Data & Model Setting is incorrect"
|
@@ -335,23 +333,15 @@ def display_eval(selected_models, dataset, task_type, downstream, fusion_type):
|
|
335 |
|
336 |
|
337 |
# Function to handle plot display
|
338 |
-
def display_plot(plot_type):
|
339 |
fig, ax = plt.subplots()
|
340 |
|
341 |
if plot_type == "Latent Space":
|
342 |
-
|
343 |
ax.set_title("T-SNE Plot")
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
# index_0 = [index for index in range(len(x)) if x[index] == 0]
|
348 |
-
# index_1 = [index for index in range(len(x)) if x[index] == 1]
|
349 |
-
class_0 = x_batch # features_umap[index_0]
|
350 |
-
class_1 = y_batch # features_umap[index_1]
|
351 |
-
|
352 |
-
"""with open("latent_multi_bace.pkl", "rb") as f:
|
353 |
-
class_0, class_1 = pickle.load(f)
|
354 |
-
"""
|
355 |
plt.scatter(class_1[:, 0], class_1[:, 1], c='red', label='Class 1')
|
356 |
plt.scatter(class_0[:, 0], class_0[:, 1], c='blue', label='Class 0')
|
357 |
|
@@ -360,10 +350,16 @@ def display_plot(plot_type):
|
|
360 |
ax.set_title('Dataset Distribution')
|
361 |
|
362 |
elif plot_type == "ROC-AUC":
|
363 |
-
|
364 |
ax.set_title("ROC-AUC Curve")
|
365 |
try:
|
366 |
-
ax.plot(
|
|
|
|
|
|
|
|
|
|
|
|
|
367 |
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
|
368 |
ax.set_xlim([0.0, 1.0])
|
369 |
ax.set_ylim([0.0, 1.05])
|
@@ -375,7 +371,11 @@ def display_plot(plot_type):
|
|
375 |
ax.legend(loc='lower right')
|
376 |
|
377 |
elif plot_type == "Parity Plot":
|
378 |
-
|
|
|
|
|
|
|
|
|
379 |
ax.set_title("Parity plot")
|
380 |
|
381 |
# change format
|
@@ -384,7 +384,12 @@ def display_plot(plot_type):
|
|
384 |
print(y_prob)
|
385 |
y_batch_test = np.array(y_batch_test, dtype=float)
|
386 |
y_prob = np.array(y_prob, dtype=float)
|
387 |
-
ax.scatter(
|
|
|
|
|
|
|
|
|
|
|
388 |
min_val = min(min(y_batch_test), min(y_prob))
|
389 |
max_val = max(max(y_batch_test), max(y_prob))
|
390 |
ax.plot([min_val, max_val], [min_val, max_val], 'r-')
|
@@ -397,10 +402,6 @@ def display_plot(plot_type):
|
|
397 |
print(y_batch_test)
|
398 |
print(y_prob)
|
399 |
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
ax.set_xlabel('Actual Values')
|
405 |
ax.set_ylabel('Predicted Values')
|
406 |
|
@@ -419,13 +420,25 @@ predefined_datasets = {
|
|
419 |
# Function to load a predefined dataset from the local path
|
420 |
def load_predefined_dataset(dataset_name):
|
421 |
val = predefined_datasets.get(dataset_name)
|
422 |
-
try:
|
423 |
-
|
|
|
|
|
424 |
|
425 |
if file_path:
|
426 |
df = pd.read_csv(file_path)
|
427 |
-
return
|
428 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
429 |
|
430 |
|
431 |
# Function to display the head of the uploaded CSV file
|
@@ -433,7 +446,11 @@ def display_csv_head(file):
|
|
433 |
if file is not None:
|
434 |
# Load the CSV file into a DataFrame
|
435 |
df = pd.read_csv(file.name)
|
436 |
-
return
|
|
|
|
|
|
|
|
|
437 |
return pd.DataFrame(), gr.update(choices=[]), gr.update(choices=[])
|
438 |
|
439 |
|
@@ -441,28 +458,54 @@ def display_csv_head(file):
|
|
441 |
def handle_dataset_selection(selected_dataset):
|
442 |
if selected_dataset == "Custom Dataset":
|
443 |
# Show file upload fields for train and test datasets if "Custom Dataset" is selected
|
444 |
-
return
|
445 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
446 |
else:
|
447 |
-
return
|
448 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
449 |
|
450 |
|
451 |
# Function to select input and output columns and display a message
|
452 |
-
def select_columns(input_column, output_column, train_data, test_data,dataset_name):
|
453 |
if input_column and output_column:
|
454 |
return f"{train_data.name},{test_data.name},{input_column},{output_column},{dataset_name}"
|
455 |
return "Please select both input and output columns."
|
456 |
|
457 |
-
|
|
|
458 |
if dataset_selector == "Custom Dataset":
|
459 |
return f"{dataset_name}"
|
460 |
return f"{dataset_selector}"
|
461 |
|
|
|
462 |
# Function to create model based on user input
|
463 |
-
def create_model(
|
|
|
|
|
464 |
if model_name == "XGBClassifier":
|
465 |
-
model = xgb.XGBClassifier(
|
|
|
|
|
|
|
|
|
|
|
|
|
466 |
elif model_name == "SVR":
|
467 |
model = SVR(degree=degree, kernel=kernel)
|
468 |
elif model_name == "Kernel Ridge":
|
@@ -476,224 +519,339 @@ def create_model(model_name, max_depth=None, n_estimators=None, alpha=None, degr
|
|
476 |
return "Model not supported."
|
477 |
|
478 |
return f"{model_name} * {model.get_params()}"
|
479 |
-
def model_selector(model_name):
|
480 |
-
# Dynamically return the appropriate hyperparameter components based on the selected model
|
481 |
-
if model_name == "XGBClassifier":
|
482 |
-
return (
|
483 |
-
gr.Slider(1, 10, label="max_depth"),
|
484 |
-
gr.Slider(50, 500, label="n_estimators"),
|
485 |
-
gr.Slider(0.1, 10.0, step=0.1, label="alpha")
|
486 |
-
)
|
487 |
-
elif model_name == "SVR":
|
488 |
-
return (
|
489 |
-
gr.Slider(1, 5, label="degree"),
|
490 |
-
gr.Dropdown(["rbf", "poly", "linear"], label="kernel")
|
491 |
-
)
|
492 |
-
elif model_name == "Kernel Ridge":
|
493 |
-
return (
|
494 |
-
gr.Slider(0.1, 10.0, step=0.1, label="alpha"),
|
495 |
-
gr.Slider(1, 5, label="degree"),
|
496 |
-
gr.Dropdown(["rbf", "poly", "linear"], label="kernel")
|
497 |
-
)
|
498 |
-
elif model_name == "Linear Regression":
|
499 |
-
return () # No hyperparameters for Linear Regression
|
500 |
-
else:
|
501 |
-
return ()
|
502 |
-
|
503 |
|
504 |
|
505 |
# Define the Gradio layout
|
506 |
-
# with gr.Blocks(theme=my_theme) as demo:
|
507 |
with gr.Blocks() as demo:
|
|
|
|
|
|
|
|
|
508 |
with gr.Row():
|
509 |
# Left Column
|
510 |
with gr.Column():
|
511 |
-
gr.HTML(
|
|
|
512 |
<div style="background-color: #6A8EAE; color: #FFFFFF; padding: 10px;">
|
513 |
<h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Data & Model Setting</h3>
|
514 |
</div>
|
515 |
-
'''
|
516 |
-
|
517 |
-
#dataset_dropdown = gr.Dropdown(choices=datasets, label="Select Dat")
|
518 |
-
|
519 |
# Dropdown menu for predefined datasets including "Custom Dataset" option
|
520 |
-
dataset_selector = gr.Dropdown(
|
521 |
-
|
|
|
|
|
522 |
# Display the message for selected columns
|
523 |
-
selected_columns_message = gr.Textbox(
|
|
|
|
|
524 |
|
525 |
with gr.Accordion("Dataset Settings", open=True):
|
526 |
# File upload options for custom dataset (train and test)
|
527 |
dataset_name = gr.Textbox(label="Dataset Name", visible=False)
|
528 |
-
train_file = gr.File(
|
529 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
530 |
|
531 |
-
test_file = gr.File(
|
532 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
533 |
|
534 |
# Predefined dataset displays
|
535 |
-
predefined_display = gr.Dataframe(
|
536 |
-
|
537 |
-
|
538 |
-
|
|
|
539 |
|
540 |
# Dropdowns for selecting input and output columns for the custom dataset
|
541 |
-
input_column_selector = gr.Dropdown(
|
542 |
-
|
543 |
-
|
544 |
-
|
|
|
|
|
545 |
|
546 |
# When a dataset is selected, show either file upload fields (for custom) or load predefined datasets
|
547 |
-
dataset_selector.change(
|
548 |
-
|
549 |
-
|
550 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
551 |
|
552 |
# When a predefined dataset is selected, load its head and update column selectors
|
553 |
-
dataset_selector.change(
|
554 |
-
|
555 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
556 |
|
557 |
# When a custom train file is uploaded, display its head and update column selectors
|
558 |
-
train_file.change(
|
559 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
560 |
|
561 |
# When a custom test file is uploaded, display its head
|
562 |
-
test_file.change(
|
563 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
564 |
|
565 |
-
dataset_selector.change(
|
566 |
-
|
567 |
-
|
|
|
|
|
568 |
|
569 |
# Update the selected columns information when dropdown values are changed
|
570 |
-
input_column_selector.change(
|
571 |
-
|
572 |
-
|
573 |
-
|
574 |
-
|
575 |
-
|
576 |
-
|
|
|
|
|
|
|
|
|
577 |
|
578 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
579 |
|
580 |
-
|
581 |
-
|
582 |
-
|
583 |
|
584 |
-
task_radiobutton = gr.Radio(
|
|
|
|
|
585 |
|
586 |
####### adding hyper parameter tuning ###########
|
587 |
-
model_name = gr.Dropdown(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
588 |
with gr.Accordion("Downstream Hyperparameter Settings", open=True):
|
589 |
# Create placeholders for hyperparameter components
|
590 |
-
max_depth = gr.Slider(1, 20, step=1,visible=False, label="max_depth")
|
591 |
-
n_estimators = gr.Slider(
|
|
|
|
|
592 |
alpha = gr.Slider(0.1, 10.0, step=0.1, visible=False, label="alpha")
|
593 |
-
degree = gr.Slider(1, 20, step=1,visible=False, label="degree")
|
594 |
-
kernel = gr.Dropdown(
|
|
|
|
|
595 |
|
596 |
# Output textbox
|
597 |
output = gr.Textbox(label="Loaded Parameters")
|
598 |
|
599 |
-
|
600 |
# Dynamically show relevant hyperparameters based on selected model
|
601 |
def update_hyperparameters(model_name):
|
602 |
if model_name == "XGBClassifier":
|
603 |
-
return
|
604 |
-
|
|
|
|
|
|
|
|
|
|
|
605 |
elif model_name == "SVR":
|
606 |
-
return
|
607 |
-
|
|
|
|
|
|
|
|
|
|
|
608 |
elif model_name == "Kernel Ridge":
|
609 |
-
return
|
610 |
-
|
|
|
|
|
|
|
|
|
|
|
611 |
elif model_name == "Linear Regression":
|
612 |
-
return
|
613 |
-
|
|
|
|
|
|
|
|
|
|
|
614 |
elif model_name == "Default - Auto":
|
615 |
-
return
|
616 |
-
|
617 |
-
|
|
|
|
|
|
|
|
|
618 |
|
619 |
# When model is selected, update which hyperparameters are visible
|
620 |
-
model_name.change(
|
621 |
-
|
|
|
|
|
|
|
622 |
|
623 |
# Submit button to create the model with selected hyperparameters
|
624 |
submit_button = gr.Button("Create Downstream Model")
|
625 |
|
626 |
-
|
627 |
# Function to handle model creation based on input parameters
|
628 |
def on_submit(model_name, max_depth, n_estimators, alpha, degree, kernel):
|
629 |
if model_name == "XGBClassifier":
|
630 |
-
return create_model(
|
|
|
|
|
|
|
|
|
|
|
631 |
elif model_name == "SVR":
|
632 |
return create_model(model_name, degree=degree, kernel=kernel)
|
633 |
elif model_name == "Kernel Ridge":
|
634 |
-
return create_model(
|
|
|
|
|
635 |
elif model_name == "Linear Regression":
|
636 |
return create_model(model_name)
|
637 |
elif model_name == "Default - Auto":
|
638 |
return create_model(model_name)
|
639 |
|
640 |
# When the submit button is clicked, run the on_submit function
|
641 |
-
submit_button.click(
|
642 |
-
|
|
|
|
|
|
|
643 |
###### End of hyper param tuning #########
|
644 |
|
645 |
fusion_radiobutton = gr.Radio(choices=fusion_available, label="Fusion Type")
|
646 |
|
647 |
-
|
648 |
-
|
649 |
eval_button = gr.Button("Train downstream model")
|
650 |
-
#eval_button.style(css_class="custom-button-left")
|
651 |
|
652 |
# Middle Column
|
653 |
with gr.Column():
|
654 |
-
gr.HTML(
|
|
|
655 |
<div style="background-color: #8F9779; color: #FFFFFF; padding: 10px;">
|
656 |
<h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Downstream Task 1: Property Prediction</h3>
|
657 |
</div>
|
658 |
-
'''
|
659 |
-
|
660 |
eval_output = gr.Textbox(label="Train downstream model")
|
661 |
|
662 |
-
plot_radio = gr.Radio(
|
663 |
-
|
664 |
-
|
665 |
-
|
|
|
666 |
|
667 |
create_log = gr.Button("Store log")
|
668 |
|
669 |
-
log_table = gr.Dataframe(
|
670 |
-
|
671 |
-
|
672 |
-
|
673 |
-
|
674 |
-
|
675 |
-
|
676 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
677 |
|
678 |
# Function to gather selected models
|
679 |
def gather_selected_models(*models):
|
680 |
selected = [model for model in models if model]
|
681 |
return selected
|
682 |
|
683 |
-
|
684 |
-
|
685 |
-
|
686 |
-
|
687 |
-
|
688 |
-
|
|
|
|
|
|
|
|
|
|
|
689 |
# Right Column
|
690 |
with gr.Column():
|
691 |
-
gr.HTML(
|
|
|
692 |
<div style="background-color: #D2B48C; color: #FFFFFF; padding: 10px;">
|
693 |
<h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Downstream Task 2: Molecule Generation</h3>
|
694 |
</div>
|
695 |
-
'''
|
696 |
-
|
697 |
smiles_input = gr.Textbox(label="Input SMILES String")
|
698 |
image_display = gr.Image(label="Molecule Image", height=250, width=250)
|
699 |
# Show images for selection
|
@@ -702,24 +860,32 @@ with gr.Blocks() as demo:
|
|
702 |
choices=list(smiles_image_mapping.keys()),
|
703 |
label="Select from sample molecules",
|
704 |
value=None,
|
705 |
-
#item_images=[load_image(smiles_image_mapping[key]["image"]) for key in smiles_image_mapping.keys()]
|
706 |
)
|
707 |
image_selector.change(load_image, image_selector, image_display)
|
708 |
generate_button = gr.Button("Generate")
|
709 |
-
gen_image_display = gr.Image(
|
|
|
|
|
710 |
generated_output = gr.Textbox(label="Generated Output")
|
711 |
property_table = gr.Dataframe(label="Molecular Properties Comparison")
|
712 |
|
713 |
-
|
714 |
-
|
715 |
# Handle image selection
|
716 |
-
image_selector.change(
|
717 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
718 |
|
719 |
# Generate button to display canonical SMILES and molecule image
|
720 |
-
generate_button.click(
|
721 |
-
|
|
|
|
|
|
|
722 |
|
723 |
|
724 |
if __name__ == "__main__":
|
725 |
-
demo.launch(
|
|
|
1 |
import gradio as gr
|
|
|
2 |
import matplotlib.pyplot as plt
|
3 |
+
import numpy as np
|
4 |
+
import os
|
|
|
5 |
import pandas as pd
|
6 |
+
import re
|
|
|
|
|
|
|
7 |
import selfies as sf
|
|
|
8 |
import torch
|
|
|
|
|
|
|
9 |
import xgboost as xgb
|
10 |
+
from PIL import Image
|
11 |
+
from rdkit import Chem, RDLogger
|
12 |
+
from rdkit.Chem import DataStructs, AllChem, Descriptors, QED, Draw
|
13 |
+
from rdkit.Chem.Crippen import MolLogP
|
14 |
+
from rdkit.Contrib.SA_Score import sascorer
|
15 |
from sklearn.kernel_ridge import KernelRidge
|
16 |
+
from sklearn.linear_model import LinearRegression
|
17 |
+
from sklearn.svm import SVR
|
18 |
+
from transformers import BartForConditionalGeneration, AutoTokenizer
|
19 |
+
from transformers.modeling_outputs import BaseModelOutput
|
20 |
|
21 |
os.environ["OMP_MAX_ACTIVE_LEVELS"] = "1"
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
import models.fm4m as fm4m
|
24 |
|
25 |
+
RDLogger.logger().setLevel(RDLogger.ERROR)
|
26 |
+
|
27 |
|
28 |
# Function to display molecule image from SMILES
|
29 |
def smiles_to_image(smiles):
    """Draw the molecule described by a SMILES string.

    Returns the result of ``Draw.MolToImage`` for the parsed molecule, or
    ``None`` when ``Chem.MolFromSmiles`` yields a falsy value.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        return None
    return Draw.MolToImage(parsed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
|
34 |
# Dictionary for SMILES strings and corresponding images (you can replace with your actual image paths)
|
35 |
# Sample molecules offered for selection: display key -> SMILES string and
# the path of the image file shown for it.
# NOTE: the former per-entry labels ("ethanol", "butane", "ethylamine",
# "diethyl ether", "chloroethane") did not describe these SMILES strings
# and were dropped to avoid misleading readers.
smiles_image_mapping = {
    "Mol 1": {
        "smiles": "C=C(C)CC(=O)NC[C@H](CO)NC(=O)C=Cc1ccc(C)c(Cl)c1",
        "image": "img/img1.png",
    },
    "Mol 2": {
        "smiles": "C=CC1(CC(=O)NC[C@@H](CCCC)NC(=O)c2cc(Cl)cc(Br)c2)CC1",
        "image": "img/img2.png",
    },
    "Mol 3": {
        "smiles": "C=C(C)C[C@H](NC(C)=O)C(=O)N1CC[C@H](NC(=O)[C@H]2C[C@@]2(C)Br)C(C)(C)C1",
        "image": "img/img3.png",
    },
    "Mol 4": {
        "smiles": "C=C1CC(CC(=O)N[C@H]2CCN(C(=O)c3ncccc3SC)C23CC3)C1",
        "image": "img/img4.png",
    },
    "Mol 5": {
        "smiles": "C=CCS[C@@H](C)CC(=O)OCC",
        "image": "img/img5.png",
    },
}
|
60 |
|
61 |
datasets = [" ", "BACE", "ESOL", "Load Custom Dataset"]
|
62 |
|
63 |
+
models_enabled = [
|
64 |
+
"SELFIES-TED",
|
65 |
+
"MHG-GED",
|
66 |
+
"MolFormer",
|
67 |
+
"SMI-TED",
|
68 |
+
"Mordred",
|
69 |
+
"MorganFingerprint",
|
70 |
+
]
|
71 |
|
72 |
fusion_available = ["Concat"]
|
73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
# Function to handle evaluation and logging
|
76 |
+
def evaluate_and_log(models, dataset, task_type, eval_output, state):
    """Prepend one evaluation record to the session log and return the log.

    Args:
        models: Selected model names; stored as ``str(models)``.
        dataset: Dataset label, stored unchanged.
        task_type: ``'Classification'`` or ``'Regression'``, stored as the
            short code ``'CLS'`` / ``'RGR'``. Any other value is stored
            as given instead of raising ``KeyError``.
        eval_output: Evaluation text; every ``" Score"`` substring is
            removed before it is stored.
        state: Mutable mapping whose ``"log_df"`` entry holds the log
            DataFrame; the entry is replaced with the updated frame.

    Returns:
        The updated log DataFrame, newest entry first.
    """
    task_dic = {'Classification': 'CLS', 'Regression': 'RGR'}
    result = f"{eval_output}".replace(" Score", "")

    new_entry_df = pd.DataFrame(
        [
            {
                "Selected Models": str(models),
                "Dataset": dataset,
                "Task": task_dic.get(task_type, task_type),
                "Result": result,
            }
        ]
    )

    # ignore_index: each one-row frame carries index 0, so without it every
    # row of the accumulated log would share the same index label.
    state["log_df"] = pd.concat(
        [new_entry_df, state["log_df"]], ignore_index=True
    )
    return state["log_df"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
|
93 |
# Load images for selection
|
94 |
def load_image(path):
    """Open the image registered for a sample-molecule key.

    Args:
        path: Despite its name, a key of ``smiles_image_mapping``
            (the lookup is ``smiles_image_mapping[path]["image"]``).

    Returns:
        The object returned by ``Image.open`` for the mapped file, or
        ``None`` when the key is missing/unusable or the file cannot be
        opened.
    """
    try:
        return Image.open(smiles_image_mapping[path]["image"])
    except (KeyError, TypeError, OSError):
        # Best-effort lookup: an unknown or unset key, or an unreadable
        # file, yields no image. Unrelated errors (and KeyboardInterrupt)
        # are no longer swallowed as they were by the bare `except`.
        return None
|
99 |
|
100 |
|
|
|
101 |
# Function to handle image selection
|
102 |
def handle_image_selection(image_key):
|
103 |
smiles = smiles_image_mapping[image_key]["smiles"]
|
|
|
121 |
mol1 = Chem.MolFromSmiles(smiles1)
|
122 |
mol2 = Chem.MolFromSmiles(smiles2)
|
123 |
if mol1 and mol2:
|
|
|
|
|
124 |
fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2)
|
125 |
fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2)
|
126 |
return round(DataStructs.FingerprintSimilarity(fp1, fp2), 2)
|
127 |
return None
|
128 |
|
129 |
|
|
|
|
|
|
|
130 |
gen_tokenizer = AutoTokenizer.from_pretrained("ibm/materials.selfies-ted")
|
131 |
gen_model = BartForConditionalGeneration.from_pretrained("ibm/materials.selfies-ted")
|
132 |
|
133 |
|
134 |
def generate(latent_vector, mask):
    """Sample sequences from ``gen_model`` for a latent and decode them.

    ``latent_vector`` is wrapped in a ``BaseModelOutput`` and passed to
    ``gen_model.generate`` as the encoder output, with ``mask`` as the
    attention mask. Each decoded string is run through ``sf.decoder``.

    Returns:
        A list with one ``sf.decoder`` result per generated sequence.
    """
    sampled_ids = gen_model.generate(
        encoder_outputs=BaseModelOutput(latent_vector),
        attention_mask=mask,
        max_new_tokens=64,
        do_sample=True,
        top_k=5,
        top_p=0.95,
        num_return_sequences=1,
    )
    decoded = []
    for text in gen_tokenizer.batch_decode(sampled_ids, skip_special_tokens=True):
        # Drop the whitespace between a closing `]` and the next `[` before
        # handing the string to the SELFIES decoder.
        compact = re.sub(r'\]\s*(.*?)\s*\[', r']\1[', text)
        decoded.append(sf.decoder(compact))
    return decoded
|
|
|
|
|
|
|
147 |
|
148 |
|
149 |
def perturb_latent(latent_vecs, noise_scale=0.5):
    """Add uniform random noise to a latent tensor.

    Args:
        latent_vecs: Tensor to perturb.
        noise_scale: Upper bound of the noise; each element receives a value
            drawn uniformly from ``[0, noise_scale)``.

    Returns:
        A new tensor ``noise + latent_vecs`` with the shape of
        ``latent_vecs``. The noise is created with the dtype and device of
        ``latent_vecs`` (it used to be fixed to float32 on the default
        device), so the result stays compatible with the input tensor.
    """
    noise = np.random.uniform(0, 1, latent_vecs.shape) * noise_scale
    return (
        torch.tensor(
            noise,
            dtype=latent_vecs.dtype,
            device=latent_vecs.device,
        )
        + latent_vecs
    )
|
157 |
|
158 |
|
159 |
def encode(selfies):
    """Tokenize ``selfies`` and run it through the encoder of ``gen_model``.

    The input is tokenized to PyTorch tensors, truncated and padded to a
    fixed length of 128 tokens.

    Args:
        selfies: Text input accepted by ``gen_tokenizer``.

    Returns:
        A ``(last_hidden_state, attention_mask)`` pair: the encoder's final
        hidden states and the tokenizer's attention mask.
    """
    tokenized = gen_tokenizer(
        selfies,
        return_tensors='pt',
        max_length=128,
        truncation=True,
        padding='max_length',
    )
    attention_mask = tokenized['attention_mask']
    encoder_out = gen_model.model.encoder(
        input_ids=tokenized['input_ids'], attention_mask=attention_mask
    )
    return encoder_out.last_hidden_state, attention_mask
|
174 |
|
175 |
|
|
|
184 |
noise = i / 10
|
185 |
perturbed_latent = perturb_latent(latent_vec, noise_scale=noise)
|
186 |
gen = generate(perturbed_latent, mask)
|
187 |
+
mol = Chem.MolFromSmiles(gen[0])
|
188 |
+
if mol:
|
189 |
+
gen_mol = Chem.MolToSmiles(mol)
|
190 |
+
if gen_mol != Chem.MolToSmiles(Chem.MolFromSmiles(smiles)):
|
191 |
+
break
|
192 |
+
else:
|
193 |
+
print('Abnormal molecule:', gen[0])
|
194 |
|
195 |
if gen_mol:
|
196 |
# Calculate properties for ref and gen molecules
|
|
|
202 |
# Prepare the table with ref mol and gen mol
|
203 |
data = {
|
204 |
"Property": ["QED", "SA", "LogP", "Mol Wt", "Tanimoto Similarity"],
|
205 |
+
"Reference Mol": [
|
206 |
+
ref_properties[0],
|
207 |
+
ref_properties[1],
|
208 |
+
ref_properties[2],
|
209 |
+
ref_properties[3],
|
210 |
+
tanimoto_similarity,
|
211 |
+
],
|
212 |
+
"Generated Mol": [
|
213 |
+
gen_properties[0],
|
214 |
+
gen_properties[1],
|
215 |
+
gen_properties[2],
|
216 |
+
gen_properties[3],
|
217 |
+
"",
|
218 |
+
],
|
219 |
}
|
220 |
df = pd.DataFrame(data)
|
221 |
|
|
|
228 |
|
229 |
|
230 |
# Function to display evaluation score
|
231 |
+
def display_eval(selected_models, dataset, task_type, downstream, fusion_type, state):
|
232 |
result = None
|
233 |
|
234 |
try:
|
|
|
243 |
downstream_model = downstream_model.rstrip()
|
244 |
params = None
|
245 |
|
|
|
|
|
|
|
246 |
try:
|
247 |
if not selected_models:
|
248 |
return "Please select at least one enabled model."
|
249 |
|
|
|
|
|
|
|
|
|
|
|
250 |
if len(selected_models) > 1:
|
251 |
if task_type == "Classification":
|
|
|
|
|
|
|
252 |
if downstream_model == "Default Settings":
|
253 |
downstream_model = "DefaultClassifier"
|
254 |
params = None
|
|
|
|
|
|
|
|
|
255 |
|
256 |
+
(
|
257 |
+
result,
|
258 |
+
state["roc_auc"],
|
259 |
+
state["fpr"],
|
260 |
+
state["tpr"],
|
261 |
+
state["x_batch"],
|
262 |
+
state["y_batch"],
|
263 |
+
) = fm4m.multi_modal(
|
264 |
+
model_list=selected_models,
|
265 |
+
downstream_model=downstream_model,
|
266 |
+
params=params,
|
267 |
+
dataset=dataset,
|
268 |
+
)
|
269 |
|
270 |
+
elif task_type == "Regression":
|
271 |
if downstream_model == "Default Settings":
|
272 |
downstream_model = "DefaultRegressor"
|
273 |
params = None
|
274 |
|
275 |
+
(
|
276 |
+
result,
|
277 |
+
state["RMSE"],
|
278 |
+
state["y_batch_test"],
|
279 |
+
state["y_prob"],
|
280 |
+
state["x_batch"],
|
281 |
+
state["y_batch"],
|
282 |
+
) = fm4m.multi_modal(
|
283 |
+
model_list=selected_models,
|
284 |
+
downstream_model=downstream_model,
|
285 |
+
params=params,
|
286 |
+
dataset=dataset,
|
287 |
+
)
|
288 |
|
289 |
else:
|
290 |
if task_type == "Classification":
|
|
|
|
|
|
|
291 |
if downstream_model == "Default Settings":
|
292 |
downstream_model = "DefaultClassifier"
|
293 |
params = None
|
294 |
|
295 |
+
(
|
296 |
+
result,
|
297 |
+
state["roc_auc"],
|
298 |
+
state["fpr"],
|
299 |
+
state["tpr"],
|
300 |
+
state["x_batch"],
|
301 |
+
state["y_batch"],
|
302 |
+
) = fm4m.single_modal(
|
303 |
+
model=selected_models[0],
|
304 |
+
downstream_model=downstream_model,
|
305 |
+
params=params,
|
306 |
+
dataset=dataset,
|
307 |
+
)
|
308 |
|
309 |
elif task_type == "Regression":
|
|
|
|
|
|
|
|
|
310 |
if downstream_model == "Default Settings":
|
311 |
downstream_model = "DefaultRegressor"
|
312 |
params = None
|
313 |
|
314 |
+
(
|
315 |
+
result,
|
316 |
+
state["RMSE"],
|
317 |
+
state["y_batch_test"],
|
318 |
+
state["y_prob"],
|
319 |
+
state["x_batch"],
|
320 |
+
state["y_batch"],
|
321 |
+
) = fm4m.single_modal(
|
322 |
+
model=selected_models[0],
|
323 |
+
downstream_model=downstream_model,
|
324 |
+
params=params,
|
325 |
+
dataset=dataset,
|
326 |
+
)
|
327 |
|
328 |
if result == None:
|
329 |
result = "Data & Model Setting is incorrect"
|
|
|
333 |
|
334 |
|
335 |
# Function to handle plot display
|
336 |
+
def display_plot(plot_type, state):
|
337 |
fig, ax = plt.subplots()
|
338 |
|
339 |
if plot_type == "Latent Space":
|
340 |
+
x_batch, y_batch = state.get("x_batch"), state.get("y_batch")
|
341 |
ax.set_title("T-SNE Plot")
|
342 |
+
class_0 = x_batch
|
343 |
+
class_1 = y_batch
|
344 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
345 |
plt.scatter(class_1[:, 0], class_1[:, 1], c='red', label='Class 1')
|
346 |
plt.scatter(class_0[:, 0], class_0[:, 1], c='blue', label='Class 0')
|
347 |
|
|
|
350 |
ax.set_title('Dataset Distribution')
|
351 |
|
352 |
elif plot_type == "ROC-AUC":
|
353 |
+
roc_auc, fpr, tpr = state.get("roc_auc"), state.get("fpr"), state.get("tpr")
|
354 |
ax.set_title("ROC-AUC Curve")
|
355 |
try:
|
356 |
+
ax.plot(
|
357 |
+
fpr,
|
358 |
+
tpr,
|
359 |
+
color='darkorange',
|
360 |
+
lw=2,
|
361 |
+
label=f'ROC curve (area = {roc_auc:.4f})',
|
362 |
+
)
|
363 |
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
|
364 |
ax.set_xlim([0.0, 1.0])
|
365 |
ax.set_ylim([0.0, 1.05])
|
|
|
371 |
ax.legend(loc='lower right')
|
372 |
|
373 |
elif plot_type == "Parity Plot":
|
374 |
+
RMSE, y_batch_test, y_prob = (
|
375 |
+
state.get("RMSE"),
|
376 |
+
state.get("y_batch_test"),
|
377 |
+
state.get("y_prob"),
|
378 |
+
)
|
379 |
ax.set_title("Parity plot")
|
380 |
|
381 |
# change format
|
|
|
384 |
print(y_prob)
|
385 |
y_batch_test = np.array(y_batch_test, dtype=float)
|
386 |
y_prob = np.array(y_prob, dtype=float)
|
387 |
+
ax.scatter(
|
388 |
+
y_batch_test,
|
389 |
+
y_prob,
|
390 |
+
color="blue",
|
391 |
+
label=f"Predicted vs Actual (RMSE: {RMSE:.4f})",
|
392 |
+
)
|
393 |
min_val = min(min(y_batch_test), min(y_prob))
|
394 |
max_val = max(max(y_batch_test), max(y_prob))
|
395 |
ax.plot([min_val, max_val], [min_val, max_val], 'r-')
|
|
|
402 |
print(y_batch_test)
|
403 |
print(y_prob)
|
404 |
|
|
|
|
|
|
|
|
|
405 |
ax.set_xlabel('Actual Values')
|
406 |
ax.set_ylabel('Predicted Values')
|
407 |
|
|
|
420 |
# Function to load a predefined dataset from the local path
|
421 |
def load_predefined_dataset(dataset_name):
    """Load a predefined dataset preview and its column choices.

    Args:
        dataset_name: Key looked up in ``predefined_datasets``.

    Returns:
        A 4-tuple ``(preview, input_update, output_update, message)``.
        For a known dataset: the first rows of the CSV, two ``gr.update``
        objects listing its columns, and the lower-cased dataset name.
        Otherwise: an empty DataFrame, two empty-choice updates and
        ``"Dataset not found"``.
    """
    val = predefined_datasets.get(dataset_name)
    # The first comma-separated field of the entry is used as the CSV path.
    # Entries that are missing or not strings (for example the
    # "Custom Dataset" option, which has no entry) have no path; this
    # replaces a bare ``except`` that hid every other error as well.
    file_path = val.split(",")[0] if isinstance(val, str) else None

    if not file_path:
        return (
            pd.DataFrame(),
            gr.update(choices=[]),
            gr.update(choices=[]),
            "Dataset not found",
        )

    df = pd.read_csv(file_path)
    return (
        df.head(),
        gr.update(choices=list(df.columns)),
        gr.update(choices=list(df.columns)),
        dataset_name.lower(),
    )
|
442 |
|
443 |
|
444 |
# Function to display the head of the uploaded CSV file
|
|
|
446 |
if file is not None:
|
447 |
# Load the CSV file into a DataFrame
|
448 |
df = pd.read_csv(file.name)
|
449 |
+
return (
|
450 |
+
df.head(),
|
451 |
+
gr.update(choices=list(df.columns)),
|
452 |
+
gr.update(choices=list(df.columns)),
|
453 |
+
)
|
454 |
return pd.DataFrame(), gr.update(choices=[]), gr.update(choices=[])
|
455 |
|
456 |
|
|
|
458 |
def handle_dataset_selection(selected_dataset):
    """Return visibility updates for the eight dataset-setting widgets.

    Args:
        selected_dataset: Current value of the dataset selector.

    Returns:
        A tuple of eight ``gr.update(visible=...)`` objects. For
        ``"Custom Dataset"`` every widget except the sixth is shown; for any
        other selection only the first is shown.
    """
    # NOTE(review): the sixth flag is False in both cases -- confirm that is
    # intended for the widget wired to that output.
    if selected_dataset == "Custom Dataset":
        visibility = (True, True, True, True, True, False, True, True)
    else:
        visibility = (True, False, False, False, False, False, False, False)
    return tuple(gr.update(visible=shown) for shown in visibility)
|
482 |
|
483 |
|
484 |
# Function to select input and output columns and display a message
|
485 |
+
def select_columns(input_column, output_column, train_data, test_data, dataset_name):
    """Build the comma-separated dataset descriptor for a custom dataset.

    Args:
        input_column: Name of the selected input column.
        output_column: Name of the selected output column.
        train_data: Uploaded train file object exposing ``.name``, or ``None``.
        test_data: Uploaded test file object exposing ``.name``, or ``None``.
        dataset_name: Name given to the dataset.

    Returns:
        ``"<train path>,<test path>,<input>,<output>,<dataset name>"`` when
        everything is available, otherwise a message asking for the missing
        selection or upload.
    """
    if not (input_column and output_column):
        return "Please select both input and output columns."
    # The column selectors can change before both files are uploaded; guard
    # against dereferencing ``.name`` on a missing file.
    if train_data is None or test_data is None:
        return "Please upload both train and test datasets."
    return f"{train_data.name},{test_data.name},{input_column},{output_column},{dataset_name}"
|
489 |
|
490 |
+
|
491 |
+
def set_dataname(dataset_name, dataset_selector):
    """Return the dataset name to use, formatted as a string.

    Args:
        dataset_name: Name typed by the user for a custom dataset.
        dataset_selector: Current value of the dataset selector.

    Returns:
        ``dataset_name`` when the selector is ``"Custom Dataset"``, otherwise
        the selector value itself; both rendered through an f-string.
    """
    chosen = dataset_name if dataset_selector == "Custom Dataset" else dataset_selector
    return f"{chosen}"
|
495 |
|
496 |
+
|
497 |
# Function to create model based on user input
|
498 |
+
def create_model(
|
499 |
+
model_name, max_depth=None, n_estimators=None, alpha=None, degree=None, kernel=None
|
500 |
+
):
|
501 |
if model_name == "XGBClassifier":
|
502 |
+
model = xgb.XGBClassifier(
|
503 |
+
objective='binary:logistic',
|
504 |
+
eval_metric='auc',
|
505 |
+
max_depth=max_depth,
|
506 |
+
n_estimators=n_estimators,
|
507 |
+
alpha=alpha,
|
508 |
+
)
|
509 |
elif model_name == "SVR":
|
510 |
model = SVR(degree=degree, kernel=kernel)
|
511 |
elif model_name == "Kernel Ridge":
|
|
|
519 |
return "Model not supported."
|
520 |
|
521 |
return f"{model_name} * {model.get_params()}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
522 |
|
523 |
|
524 |
# Define the Gradio layout
|
|
|
525 |
with gr.Blocks() as demo:
|
526 |
+
log_df = pd.DataFrame(
|
527 |
+
{"": [], 'Selected Models': [], 'Dataset': [], 'Task': [], 'Result': []}
|
528 |
+
)
|
529 |
+
state = gr.State({"log_df": log_df})
|
530 |
with gr.Row():
|
531 |
# Left Column
|
532 |
with gr.Column():
|
533 |
+
gr.HTML(
|
534 |
+
'''
|
535 |
<div style="background-color: #6A8EAE; color: #FFFFFF; padding: 10px;">
|
536 |
<h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Data & Model Setting</h3>
|
537 |
</div>
|
538 |
+
'''
|
539 |
+
)
|
|
|
|
|
540 |
# Dropdown menu for predefined datasets including "Custom Dataset" option
|
541 |
+
dataset_selector = gr.Dropdown(
|
542 |
+
label="Select Dataset",
|
543 |
+
choices=list(predefined_datasets.keys()) + ["Custom Dataset"],
|
544 |
+
)
|
545 |
# Display the message for selected columns
|
546 |
+
selected_columns_message = gr.Textbox(
|
547 |
+
label="Selected Columns Info", visible=False
|
548 |
+
)
|
549 |
|
550 |
with gr.Accordion("Dataset Settings", open=True):
|
551 |
# File upload options for custom dataset (train and test)
|
552 |
dataset_name = gr.Textbox(label="Dataset Name", visible=False)
|
553 |
+
train_file = gr.File(
|
554 |
+
label="Upload Custom Train Dataset",
|
555 |
+
file_types=[".csv"],
|
556 |
+
visible=False,
|
557 |
+
)
|
558 |
+
train_display = gr.Dataframe(
|
559 |
+
label="Train Dataset Preview (First 5 Rows)",
|
560 |
+
visible=False,
|
561 |
+
interactive=False,
|
562 |
+
)
|
563 |
|
564 |
+
test_file = gr.File(
|
565 |
+
label="Upload Custom Test Dataset",
|
566 |
+
file_types=[".csv"],
|
567 |
+
visible=False,
|
568 |
+
)
|
569 |
+
test_display = gr.Dataframe(
|
570 |
+
label="Test Dataset Preview (First 5 Rows)",
|
571 |
+
visible=False,
|
572 |
+
interactive=False,
|
573 |
+
)
|
574 |
|
575 |
# Predefined dataset displays
|
576 |
+
predefined_display = gr.Dataframe(
|
577 |
+
label="Predefined Dataset Preview (First 5 Rows)",
|
578 |
+
visible=False,
|
579 |
+
interactive=False,
|
580 |
+
)
|
581 |
|
582 |
# Dropdowns for selecting input and output columns for the custom dataset
|
583 |
+
input_column_selector = gr.Dropdown(
|
584 |
+
label="Select Input Column", choices=[], visible=False
|
585 |
+
)
|
586 |
+
output_column_selector = gr.Dropdown(
|
587 |
+
label="Select Output Column", choices=[], visible=False
|
588 |
+
)
|
589 |
|
590 |
# When a dataset is selected, show either file upload fields (for custom) or load predefined datasets
|
591 |
+
dataset_selector.change(
|
592 |
+
handle_dataset_selection,
|
593 |
+
inputs=dataset_selector,
|
594 |
+
outputs=[
|
595 |
+
dataset_name,
|
596 |
+
train_file,
|
597 |
+
train_display,
|
598 |
+
test_file,
|
599 |
+
test_display,
|
600 |
+
predefined_display,
|
601 |
+
input_column_selector,
|
602 |
+
output_column_selector,
|
603 |
+
],
|
604 |
+
)
|
605 |
|
606 |
# When a predefined dataset is selected, load its head and update column selectors
|
607 |
+
dataset_selector.change(
|
608 |
+
load_predefined_dataset,
|
609 |
+
inputs=dataset_selector,
|
610 |
+
outputs=[
|
611 |
+
predefined_display,
|
612 |
+
input_column_selector,
|
613 |
+
output_column_selector,
|
614 |
+
selected_columns_message,
|
615 |
+
],
|
616 |
+
)
|
617 |
|
618 |
# When a custom train file is uploaded, display its head and update column selectors
|
619 |
+
train_file.change(
|
620 |
+
display_csv_head,
|
621 |
+
inputs=train_file,
|
622 |
+
outputs=[
|
623 |
+
train_display,
|
624 |
+
input_column_selector,
|
625 |
+
output_column_selector,
|
626 |
+
],
|
627 |
+
)
|
628 |
|
629 |
# When a custom test file is uploaded, display its head
|
630 |
+
test_file.change(
|
631 |
+
display_csv_head,
|
632 |
+
inputs=test_file,
|
633 |
+
outputs=[
|
634 |
+
test_display,
|
635 |
+
input_column_selector,
|
636 |
+
output_column_selector,
|
637 |
+
],
|
638 |
+
)
|
639 |
|
640 |
+
dataset_selector.change(
|
641 |
+
set_dataname,
|
642 |
+
inputs=[dataset_name, dataset_selector],
|
643 |
+
outputs=dataset_name,
|
644 |
+
)
|
645 |
|
646 |
# Update the selected columns information when dropdown values are changed
|
647 |
+
input_column_selector.change(
|
648 |
+
select_columns,
|
649 |
+
inputs=[
|
650 |
+
input_column_selector,
|
651 |
+
output_column_selector,
|
652 |
+
train_file,
|
653 |
+
test_file,
|
654 |
+
dataset_name,
|
655 |
+
],
|
656 |
+
outputs=selected_columns_message,
|
657 |
+
)
|
658 |
|
659 |
+
output_column_selector.change(
|
660 |
+
select_columns,
|
661 |
+
inputs=[
|
662 |
+
input_column_selector,
|
663 |
+
output_column_selector,
|
664 |
+
train_file,
|
665 |
+
test_file,
|
666 |
+
dataset_name,
|
667 |
+
],
|
668 |
+
outputs=selected_columns_message,
|
669 |
+
)
|
670 |
|
671 |
+
model_checkbox = gr.CheckboxGroup(
|
672 |
+
choices=models_enabled, label="Select Model"
|
673 |
+
)
|
674 |
|
675 |
+
task_radiobutton = gr.Radio(
|
676 |
+
choices=["Classification", "Regression"], label="Task Type"
|
677 |
+
)
|
678 |
|
679 |
####### adding hyper parameter tuning ###########
|
680 |
+
model_name = gr.Dropdown(
|
681 |
+
[
|
682 |
+
"Default - Auto",
|
683 |
+
"XGBClassifier",
|
684 |
+
"SVR",
|
685 |
+
"Kernel Ridge",
|
686 |
+
"Linear Regression",
|
687 |
+
],
|
688 |
+
label="Select Downstream Model",
|
689 |
+
)
|
690 |
with gr.Accordion("Downstream Hyperparameter Settings", open=True):
|
691 |
# Create placeholders for hyperparameter components
|
692 |
+
max_depth = gr.Slider(1, 20, step=1, visible=False, label="max_depth")
|
693 |
+
n_estimators = gr.Slider(
|
694 |
+
100, 5000, step=100, visible=False, label="n_estimators"
|
695 |
+
)
|
696 |
alpha = gr.Slider(0.1, 10.0, step=0.1, visible=False, label="alpha")
|
697 |
+
degree = gr.Slider(1, 20, step=1, visible=False, label="degree")
|
698 |
+
kernel = gr.Dropdown(
|
699 |
+
choices=["rbf", "poly", "linear"], visible=False, label="kernel"
|
700 |
+
)
|
701 |
|
702 |
# Output textbox
|
703 |
output = gr.Textbox(label="Loaded Parameters")
|
704 |
|
|
|
705 |
# Dynamically show relevant hyperparameters based on selected model
|
706 |
def update_hyperparameters(model_name):
    """Show only the hyperparameter widgets relevant to ``model_name``.

    Args:
        model_name: Selected downstream model name.

    Returns:
        A tuple of five ``gr.update(visible=...)`` objects, in the order
        max_depth, n_estimators, alpha, degree, kernel. Models without
        tunable settings, and unrecognised or cleared selections, hide all
        five widgets.
    """
    # Visibility flags per model, in output order:
    # (max_depth, n_estimators, alpha, degree, kernel).
    visible_by_model = {
        "XGBClassifier": (True, True, True, False, False),
        "SVR": (False, False, False, True, True),
        "Kernel Ridge": (False, False, True, True, True),
    }
    # "Linear Regression", "Default - Auto" and anything unexpected fall back
    # to hiding everything, so the handler always returns five updates
    # instead of an implicit None.
    all_hidden = (False, False, False, False, False)
    flags = visible_by_model.get(model_name, all_hidden)
    return tuple(gr.update(visible=flag) for flag in flags)
|
747 |
|
748 |
# When model is selected, update which hyperparameters are visible
|
749 |
+
model_name.change(
|
750 |
+
update_hyperparameters,
|
751 |
+
inputs=[model_name],
|
752 |
+
outputs=[max_depth, n_estimators, alpha, degree, kernel],
|
753 |
+
)
|
754 |
|
755 |
# Submit button to create the model with selected hyperparameters
|
756 |
submit_button = gr.Button("Create Downstream Model")
|
757 |
|
|
|
758 |
# Function to handle model creation based on input parameters
|
759 |
def on_submit(model_name, max_depth, n_estimators, alpha, degree, kernel):
    """Forward the hyperparameters relevant to ``model_name`` to ``create_model``.

    Args:
        model_name: Selected downstream model name.
        max_depth: Passed on for "XGBClassifier".
        n_estimators: Passed on for "XGBClassifier".
        alpha: Passed on for "XGBClassifier" and "Kernel Ridge".
        degree: Passed on for "SVR" and "Kernel Ridge".
        kernel: Passed on for "SVR" and "Kernel Ridge".

    Returns:
        The result of ``create_model`` for a recognised model name, or
        ``None`` for any other value.
    """
    keyword_args = {
        "XGBClassifier": {
            "max_depth": max_depth,
            "n_estimators": n_estimators,
            "alpha": alpha,
        },
        "SVR": {"degree": degree, "kernel": kernel},
        "Kernel Ridge": {"alpha": alpha, "degree": degree, "kernel": kernel},
        "Linear Regression": {},
        "Default - Auto": {},
    }
    if model_name not in keyword_args:
        return None
    return create_model(model_name, **keyword_args[model_name])
|
777 |
|
778 |
# When the submit button is clicked, run the on_submit function
|
779 |
+
submit_button.click(
|
780 |
+
on_submit,
|
781 |
+
inputs=[model_name, max_depth, n_estimators, alpha, degree, kernel],
|
782 |
+
outputs=output,
|
783 |
+
)
|
784 |
###### End of hyper param tuning #########
|
785 |
|
786 |
fusion_radiobutton = gr.Radio(choices=fusion_available, label="Fusion Type")
|
787 |
|
|
|
|
|
788 |
eval_button = gr.Button("Train downstream model")
|
|
|
789 |
|
790 |
# Middle Column
|
791 |
with gr.Column():
|
792 |
+
gr.HTML(
|
793 |
+
'''
|
794 |
<div style="background-color: #8F9779; color: #FFFFFF; padding: 10px;">
|
795 |
<h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Downstream Task 1: Property Prediction</h3>
|
796 |
</div>
|
797 |
+
'''
|
798 |
+
)
|
799 |
eval_output = gr.Textbox(label="Train downstream model")
|
800 |
|
801 |
+
plot_radio = gr.Radio(
|
802 |
+
choices=["ROC-AUC", "Parity Plot", "Latent Space"],
|
803 |
+
label="Select Plot Type",
|
804 |
+
)
|
805 |
+
plot_output = gr.Plot(label="Visualization")
|
806 |
|
807 |
create_log = gr.Button("Store log")
|
808 |
|
809 |
+
log_table = gr.Dataframe(
|
810 |
+
value=log_df, label="Log of Selections and Results", interactive=False
|
811 |
+
)
|
812 |
+
|
813 |
+
eval_button.click(
|
814 |
+
display_eval,
|
815 |
+
inputs=[
|
816 |
+
model_checkbox,
|
817 |
+
selected_columns_message,
|
818 |
+
task_radiobutton,
|
819 |
+
output,
|
820 |
+
fusion_radiobutton,
|
821 |
+
state,
|
822 |
+
],
|
823 |
+
outputs=eval_output,
|
824 |
+
)
|
825 |
+
|
826 |
+
plot_radio.change(
|
827 |
+
display_plot, inputs=[plot_radio, state], outputs=plot_output
|
828 |
+
)
|
829 |
|
830 |
# Function to gather selected models
|
831 |
def gather_selected_models(*models):
    """Return the truthy arguments as a list, preserving their order."""
    return list(filter(None, models))
|
834 |
|
835 |
+
create_log.click(
|
836 |
+
evaluate_and_log,
|
837 |
+
inputs=[
|
838 |
+
model_checkbox,
|
839 |
+
dataset_name,
|
840 |
+
task_radiobutton,
|
841 |
+
eval_output,
|
842 |
+
state,
|
843 |
+
],
|
844 |
+
outputs=log_table,
|
845 |
+
)
|
846 |
# Right Column
|
847 |
with gr.Column():
|
848 |
+
gr.HTML(
|
849 |
+
'''
|
850 |
<div style="background-color: #D2B48C; color: #FFFFFF; padding: 10px;">
|
851 |
<h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Downstream Task 2: Molecule Generation</h3>
|
852 |
</div>
|
853 |
+
'''
|
854 |
+
)
|
855 |
smiles_input = gr.Textbox(label="Input SMILES String")
|
856 |
image_display = gr.Image(label="Molecule Image", height=250, width=250)
|
857 |
# Show images for selection
|
|
|
860 |
choices=list(smiles_image_mapping.keys()),
|
861 |
label="Select from sample molecules",
|
862 |
value=None,
|
|
|
863 |
)
|
864 |
image_selector.change(load_image, image_selector, image_display)
|
865 |
generate_button = gr.Button("Generate")
|
866 |
+
gen_image_display = gr.Image(
|
867 |
+
label="Generated Molecule Image", height=250, width=250
|
868 |
+
)
|
869 |
generated_output = gr.Textbox(label="Generated Output")
|
870 |
property_table = gr.Dataframe(label="Molecular Properties Comparison")
|
871 |
|
|
|
|
|
872 |
# Handle image selection
|
873 |
+
image_selector.change(
|
874 |
+
handle_image_selection,
|
875 |
+
inputs=image_selector,
|
876 |
+
outputs=[smiles_input, image_display],
|
877 |
+
)
|
878 |
+
smiles_input.change(
|
879 |
+
smiles_to_image, inputs=smiles_input, outputs=image_display
|
880 |
+
)
|
881 |
|
882 |
# Generate button to display canonical SMILES and molecule image
|
883 |
+
generate_button.click(
|
884 |
+
generate_canonical,
|
885 |
+
inputs=smiles_input,
|
886 |
+
outputs=[property_table, generated_output, gen_image_display],
|
887 |
+
)
|
888 |
|
889 |
|
890 |
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
|
891 |
+
demo.launch(server_name="0.0.0.0")
|
data/lce/test.csv
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
smi1,conc1,smi2,conc2,smi3,conc3,smi4,conc4,smi5,conc5,smi6,conc6,LCE
|
2 |
+
C1C(OC(=O)O1)F,0.733,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.267,O,0.0,O,0.0,O,0.0,O,0.0,1.629
|
3 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.431,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,O,0.0,1.085
|
4 |
+
COC(=O)OC,0.299,C(C(F)(F)F)OCC(F)(F)F,0.598,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.103,O,0.0,O,0.0,O,0.0,2.056
|
5 |
+
COCCOC,0.358,O1CCOC1,0.532,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.074,[Li+].[N+](=O)([O-])[O-],,O,0.0,O,0.0,1.658
|
6 |
+
C1COC(=O)O1,0.197,COC(=O)OC,0.156,COCCOCCOCCOCCOC,0.59,[Li+].F[P-](F)(F)(F)(F)F,0.026,[Li+].[N+](=O)([O-])[O-],0.031,O,0.0,1.638
|
7 |
+
C1COC(=O)O1,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.276
|
8 |
+
O1CCOC1,0.368,COCCOC,0.547,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.076,CSi(C)(C)([N+]).C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.008,O,0.0,O,0.0,1.569
|
9 |
+
COCCOC,0.507,COC(C(F)(F)F)C(F)(F)F,0.399,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.095,O,0.0,O,0.0,O,0.0,2.268
|
10 |
+
C1COC(=O)O1,0.425,O=C(OCC)OCC(F)(F)F,0.481,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0.0,O,0.0,O,0.0,1.602
|
11 |
+
C1C(OC(=O)O1)F,0.318,CCOC(=O)OC,0.504,COC(=O)OC,0.094,B(O[Si](C)(C)C)(O[Si](C)(C)C)O[Si](C)(C),0.083,[Li+].F[P-](F)(F)(F)(F)F,0.001,O,0.0,1.678
|
12 |
+
O=S1(=O)CCCC1,0.359,C(C(F)(F)F)OC(C(F)F)(F)F,0.504,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.133,[Li+].[N+](=O)([O-])[O-],0.004,O,0.0,O,0.0,2.0
|
13 |
+
C1COC(=O)O1,0.594,O=C(OCC)OCC,0.327,[Li+].F[P-](F)(F)(F)(F)F,0.079,O,0.0,O,0.0,O,0.0,0.921
|
14 |
+
C1COC(=O)O1,0.331,O=C(OCC)OCC,0.577,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.092,O,0.0,O,0.0,O,0.0,1.301
|
15 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].C(C(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(C(F)(F)F)(F)F)(F)(F)F,0.069,O,0.0,O,0.0,0.854
|
16 |
+
C1C(OC(=O)O1)F,0.107,C1COC(=O)O1,0.526,O=C(OCC)OCC,0.289,[Li+].F[P-](F)(F)(F)(F)F,0.078,O,0.0,O,0.0,1.108
|
17 |
+
O1CCOC1,0.322,COCCOC,0.478,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.2,O,0.0,O,0.0,O,0.0,1.523
|
18 |
+
CC1COC(=O)O1,0.595,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.405,O,0.0,O,0.0,O,0.0,O,0.0,1.921
|
19 |
+
CC1COC(=O)O1,0.702,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.298,O,0.0,O,0.0,O,0.0,O,0.0,1.602
|
20 |
+
O1CCOC1,0.375,COCCOC,0.557,[Li+][S-]SSS[S-][Li+],,[Li+].[N+](=O)([O-])[O-],0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.061,O,0.0,1.523
|
21 |
+
COC(=O)OC,0.161,FC(F)C(F)(F)COC(F)(F)C(F)F,0.355,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.484,O,0.0,O,0.0,O,0.0,2.155
|
22 |
+
C1COC(=O)O1,0.338,COC(=O)OC,0.625,[Li+].[O-]P(=O)(F)F,0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.03,O,0.0,O,0.0,1.26
|
23 |
+
CN(C)C(=O)C(F)(F)F,0.362,C1C(OC(=O)O1)F,0.556,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.081,O,0.0,O,0.0,O,0.0,2.155
|
24 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.0,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.225
|
25 |
+
COCCOC,0.231,FC1CCCCC1,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,2.155
|
26 |
+
COCCOC,0.277,FC(F)C(F)(F)COC(F)(F)C(F)F,0.555,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.168,O,0.0,O,0.0,O,0.0,2.155
|
27 |
+
O1C(C)CCC1,0.331,FC(F)C(F)(F)COC(F)(F)C(F)F,0.498,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.171,O,0.0,O,0.0,O,0.0,2.301
|
28 |
+
COCC(F)(F)C(F)(F)COC,0.864,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.136,O,0.0,O,0.0,O,0.0,O,0.0,1.991
|
29 |
+
COC(=O)OC,0.29,C(C(F)(F)F)OCC(F)(F)F,0.589,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0.0,O,0.0,O,0.0,2.301
|
30 |
+
C1COC(=O)O1,0.425,O=C(OCC)OCC,0.234,[Li+].F[P-](F)(F)(F)(F)F,0.34,O,0.0,O,0.0,O,0.0,1.398
|
31 |
+
COCCOC,0.707,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.147,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.147,O,0.0,O,0.0,O,0.0,1.268
|
data/lce/test_data.csv
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
smiles1,conc1,mol1,smiles2,conc2,mol2,smiles3,conc3,mol3,smiles4,conc4,mol4,smiles5,conc5,mol5,smiles6,conc6,LCE_Predicted,LCE
|
2 |
+
C1COC(=O)O1,0.519,51.92400559,COC(=O)OC,0.411,41.14791596,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.069,6.928078454,O,0,0,O,0,0,O,0,1.187,1.094
|
3 |
+
COCCOC,0.596,59.5609428,COCCOCCOCCOCCOC,0.281,28.07124115,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.124,12.36781605,O,0,0,O,0,0,O,0,1.691,1.384
|
4 |
+
C1COC(=O)O1,0.285,28.50894036,C1C(OC(=O)O1)F,0.261,26.07552384,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.228,22.82322096,COC(=O)OC,0.226,22.59231484,O,0,0,O,0,1.508,1.468
|
5 |
+
COCCOC,0.434,43.4423376,COCCOCCOCCOCCOC,0.205,20.47449683,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.361,36.08316557,O,0,0,O,0,0,O,0,1.882,1.71
|
6 |
+
C1C(OC(=O)O1)F,0.187,18.72872664,COC(=O)OC,0.162,16.22691423,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.109,10.92850826,FC(F)C(F)(F)COC(F)(F)C(F)F,0.541,54.11585087,O,0,0,O,0,2.103,1.832
|
7 |
+
C1COC(=O)O1,0.134,13.35070843,C1C(OC(=O)O1)F,0.122,12.2111419,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.107,10.72028474,COC(=O)OC,0.106,10.57995858,FC(F)C(F)(F)COC(F)(F)C(F)F,0.531,53.13790635,O,0,2.077,2.104
|
8 |
+
COCCOC,0.096,9.614613177,COCCOCCOCCOCCOC,0.045,4.53139444,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.12,12.01491409,C1COCO1,0.143,14.28400162,FC(F)C(F)(F)COC(F)(F)C(F)F,0.596,59.55507668,O,0,2.211,2.274
|
9 |
+
C1COC(=O)O1,0.519,51.92400559,COC(=O)OC,0.411,41.14791596,[Li+].F[P-](F)(F)(F)(F)F,0.069,6.928078454,O,0,0,O,0,0,O,0,1.17,1.071
|
10 |
+
C1COC(=O)O1,0.519,51.92400559,COC(=O)OC,0.411,41.14791596,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.069,6.928078454,O,0,0,O,0,0,O,0,1.077,1.166
|
11 |
+
C1COC(=O)O1,0.519,51.85215842,COC(=O)OC,0.411,41.09097965,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.069,6.918492083,[Li+].[N+](=O)([O-])[O-],0.001,0.138369842,O,0,0,O,0,1.19,1.335
|
12 |
+
C1COC(=O)O1,0.513,51.33049845,COC(=O)OC,0.407,40.6775828,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.069,6.9173773,C1=COC(=O)O1,0.011,1.07454145,O,0,0,O,0,1.114,1.129
|
13 |
+
COCCOC,0.53,53.00533987,COCCOCCOCCOCCOC,0.25,24.98156691,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.22,22.01309322,O,0,0,O,0,0,O,0,1.758,1.501
|
14 |
+
COCCOC,0.477,47.74974224,COCCOCCOCCOCCOC,0.225,22.50458884,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.297,29.74566892,O,0,0,O,0,0,O,0,1.821,1.663
|
data/lce/train.csv
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
smi1,conc1,smi2,conc2,smi3,conc3,smi4,conc4,smi5,conc5,smi6,conc6,LCE
|
2 |
+
C1COC(=O)O1,0.327,O=C(OCC)OCC,0.594,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.079,O,0.0,O,0.0,O,0.0,1.155
|
3 |
+
C1COC(=O)O1,0.356,COC(=O)OC,0.566,FC(F)(F)COB(OCC(F)(F)F)OCC(F)(F)F,0.007,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.046
|
4 |
+
O=S1(=O)CCCC1,0.25,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.75,O,0.0,O,0.0,O,0.0,O,0.0,1.569
|
5 |
+
C1COC(=O)O1,0.331,O=C(OCC)OCC,0.577,[Li+].F[P-](F)(F)(F)(F)F,0.092,O,0.0,O,0.0,O,0.0,0.886
|
6 |
+
COCCOC,0.763,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.237,O,0.0,O,0.0,O,0.0,O,0.0,1.367
|
7 |
+
COCCOC,0.2,FC(F)C(F)(F)COC(F)(F)C(F)F,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.2,O,0.0,O,0.0,O,0.0,2.301
|
8 |
+
C1C(OC(=O)O1)F,0.873,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.127,O,0.0,O,0.0,O,0.0,O,0.0,1.489
|
9 |
+
COCCOC,0.706,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.008,[Li+].[O-]P(=O)(F)F,0.286,O,0.0,O,0.0,O,0.0,1.244
|
10 |
+
C1COC(=O)O1,0.3,CCOC(=O)OC,0.593,C1=COC(=O)O1,0.026,[Li+].F[P-](F)(F)(F)(F)F,0.081,O,0.0,O,0.0,0.745
|
11 |
+
COCCOC,0.763,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.174,[Li+].[O-]P(=O)(F)F,0.063,O,0.0,O,0.0,O,0.0,1.292
|
12 |
+
CCOCC,0.313,C(C(F)(F)F)OCC(F)(F)F,0.51,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.177,O,0.0,O,0.0,O,0.0,2.301
|
13 |
+
O=S1(=O)CCCC1,0.75,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.25,O,0.0,O,0.0,O,0.0,O,0.0,1.745
|
14 |
+
COC(=O)OC,0.29,C(C(F)(F)F)OCC(F)(F)F,0.589,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0.0,O,0.0,O,0.0,1.745
|
15 |
+
C1COC(=O)O1,0.682,CCOC(=O)OC,0.247,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.043,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.028,O,0.0,O,0.0,1.076
|
16 |
+
C1COC(=O)O1,0.359,COC(=O)OC,0.569,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,O,0.0,0.854
|
17 |
+
C1COC(=O)O1,0.305,COC(=O)OC,0.242,COCCOCCOCCOCCOC,0.392,[Li+].F[P-](F)(F)(F)(F)F,0.041,[Li+].[N+](=O)([O-])[O-],0.02,O,0.0,1.678
|
18 |
+
FC(F)(F)COCCOCC,0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0.0,O,0.0,O,0.0,O,0.0,2.155
|
19 |
+
CC#N,0.882,FC,0.065,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,,O,0.0,O,0.0,O,0.0,2.222
|
20 |
+
COC(C)C(C)OC,0.879,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0.0,O,0.0,O,0.0,O,0.0,1.638
|
21 |
+
CCOP(=O)(OCC)OCC,0.728,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.272,O,0.0,O,0.0,O,0.0,O,0.0,2.0
|
22 |
+
COC(=O)OC,0.375,FC(F)C(F)(F)COC(F)(F)C(F)F,0.375,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.25,O,0.0,O,0.0,O,0.0,1.854
|
23 |
+
O1CCOC1,0.371,COCCOC,0.552,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.077,O,0.0,O,0.0,O,0.0,1.959
|
24 |
+
C1C(OC(=O)O1)F,0.774,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.226,O,0.0,O,0.0,O,0.0,O,0.0,1.587
|
25 |
+
CC1COC(=O)O1,0.875,C1C(OC(=O)O1)F,0.051,[Li+].[O-]Cl(=O)(=O)=O,0.074,O,0.0,O,0.0,O,0.0,0.699
|
26 |
+
C1C(OC(=O)O1)F,0.264,COC(=O)OCCF,0.479,C(C(F)(F)F)OC(C(F)F)(F)F,0.155,[Li+].F[P-](F)(F)(F)(F)F,0.103,O,0.0,O,0.0,2.097
|
27 |
+
C1C(OC(=O)O1)F,0.413,O=C(OCC)OCC,0.497,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.09,O,0.0,O,0.0,O,0.0,1.59
|
28 |
+
C1C(OC(=O)O1)F,0.106,C1COC(=O)O1,0.522,O=C(OCC)OCC,0.287,[Li+].F[P-](F)(F)(F)(F)F,0.077,[Rb+].[O-][N+]([O-])=O,0.004,O1CCOCCOCCOCCOCCOCC1,0.004,1.252
|
29 |
+
COCCOC,0.259,B(OCC(F)(F)F)(OCC(F)(F)F)OCC(F)(F)F,0.556,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.185,O,0.0,O,0.0,O,0.0,1.337
|
30 |
+
C1CCOC1,0.925,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.075,O,0.0,O,0.0,O,0.0,O,0.0,1.377
|
31 |
+
C1C(OC(=O)O1)F,0.82,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.18,O,0.0,O,0.0,O,0.0,O,0.0,1.544
|
32 |
+
CCOP(=O)(OCC)OCC,0.5,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.5,O,0.0,O,0.0,O,0.0,O,0.0,2.097
|
33 |
+
COCCOC,0.731,[Li+].[O-]P(=O)(F)F,0.064,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.205,O,0.0,O,0.0,O,0.0,1.215
|
34 |
+
COCCOCCOCCOCCOC,0.819,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.181,O,0.0,O,0.0,O,0.0,O,0.0,1.222
|
35 |
+
C1COC(=O)O1,0.338,COC(=O)OC,0.625,[Li+].[O-]P(=O)(F)F,0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.03,O,0.0,O,0.0,1.194
|
36 |
+
O1CCOC1,0.463,COCCOC,0.312,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.194,[Li+].[N+](=O)([O-])[O-],0.03,O,0.0,O,0.0,1.824
|
37 |
+
C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.333
|
38 |
+
O1CCOC1,0.539,COCCOC,0.363,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.075,[Li+].[N+](=O)([O-])[O-],0.023,O,0.0,O,0.0,1.824
|
39 |
+
COCCOC,0.257,C(C(F)(F)F)OCC(F)(F)F,0.508,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.235,O,0.0,O,0.0,O,0.0,2.051
|
40 |
+
COCCOC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.047,[Li+].FP(F)(=O)([O-]),0.047,O,0.0,O,0.0,O,0.0,1.444
|
41 |
+
O1CCOC1,0.478,COCCOC,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.134,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.067,O,0.0,O,0.0,1.854
|
42 |
+
CCOCC,0.707,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.293,O,0.0,O,0.0,O,0.0,O,0.0,2.046
|
43 |
+
C1COC(=O)O1,0.563,O=C(OCC)OCC,0.31,C1C(OC(=O)O1)F,0.052,[Li+].F[P-](F)(F)(F)(F)F,0.075,O,0.0,O,0.0,1.301
|
44 |
+
C1CCOC1,0.942,FC,0.029,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,,O,0.0,O,0.0,O,0.0,2.222
|
45 |
+
O1CCOC1,0.478,COCCOC,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.2,O,0.0,O,0.0,O,0.0,1.903
|
46 |
+
COCCOC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0.0,O,0.0,O,0.0,O,0.0,1.561
|
47 |
+
C1C(OC(=O)O1)F,0.149,COC(=O)OCCF,0.178,C(C(F)(F)F)OC(C(F)F)(F)F,0.564,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.108,O,0.0,O,0.0,1.735
|
48 |
+
FC(F)COCCOCC(F)(F),0.845,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.155,O,0.0,O,0.0,O,0.0,O,0.0,2.301
|
49 |
+
C1C(OC(=O)O1)F,0.495,COC(=O)OC,0.429,O1CCOCCOCCOCC1,0.003,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.498
|
50 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.069,O,0.0,O,0.0,0.745
|
51 |
+
O=S1(=O)CCCC1,0.758,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.235,[Li+].[N+](=O)([O-])[O-],0.007,O,0.0,O,0.0,O,0.0,1.824
|
52 |
+
CCOCC,0.856,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.144,O,0.0,O,0.0,O,0.0,O,0.0,2.0
|
53 |
+
O=C(OCC)C,0.105,ClCCl,0.64,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.255,O,0.0,O,0.0,O,0.0,1.456
|
54 |
+
COCCOCCOCC(F)(F)OC(F)(F)OC(F)(F)COCCOCCOC,0.708,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.292,O,0.0,O,0.0,O,0.0,O,0.0,1.301
|
55 |
+
COCCOC,0.583,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.278,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.139,O,0.0,O,0.0,O,0.0,1.678
|
56 |
+
C1C(OC(=O)O1)F,0.662,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.338,O,0.0,O,0.0,O,0.0,O,0.0,1.646
|
57 |
+
O1CCOC1,0.397,COCCOC,0.589,[Li+][S-]SSS[S-][Li+],,[Li+].[N+](=O)([O-])[O-],0.012,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.002,O,0.0,1.301
|
58 |
+
C1COC(=O)O1,0.308,O=C(OCC)OCC(F)(F)F,0.349,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.343,O,0.0,O,0.0,O,0.0,2.046
|
59 |
+
C1COC(=O)O1,0.362,O=C(OCC)OCC,0.548,[Li+].F[P-](F)(F)(F)(F)F,0.09,O,0.0,O,0.0,O,0.0,0.788
|
60 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.001,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.373
|
61 |
+
O1CCOCC1,0.912,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.088,O,0.0,O,0.0,O,0.0,O,0.0,1.602
|
62 |
+
CC#N,0.621,C1=COC(=O)O1,0.056,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.323,O,0.0,O,0.0,O,0.0,1.854
|
63 |
+
COC(=O)OC,0.684,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.316,O,0.0,O,0.0,O,0.0,O,0.0,2.097
|
64 |
+
O=S1(=O)CCCC1,0.714,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.286,O,0.0,O,0.0,O,0.0,O,0.0,1.699
|
65 |
+
FC(F)(F)COCCOCC(F)(F)(F),0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0.0,O,0.0,O,0.0,O,0.0,2.155
|
66 |
+
CCOCC,0.64,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.36,O,0.0,O,0.0,O,0.0,O,0.0,2.208
|
67 |
+
COC(=O)OC,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.4,O,0.0,O,0.0,O,0.0,O,0.0,1.77
|
68 |
+
CC1COC(=O)O1,0.887,[Li+].F[As-](F)(F)(F)(F)F,0.113,O,0.0,O,0.0,O,0.0,O,0.0,0.824
|
69 |
+
C1COC(=O)O1,0.5,CCOC(=O)OC,0.423,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.046,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.031,O,0.0,O,0.0,0.924
|
70 |
+
CCOP(=O)(OCC)OCC,0.214,C(C(F)(F)F)OCC(F)(F)F,0.642,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.144,O,0.0,O,0.0,O,0.0,2.097
|
71 |
+
COCCOC,0.682,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.318,O,0.0,O,0.0,O,0.0,O,0.0,2.108
|
72 |
+
CC1COC(=O)O1,0.922,[LI+].F[B-](F)(F)OC(C(F)(F)(F))(C(F)(F)(F))C(F)(F)(F),0.078,O,0.0,O,0.0,O,0.0,O,0.0,0.712
|
73 |
+
C1COC(=O)O1,0.854,CCOC(=O)OC,0.08,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.039,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.026,O,0.0,O,0.0,1.081
|
74 |
+
C1COC(=O)O1,0.519,O=C(OCC)OCC,0.387,[Li+].F[P-](F)(F)(F)(F)F,0.082,[Li+].[O-]P(=O)(F)F,0.012,O,0.0,O,0.0,1.319
|
75 |
+
COC(=O)CC(F)(F)F,0.768,C1C(OC(=O)O1)F,0.134,[Li+].F[P-](F)(F)(F)(F)F,0.098,O,0.0,O,0.0,O,0.0,1.62
|
76 |
+
C1C(OC(=O)O1)F,0.144,COC(=O)OCCF,0.173,C(C(F)(F)F)OC(C(F)F)(F)F,0.548,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.135,O,0.0,O,0.0,2.222
|
77 |
+
C1COC(=O)O1,0.326,COC(=O)OC,0.602,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,O,0.0,0.777
|
78 |
+
CCOCC,0.877,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.123,O,0.0,O,0.0,O,0.0,O,0.0,2.018
|
79 |
+
COC(=O)OC,0.664,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.336,O,0.0,O,0.0,O,0.0,O,0.0,1.886
|
80 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].F[B-](F)(F)F,0.069,O,0.0,O,0.0,0.699
|
81 |
+
CCOP(=O)(OCC)OCC,0.648,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.352,O,0.0,O,0.0,O,0.0,O,0.0,1.569
|
82 |
+
C1C(OC(=O)O1)F,0.481,O=C(OCC)OCC,0.432,[Li+].F[P-](F)(F)(F)(F)F,0.087,O,0.0,O,0.0,O,0.0,1.523
|
83 |
+
COCCOC,0.231,FC(F)C(F)(F)COC(F)(F)C(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,2.155
|
84 |
+
C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.001,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.488
|
85 |
+
O1CCOC1,0.453,COCCOC,0.305,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.127,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.063,[Li+].[N+](=O)([O-])[O-],0.051,O,0.0,2.046
|
86 |
+
C1C(OC(=O)O1)F,0.932,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.068,O,0.0,O,0.0,O,0.0,O,0.0,1.41
|
87 |
+
COCCOC,0.139,COCC(F)(F)C(F)(F)C(F)(F)C(F)(F)COC,0.692,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.169,O,0.0,O,0.0,O,0.0,2.222
|
88 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.431,O1CCOCCOCCOCC1,0.0,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.559
|
89 |
+
COCCOC,0.231,FC(COC(OCC(F)(F)F)OCC(F)(F)F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,2.301
|
90 |
+
CN(C)S(=O)(=O)F,0.921,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.079,O,0.0,O,0.0,O,0.0,O,0.0,1.672
|
91 |
+
C1C(OC(=O)O1)F,0.105,C1COC(=O)O1,0.518,O=C(OCC)OCC,0.285,[Li+].F[P-](F)(F)(F)(F)F,0.077,[Rb+].[O-][N+]([O-])=O,0.008,O1CCOCCOCCOCCOCCOCC1,0.008,1.538
|
92 |
+
CC1CCC(C)O1,0.893,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.107,O,0.0,O,0.0,O,0.0,O,0.0,1.796
|
93 |
+
C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.355
|
94 |
+
C1COC(=O)O1,0.444,C1COS(=O)O1,0.497,[Li+].[O-]Cl(=O)(=O)=O,0.059,O,0.0,O,0.0,O,0.0,1.523
|
95 |
+
COCCOC,0.371,O1CCOC1,0.552,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.031,[Li+].[N+](=O)([O-])[O-],0.046,O,0.0,O,0.0,1.78
|
96 |
+
O=S1(=O)CCCC1,0.764,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.236,O,0.0,O,0.0,O,0.0,O,0.0,1.456
|
97 |
+
O1C(C)CCC1,0.908,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.092,O,0.0,O,0.0,O,0.0,O,0.0,1.745
|
98 |
+
O1CCOC1,0.362,C(C(F)(F)F)OCC(F)(F)F,0.59,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.048,O,0.0,O,0.0,O,0.0,1.967
|
99 |
+
COC(=O)OC,0.543,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.457,O,0.0,O,0.0,O,0.0,O,0.0,2.097
|
100 |
+
COCCOC,0.73,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.27,O,0.0,O,0.0,O,0.0,O,0.0,1.143
|
101 |
+
O1CCOC1,0.552,COCCOC,0.371,[Li+].[N+](=O)([O-])[O-],0.039,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.039,O,0.0,O,0.0,1.523
|
102 |
+
COCCOC,0.242,FC(COC(OCC(F)(F)F)OCC(F)(F)F)(F)F,0.604,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.154,O,0.0,O,0.0,O,0.0,2.301
|
103 |
+
CCOP(=O)(OCC)OCC,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.4,O,0.0,O,0.0,O,0.0,O,0.0,2.155
|
104 |
+
C1C(OC(=O)O1)F,0.318,CCOC(=O)OC,0.504,COC(=O)OC,0.094,[Li+].F[P-](F)(F)(F)(F)F,0.083,O,0.0,O,0.0,1.301
|
105 |
+
COCCOC,0.231,C(C(F)(F)F)OCC(F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,2.222
|
106 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].F[P-](F)(F)(F)(F)F,0.069,O,0.0,O,0.0,0.699
|
107 |
+
COCCOC,0.231,C(C(F)(F)F)OC(=O)OCC(F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,1.495
|
108 |
+
C1COC(=O)O1,0.32,COC(=O)OC,0.253,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.427,O,0.0,O,0.0,O,0.0,2.155
|
109 |
+
C1C(OC(=O)O1)F,0.312,O=C1OCCC1,0.599,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.068,[Li+].[N+](=O)([O-])[O-],0.021,O,0.0,O,0.0,1.921
|
110 |
+
COC(=O)OC,0.478,FC(F)C(F)(F)COC(F)(F)C(F)F,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.067,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.134,O,0.0,O,0.0,1.886
|
111 |
+
CCOP(=O)(OCC)OCC,0.259,FC(F)C(F)(F)COC(F)(F)C(F)F,0.556,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.185,O,0.0,O,0.0,O,0.0,2.046
|
112 |
+
COCCOC,0.677,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.323,O,0.0,O,0.0,O,0.0,O,0.0,1.745
|
113 |
+
C1C(OC(=O)O1)F,0.696,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.304,O,0.0,O,0.0,O,0.0,O,0.0,1.633
|
114 |
+
C1CCOC1,0.47,O1C(C)CCC1,0.378,[Li+].F[P-](F)(F)(F)(F)F,0.152,O,0.0,O,0.0,O,0.0,2.097
|
115 |
+
FC(F)COCCOCC(F)(F)(F),0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0.0,O,0.0,O,0.0,O,0.0,2.301
|
116 |
+
C1COC(=O)O1,0.496,COC(=O)OC,0.393,C1C(OC(=O)O1)F,0.045,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.066,O,0.0,O,0.0,1.108
|
117 |
+
C1C(OC(=O)O1)F,0.62,C(C(F)(F)F)OC(=O)OCC(F)(F)F,0.291,[Li+].F[P-](F)(F)(F)(F)F,0.089,O,0.0,O,0.0,O,0.0,1.62
|
118 |
+
CCOCC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0.0,O,0.0,O,0.0,O,0.0,1.959
|
119 |
+
C1COC(=O)O1,0.526,O=C(OCC)OCC,0.392,[Li+].F[P-](F)(F)(F)(F)F,0.083,O,0.0,O,0.0,O,0.0,1.013
|
120 |
+
C1COC(=O)O1,0.05,CCOC(=O)OC,0.237,C(C(F)(F)F)OCC(F)(F)F,0.575,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.123,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.015,O,0.0,1.824
|
121 |
+
O=S1(=O)CCCC1,0.429,FC(F)C(F)(F)COC(F)(F)C(F)F,0.429,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.143,O,0.0,O,0.0,O,0.0,1.921
|
data/lce/train_data.csv
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
smiles1,conc1,smiles2,conc2,smiles3,conc3,smiles4,conc4,smiles5,conc5,smiles6,conc6,LCE
|
2 |
+
CC1COC(=O)O1,0.875,C1C(OC(=O)O1)F,0.051,[Li+].[O-]Cl(=O)(=O)=O,0.074,O,0,O,0,O,0,0.699
|
3 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].F[P-](F)(F)(F)(F)F,0.069,O,0,O,0,0.699
|
4 |
+
FC(F)COCCOCC(F)(F),0.845,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.155,O,0,O,0,O,0,O,0,2.301
|
5 |
+
FC(F)COCCOCC(F)(F)(F),0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0,O,0,O,0,O,0,2.301
|
6 |
+
CN(C)C(=O)C(F)(F)F,0.362,C1C(OC(=O)O1)F,0.556,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.081,O,0,O,0,O,0,2.155
|
7 |
+
COCCOC,0.231,FC1CCCCC1,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,2.155
|
8 |
+
CCOP(=O)(OCC)OCC,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.4,O,0,O,0,O,0,O,0,2.155
|
9 |
+
O1CCOC1,0.362,C(C(F)(F)F)OCC(F)(F)F,0.59,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.048,O,0,O,0,O,0,1.967
|
10 |
+
COCC(F)(F)C(F)(F)COC,0.864,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.136,O,0,O,0,O,0,O,0,1.991
|
11 |
+
C1C(OC(=O)O1)F,0.662,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.338,O,0,O,0,O,0,O,0,1.646
|
12 |
+
COCCOC,0.358,O1CCOC1,0.532,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.074,[Li+].[N+](=O)([O-])[O-],0.035,O,0,O,0,1.658
|
13 |
+
CN(C)S(=O)(=O)F,0.921,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.079,O,0,O,0,O,0,O,0,1.672
|
14 |
+
C1C(OC(=O)O1)F,0.106,C1COC(=O)O1,0.522,O=C(OCC)OCC,0.287,[Li+].F[P-](F)(F)(F)(F)F,0.077,[Rb+].[O-][N+]([O-])=O,0.004,O1CCOCCOCCOCCOCCOCC1,0.004,1.252
|
15 |
+
C1COC(=O)O1,0.32,COC(=O)OC,0.253,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.427,O,0,O,0,O,0,2.155
|
16 |
+
COCCOC,0.277,FC(F)C(F)(F)COC(F)(F)C(F)F,0.555,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.168,O,0,O,0,O,0,2.155
|
17 |
+
COC(=O)OC,0.161,FC(F)C(F)(F)COC(F)(F)C(F)F,0.355,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.484,O,0,O,0,O,0,2.155
|
18 |
+
FC(F)(F)COCCOCC,0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0,O,0,O,0,O,0,2.155
|
19 |
+
FC(F)(F)COCCOCC(F)(F)(F),0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0,O,0,O,0,O,0,2.155
|
20 |
+
CCOCC,0.64,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.36,O,0,O,0,O,0,O,0,2.208
|
21 |
+
C1C(OC(=O)O1)F,0.144,COC(=O)OCCF,0.173,C(C(F)(F)F)OC(C(F)F)(F)F,0.548,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.135,O,0,O,0,2.222
|
22 |
+
CC#N,0.882,FC,0.065,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.054,O,0,O,0,O,0,2.222
|
23 |
+
C1CCOC1,0.942,FC,0.029,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.029,O,0,O,0,O,0,2.222
|
24 |
+
COCCOC,0.139,COCC(F)(F)C(F)(F)C(F)(F)C(F)(F)COC,0.692,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.169,O,0,O,0,O,0,2.222
|
25 |
+
COCCOC,0.231,C(C(F)(F)F)OCC(F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,2.222
|
26 |
+
COCCOC,0.507,COC(C(F)(F)F)C(F)(F)F,0.399,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.095,O,0,O,0,O,0,2.268
|
27 |
+
CCOCC,0.313,C(C(F)(F)F)OCC(F)(F)F,0.51,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.177,O,0,O,0,O,0,2.301
|
28 |
+
COC(=O)OC,0.29,C(C(F)(F)F)OCC(F)(F)F,0.589,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0,O,0,O,0,2.301
|
29 |
+
COCCOC,0.242,FC(COC(OCC(F)(F)F)OCC(F)(F)F)(F)F,0.604,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.154,O,0,O,0,O,0,2.301
|
30 |
+
O1C(C)CCC1,0.331,FC(F)C(F)(F)COC(F)(F)C(F)F,0.498,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.171,O,0,O,0,O,0,2.301
|
31 |
+
COCCOC,0.2,FC(F)C(F)(F)COC(F)(F)C(F)F,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.2,O,0,O,0,O,0,2.301
|
32 |
+
COCCOC,0.231,FC(COC(OCC(F)(F)F)OCC(F)(F)F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,2.301
|
33 |
+
O=S1(=O)CCCC1,0.359,C(C(F)(F)F)OC(C(F)F)(F)F,0.504,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.133,[Li+].[N+](=O)([O-])[O-],0.004,O,0,O,0,2
|
34 |
+
CCOCC,0.856,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.144,O,0,O,0,O,0,O,0,2
|
35 |
+
CCOCC,0.877,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.123,O,0,O,0,O,0,O,0,2.018
|
36 |
+
CCOCC,0.707,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.293,O,0,O,0,O,0,O,0,2.046
|
37 |
+
C1COC(=O)O1,0.308,O=C(OCC)OCC(F)(F)F,0.349,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.343,O,0,O,0,O,0,2.046
|
38 |
+
O1CCOC1,0.453,COCCOC,0.305,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.127,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.063,[Li+].[N+](=O)([O-])[O-],0.051,O,0,2.046
|
39 |
+
CCOP(=O)(OCC)OCC,0.259,FC(F)C(F)(F)COC(F)(F)C(F)F,0.556,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.185,O,0,O,0,O,0,2.046
|
40 |
+
COCCOC,0.257,C(C(F)(F)F)OCC(F)(F)F,0.508,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.235,O,0,O,0,O,0,2.051
|
41 |
+
COC(=O)OC,0.299,C(C(F)(F)F)OCC(F)(F)F,0.598,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.103,O,0,O,0,O,0,2.056
|
42 |
+
CCOP(=O)(OCC)OCC,0.214,C(C(F)(F)F)OCC(F)(F)F,0.642,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.144,O,0,O,0,O,0,2.097
|
43 |
+
COC(=O)OC,0.684,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.316,O,0,O,0,O,0,O,0,2.097
|
44 |
+
C1CCOC1,0.47,O1C(C)CCC1,0.378,[Li+].F[P-](F)(F)(F)(F)F,0.152,O,0,O,0,O,0,2.097
|
45 |
+
C1C(OC(=O)O1)F,0.264,COC(=O)OCCF,0.479,C(C(F)(F)F)OC(C(F)F)(F)F,0.155,[Li+].F[P-](F)(F)(F)(F)F,0.103,O,0,O,0,2.097
|
46 |
+
CCOP(=O)(OCC)OCC,0.5,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.5,O,0,O,0,O,0,O,0,2.097
|
47 |
+
COC(=O)OC,0.543,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.457,O,0,O,0,O,0,O,0,2.097
|
48 |
+
COCCOC,0.682,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.318,O,0,O,0,O,0,O,0,2.108
|
49 |
+
COCCOC,0.231,FC(F)C(F)(F)COC(F)(F)C(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,2.155
|
50 |
+
CCOP(=O)(OCC)OCC,0.728,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.272,O,0,O,0,O,0,O,0,2
|
51 |
+
COCCOC,0.583,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.278,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.139,O,0,O,0,O,0,1.678
|
52 |
+
C1COC(=O)O1,0.305,COC(=O)OC,0.242,COCCOCCOCCOCCOC,0.392,[Li+].F[P-](F)(F)(F)(F)F,0.041,[Li+].[N+](=O)([O-])[O-],0.02,O,0,1.678
|
53 |
+
C1C(OC(=O)O1)F,0.318,CCOC(=O)OC,0.504,COC(=O)OC,0.094,B(O[Si](C)(C)C)(O[Si](C)(C)C)O[Si](C)(C),0.083,[Li+].F[P-](F)(F)(F)(F)F,0.001,O,0,1.678
|
54 |
+
O=S1(=O)CCCC1,0.714,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.286,O,0,O,0,O,0,O,0,1.699
|
55 |
+
C1C(OC(=O)O1)F,0.149,COC(=O)OCCF,0.178,C(C(F)(F)F)OC(C(F)F)(F)F,0.564,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.108,O,0,O,0,1.735
|
56 |
+
O=S1(=O)CCCC1,0.75,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.25,O,0,O,0,O,0,O,0,1.745
|
57 |
+
COC(=O)OC,0.29,C(C(F)(F)F)OCC(F)(F)F,0.589,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0,O,0,O,0,1.745
|
58 |
+
COCCOC,0.677,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.323,O,0,O,0,O,0,O,0,1.745
|
59 |
+
O1C(C)CCC1,0.908,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.092,O,0,O,0,O,0,O,0,1.745
|
60 |
+
COC(=O)OC,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.4,O,0,O,0,O,0,O,0,1.77
|
61 |
+
COCCOC,0.371,O1CCOC1,0.552,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.031,[Li+].[N+](=O)([O-])[O-],0.046,O,0,O,0,1.78
|
62 |
+
CC1CCC(C)O1,0.893,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.107,O,0,O,0,O,0,O,0,1.796
|
63 |
+
C1COC(=O)O1,0.05,CCOC(=O)OC,0.237,C(C(F)(F)F)OCC(F)(F)F,0.575,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.123,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.015,O,0,1.824
|
64 |
+
O=S1(=O)CCCC1,0.758,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.235,[Li+].[N+](=O)([O-])[O-],0.007,O,0,O,0,O,0,1.824
|
65 |
+
O1CCOC1,0.463,COCCOC,0.312,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.194,[Li+].[N+](=O)([O-])[O-],0.03,O,0,O,0,1.824
|
66 |
+
O1CCOC1,0.539,COCCOC,0.363,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.075,[Li+].[N+](=O)([O-])[O-],0.023,O,0,O,0,1.824
|
67 |
+
COC(=O)OC,0.375,FC(F)C(F)(F)COC(F)(F)C(F)F,0.375,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.25,O,0,O,0,O,0,1.854
|
68 |
+
O1CCOC1,0.478,COCCOC,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.134,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.067,O,0,O,0,1.854
|
69 |
+
CC#N,0.621,C1=COC(=O)O1,0.056,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.323,O,0,O,0,O,0,1.854
|
70 |
+
COC(=O)OC,0.478,FC(F)C(F)(F)COC(F)(F)C(F)F,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.067,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.134,O,0,O,0,1.886
|
71 |
+
COC(=O)OC,0.664,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.336,O,0,O,0,O,0,O,0,1.886
|
72 |
+
O1CCOC1,0.478,COCCOC,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.2,O,0,O,0,O,0,1.903
|
73 |
+
O=S1(=O)CCCC1,0.429,FC(F)C(F)(F)COC(F)(F)C(F)F,0.429,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.143,O,0,O,0,O,0,1.921
|
74 |
+
C1C(OC(=O)O1)F,0.312,O=C1OCCC1,0.599,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.068,[Li+].[N+](=O)([O-])[O-],0.021,O,0,O,0,1.921
|
75 |
+
CC1COC(=O)O1,0.595,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.405,O,0,O,0,O,0,O,0,1.921
|
76 |
+
O1CCOC1,0.371,COCCOC,0.552,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.077,O,0,O,0,O,0,1.959
|
77 |
+
CCOCC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0,O,0,O,0,O,0,1.959
|
78 |
+
C1CCOC1,0.925,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.075,O,0,O,0,O,0,O,0,1.377
|
79 |
+
C1COC(=O)O1,0.425,O=C(OCC)OCC,0.234,[Li+].F[P-](F)(F)(F)(F)F,0.34,O,0,O,0,O,0,1.398
|
80 |
+
C1C(OC(=O)O1)F,0.932,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.068,O,0,O,0,O,0,O,0,1.41
|
81 |
+
COCCOC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.047,[Li+].FP(F)(=O)([O-]),0.047,O,0,O,0,O,0,1.444
|
82 |
+
O=S1(=O)CCCC1,0.764,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.236,O,0,O,0,O,0,O,0,1.456
|
83 |
+
O=C(OCC)C,0.105,ClCCl,0.64,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.255,O,0,O,0,O,0,1.456
|
84 |
+
C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.001,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.488
|
85 |
+
C1C(OC(=O)O1)F,0.873,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.127,O,0,O,0,O,0,O,0,1.489
|
86 |
+
COCCOC,0.231,C(C(F)(F)F)OC(=O)OCC(F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,1.495
|
87 |
+
C1C(OC(=O)O1)F,0.495,COC(=O)OC,0.429,O1CCOCCOCCOCC1,0.003,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.498
|
88 |
+
C1C(OC(=O)O1)F,0.481,O=C(OCC)OCC,0.432,[Li+].F[P-](F)(F)(F)(F)F,0.087,O,0,O,0,O,0,1.523
|
89 |
+
O1CCOC1,0.322,COCCOC,0.478,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.2,O,0,O,0,O,0,1.523
|
90 |
+
O1CCOC1,0.552,COCCOC,0.371,[Li+].[N+](=O)([O-])[O-],0.039,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.039,O,0,O,0,1.523
|
91 |
+
C1COC(=O)O1,0.444,C1COS(=O)O1,0.497,[Li+].[O-]Cl(=O)(=O)=O,0.059,O,0,O,0,O,0,1.523
|
92 |
+
C1C(OC(=O)O1)F,0.105,C1COC(=O)O1,0.518,O=C(OCC)OCC,0.285,[Li+].F[P-](F)(F)(F)(F)F,0.077,[Rb+].[O-][N+]([O-])=O,0.008,O1CCOCCOCCOCCOCCOCC1,0.008,1.538
|
93 |
+
C1C(OC(=O)O1)F,0.82,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.18,O,0,O,0,O,0,O,0,1.544
|
94 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.431,O1CCOCCOCCOCC1,0,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.559
|
95 |
+
COCCOC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0,O,0,O,0,O,0,1.561
|
96 |
+
CCOP(=O)(OCC)OCC,0.648,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.352,O,0,O,0,O,0,O,0,1.569
|
97 |
+
O=S1(=O)CCCC1,0.25,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.75,O,0,O,0,O,0,O,0,1.569
|
98 |
+
C1C(OC(=O)O1)F,0.774,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.226,O,0,O,0,O,0,O,0,1.587
|
99 |
+
C1C(OC(=O)O1)F,0.413,O=C(OCC)OCC,0.497,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.09,O,0,O,0,O,0,1.59
|
100 |
+
C1COC(=O)O1,0.425,O=C(OCC)OCC(F)(F)F,0.481,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0,O,0,O,0,1.602
|
101 |
+
CC1COC(=O)O1,0.702,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.298,O,0,O,0,O,0,O,0,1.602
|
102 |
+
O1CCOCC1,0.912,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.088,O,0,O,0,O,0,O,0,1.602
|
103 |
+
C1C(OC(=O)O1)F,0.62,C(C(F)(F)F)OC(=O)OCC(F)(F)F,0.291,[Li+].F[P-](F)(F)(F)(F)F,0.089,O,0,O,0,O,0,1.62
|
104 |
+
COC(=O)CC(F)(F)F,0.768,C1C(OC(=O)O1)F,0.134,[Li+].F[P-](F)(F)(F)(F)F,0.098,O,0,O,0,O,0,1.62
|
105 |
+
C1C(OC(=O)O1)F,0.733,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.267,O,0,O,0,O,0,O,0,1.629
|
106 |
+
C1C(OC(=O)O1)F,0.696,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.304,O,0,O,0,O,0,O,0,1.633
|
107 |
+
COC(C)C(C)OC,0.879,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0,O,0,O,0,O,0,1.638
|
108 |
+
C1COC(=O)O1,0.197,COC(=O)OC,0.156,COCCOCCOCCOCCOC,0.59,[Li+].F[P-](F)(F)(F)(F)F,0.026,[Li+].[N+](=O)([O-])[O-],0.031,O,0,1.638
|
109 |
+
C1COC(=O)O1,0.338,COC(=O)OC,0.625,[Li+].[O-]P(=O)(F)F,0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.03,O,0,O,0,1.26
|
110 |
+
COCCOC,0.707,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.147,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.147,O,0,O,0,O,0,1.268
|
111 |
+
C1COC(=O)O1,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.276
|
112 |
+
COCCOC,0.763,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.174,[Li+].[O-]P(=O)(F)F,0.063,O,0,O,0,O,0,1.292
|
113 |
+
C1COC(=O)O1,0.563,O=C(OCC)OCC,0.31,C1C(OC(=O)O1)F,0.052,[Li+].F[P-](F)(F)(F)(F)F,0.075,O,0,O,0,1.301
|
114 |
+
COCCOCCOCC(F)(F)OC(F)(F)OC(F)(F)COCCOCCOC,0.708,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.292,O,0,O,0,O,0,O,0,1.301
|
115 |
+
C1COC(=O)O1,0.331,O=C(OCC)OCC,0.577,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.092,O,0,O,0,O,0,1.301
|
116 |
+
C1C(OC(=O)O1)F,0.318,CCOC(=O)OC,0.504,COC(=O)OC,0.094,[Li+].F[P-](F)(F)(F)(F)F,0.083,O,0,O,0,1.301
|
117 |
+
C1COC(=O)O1,0.519,O=C(OCC)OCC,0.387,[Li+].F[P-](F)(F)(F)(F)F,0.082,[Li+].[O-]P(=O)(F)F,0.012,O,0,O,0,1.319
|
118 |
+
C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.333
|
119 |
+
COCCOC,0.259,B(OCC(F)(F)F)(OCC(F)(F)F)OCC(F)(F)F,0.556,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.185,O,0,O,0,O,0,1.337
|
120 |
+
C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.355
|
121 |
+
COCCOC,0.763,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.237,O,0,O,0,O,0,O,0,1.367
|
122 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.001,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.373
|
123 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].F[B-](F)(F)F,0.069,O,0,O,0,0.699
|
124 |
+
CC1COC(=O)O1,0.922,[Li+].F[B-](F)(F)OC(C(F)(F)(F))(C(F)(F)(F))C(F)(F)(F),0.078,O,0,O,0,O,0,O,0,0.712
|
125 |
+
C1COC(=O)O1,0.3,CCOC(=O)OC,0.593,C1=COC(=O)O1,0.026,[Li+].F[P-](F)(F)(F)(F)F,0.081,O,0,O,0,0.745
|
126 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.069,O,0,O,0,0.745
|
127 |
+
C1COC(=O)O1,0.326,COC(=O)OC,0.602,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,O,0,0.777
|
128 |
+
C1COC(=O)O1,0.362,O=C(OCC)OCC,0.548,[Li+].F[P-](F)(F)(F)(F)F,0.09,O,0,O,0,O,0,0.788
|
129 |
+
CC1COC(=O)O1,0.887,[Li+].F[As-](F)(F)(F)(F)F,0.113,O,0,O,0,O,0,O,0,0.824
|
130 |
+
C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].C(C(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(C(F)(F)F)(F)F)(F)(F)F,0.069,O,0,O,0,0.854
|
131 |
+
C1COC(=O)O1,0.359,COC(=O)OC,0.569,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,O,0,0.854
|
132 |
+
C1COC(=O)O1,0.331,O=C(OCC)OCC,0.577,[Li+].F[P-](F)(F)(F)(F)F,0.092,O,0,O,0,O,0,0.886
|
133 |
+
C1COC(=O)O1,0.594,O=C(OCC)OCC,0.327,[Li+].F[P-](F)(F)(F)(F)F,0.079,O,0,O,0,O,0,0.921
|
134 |
+
C1COC(=O)O1,0.5,CCOC(=O)OC,0.423,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.046,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.031,O,0,O,0,0.924
|
135 |
+
C1COC(=O)O1,0.526,O=C(OCC)OCC,0.392,[Li+].F[P-](F)(F)(F)(F)F,0.083,O,0,O,0,O,0,1.013
|
136 |
+
C1COC(=O)O1,0.356,COC(=O)OC,0.566,FC(F)(F)COB(OCC(F)(F)F)OCC(F)(F)F,0.007,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.046
|
137 |
+
C1COC(=O)O1,0.682,CCOC(=O)OC,0.247,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.043,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.028,O,0,O,0,1.076
|
138 |
+
C1COC(=O)O1,0.854,CCOC(=O)OC,0.08,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.039,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.026,O,0,O,0,1.081
|
139 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.431,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,O,0,1.085
|
140 |
+
C1C(OC(=O)O1)F,0.107,C1COC(=O)O1,0.526,O=C(OCC)OCC,0.289,[Li+].F[P-](F)(F)(F)(F)F,0.078,O,0,O,0,1.108
|
141 |
+
C1COC(=O)O1,0.496,COC(=O)OC,0.393,C1C(OC(=O)O1)F,0.045,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.066,O,0,O,0,1.108
|
142 |
+
COCCOC,0.73,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.27,O,0,O,0,O,0,O,0,1.143
|
143 |
+
C1COC(=O)O1,0.327,O=C(OCC)OCC,0.594,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.079,O,0,O,0,O,0,1.155
|
144 |
+
C1COC(=O)O1,0.338,COC(=O)OC,0.625,[Li+].[O-]P(=O)(F)F,0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.03,O,0,O,0,1.194
|
145 |
+
COCCOC,0.731,[Li+].[O-]P(=O)(F)F,0.064,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.205,O,0,O,0,O,0,1.215
|
146 |
+
COCCOCCOCCOCCOC,0.819,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.181,O,0,O,0,O,0,O,0,1.222
|
147 |
+
C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.225
|
148 |
+
COCCOC,0.706,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.008,[Li+].[O-]P(=O)(F)F,0.286,O,0,O,0,O,0,1.244
|
models/.gitattributes
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
*.csv filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
models/fm4m.py
CHANGED
@@ -25,9 +25,17 @@ from sklearn.preprocessing import MinMaxScaler
|
|
25 |
import torch
|
26 |
from transformers import AutoTokenizer, AutoModel
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
datasets = {}
|
33 |
models = {}
|
@@ -48,7 +56,7 @@ def avail_models_data():
|
|
48 |
|
49 |
|
50 |
models = [{"Name": "bart","Model Name": "SELFIES-TED","Description": "BART model for string based SELFIES modality", "Timestamp": "2024-06-21 12:32:20"},
|
51 |
-
{"Name": "mol-xl","Model Name": "
|
52 |
{"Name": "mhg", "Model Name": "MHG-GED","Description": "Molecular hypergraph model", "Timestamp": "2024-07-10 00:09:42"},
|
53 |
{"Name": "smi-ted", "Model Name": "SMI-TED","Description": "SMILES based encoder decoder model", "Timestamp": "2024-07-10 00:09:42"}]
|
54 |
|
@@ -58,8 +66,10 @@ def avail_models(raw=False):
|
|
58 |
|
59 |
models = [{"Name": "smi-ted", "Model Name": "SMI-TED","Description": "SMILES based encoder decoder model"},
|
60 |
{"Name": "bart","Model Name": "SELFIES-TED","Description": "BART model for string based SELFIES modality"},
|
61 |
-
{"Name": "mol-xl","Model Name": "
|
62 |
{"Name": "mhg", "Model Name": "MHG-GED","Description": "Molecular hypergraph model"},
|
|
|
|
|
63 |
]
|
64 |
|
65 |
|
@@ -70,12 +80,22 @@ def avail_models(raw=False):
|
|
70 |
|
71 |
return models
|
72 |
|
73 |
-
def avail_downstream_models():
|
74 |
global downstream_models
|
75 |
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
def avail_datasets():
|
81 |
global datasets
|
@@ -178,13 +198,15 @@ def update_downstream_model_list(list_model):
|
|
178 |
|
179 |
avail_models_data()
|
180 |
|
|
|
|
|
181 |
def get_representation(train_data,test_data,model_type, return_tensor=True):
|
182 |
alias = {"MHG-GED": "mhg", "SELFIES-TED": "bart", "MolFormer": "mol-xl", "Molformer": "mol-xl", "SMI-TED": "smi-ted"}
|
183 |
if model_type in alias.keys():
|
184 |
model_type = alias[model_type]
|
185 |
|
186 |
if model_type == "mhg":
|
187 |
-
model = mhg.load("models/mhg_model/pickles/mhggnn_pretrained_model_0724_2023.pickle")
|
188 |
with torch.no_grad():
|
189 |
train_emb = model.encode(train_data)
|
190 |
x_batch = torch.stack(train_emb)
|
@@ -196,7 +218,6 @@ def get_representation(train_data,test_data,model_type, return_tensor=True):
|
|
196 |
x_batch_test = pd.DataFrame(x_batch_test)
|
197 |
|
198 |
|
199 |
-
|
200 |
elif model_type == "bart":
|
201 |
model = bart()
|
202 |
model.load()
|
@@ -204,7 +225,7 @@ def get_representation(train_data,test_data,model_type, return_tensor=True):
|
|
204 |
x_batch_test = model.encode(test_data, return_tensor=return_tensor)
|
205 |
|
206 |
elif model_type == "smi-ted":
|
207 |
-
model = load_smi_ted(folder='
|
208 |
with torch.no_grad():
|
209 |
x_batch = model.encode(train_data, return_torch=return_tensor)
|
210 |
x_batch_test = model.encode(test_data, return_torch=return_tensor)
|
@@ -237,35 +258,78 @@ def get_representation(train_data,test_data,model_type, return_tensor=True):
|
|
237 |
if not return_tensor:
|
238 |
x_batch = pd.DataFrame(x_batch)
|
239 |
x_batch_test = pd.DataFrame(x_batch_test)
|
240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
|
242 |
return x_batch, x_batch_test
|
243 |
|
244 |
-
def single_modal(model,dataset, downstream_model,params):
|
245 |
print(model)
|
246 |
-
alias = {"MHG-GED":"mhg", "SELFIES-TED": "bart", "MolFormer":"mol-xl", "SMI-TED": "smi-ted"}
|
247 |
data = avail_models(raw=True)
|
248 |
df = pd.DataFrame(data)
|
249 |
-
print(list(df["Name"].values))
|
250 |
-
|
251 |
-
|
|
|
|
|
252 |
model_type = alias[model]
|
253 |
-
else:
|
254 |
-
model_type = model
|
255 |
else:
|
256 |
print("Model not available")
|
257 |
return
|
|
|
258 |
|
259 |
data = avail_datasets()
|
260 |
df = pd.DataFrame(data)
|
261 |
-
print(list(df["Dataset"].values))
|
262 |
|
263 |
if dataset in list(df["Dataset"].values):
|
264 |
task = dataset
|
265 |
-
with open(f"
|
266 |
x_batch, y_batch, x_batch_test, y_batch_test = pickle.load(f1)
|
267 |
print(f" Representation loaded successfully")
|
268 |
-
|
|
|
269 |
|
270 |
print("Custom Dataset")
|
271 |
#return
|
@@ -283,14 +347,40 @@ def single_modal(model,dataset, downstream_model,params):
|
|
283 |
|
284 |
print(f" Representation loaded successfully")
|
285 |
|
|
|
286 |
|
287 |
-
|
288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
289 |
|
290 |
print(f" Calculating ROC AUC Score ...")
|
291 |
|
292 |
if downstream_model == "XGBClassifier":
|
293 |
-
|
|
|
|
|
|
|
294 |
xgb_predict_concat.fit(x_batch, y_batch)
|
295 |
|
296 |
y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
|
@@ -300,21 +390,26 @@ def single_modal(model,dataset, downstream_model,params):
|
|
300 |
print(f"ROC-AUC Score: {roc_auc:.4f}")
|
301 |
|
302 |
try:
|
303 |
-
with open(f"
|
304 |
class_0,class_1 = pickle.load(f1)
|
305 |
except:
|
306 |
print("Generating latent plots")
|
307 |
reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
|
308 |
verbose=False)
|
309 |
n_samples = np.minimum(1000, len(x_batch))
|
310 |
-
|
311 |
try:x = y_batch.values[:n_samples]
|
312 |
except: x = y_batch[:n_samples]
|
313 |
index_0 = [index for index in range(len(x)) if x[index] == 0]
|
314 |
index_1 = [index for index in range(len(x)) if x[index] == 1]
|
315 |
|
316 |
-
|
317 |
-
|
|
|
|
|
|
|
|
|
|
|
318 |
print("Generating latent plots : Done")
|
319 |
|
320 |
#vizualize(roc_auc,fpr, tpr, x_batch, y_batch )
|
@@ -334,20 +429,29 @@ def single_modal(model,dataset, downstream_model,params):
|
|
334 |
print(f"ROC-AUC Score: {roc_auc:.4f}")
|
335 |
|
336 |
try:
|
337 |
-
with open(f"
|
338 |
class_0,class_1 = pickle.load(f1)
|
339 |
except:
|
340 |
print("Generating latent plots")
|
341 |
reducer = umap.UMAP(metric='euclidean', n_neighbors= 10, n_components=2, low_memory=True, min_dist=0.1, verbose=False)
|
342 |
n_samples = np.minimum(1000,len(x_batch))
|
343 |
-
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
344 |
-
try:x = y_batch.values[:n_samples]
|
345 |
-
except:x = y_batch[:n_samples]
|
346 |
-
index_0 = [index for index in range(len(x)) if x[index] == 0]
|
347 |
-
index_1 = [index for index in range(len(x)) if x[index] == 1]
|
348 |
|
349 |
-
|
350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
351 |
print("Generating latent plots : Done")
|
352 |
|
353 |
#vizualize(roc_auc,fpr, tpr, x_batch, y_batch )
|
@@ -355,16 +459,19 @@ def single_modal(model,dataset, downstream_model,params):
|
|
355 |
result = f"ROC-AUC Score: {roc_auc:.4f}"
|
356 |
|
357 |
return result, roc_auc,fpr, tpr, class_0, class_1
|
358 |
-
|
359 |
elif downstream_model == "SVR":
|
360 |
-
|
|
|
|
|
|
|
361 |
model = TransformedTargetRegressor(regressor= regressor,
|
362 |
transformer = MinMaxScaler(feature_range=(-1, 1))
|
363 |
).fit(x_batch,y_batch)
|
364 |
-
|
365 |
y_prob = model.predict(x_batch_test)
|
366 |
RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
|
367 |
-
|
368 |
print(f"RMSE Score: {RMSE_score:.4f}")
|
369 |
result = f"RMSE Score: {RMSE_score:.4f}"
|
370 |
|
@@ -372,20 +479,28 @@ def single_modal(model,dataset, downstream_model,params):
|
|
372 |
reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
|
373 |
verbose=False)
|
374 |
n_samples = np.minimum(1000, len(x_batch))
|
375 |
-
|
376 |
-
try:x = y_batch.values[:n_samples]
|
377 |
-
except:x = y_batch[:n_samples]
|
378 |
#index_0 = [index for index in range(len(x)) if x[index] == 0]
|
379 |
#index_1 = [index for index in range(len(x)) if x[index] == 1]
|
380 |
|
381 |
-
|
382 |
-
|
|
|
|
|
|
|
|
|
|
|
383 |
print("Generating latent plots : Done")
|
384 |
-
|
385 |
return result, RMSE_score,y_batch_test, y_prob, class_0, class_1
|
386 |
|
387 |
elif downstream_model == "Kernel Ridge":
|
388 |
-
|
|
|
|
|
|
|
389 |
model = TransformedTargetRegressor(regressor=regressor,
|
390 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
391 |
).fit(x_batch, y_batch)
|
@@ -401,8 +516,8 @@ def single_modal(model,dataset, downstream_model,params):
|
|
401 |
verbose=False)
|
402 |
n_samples = np.minimum(1000, len(x_batch))
|
403 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
404 |
-
try:x = y_batch.values[:n_samples]
|
405 |
-
except:x = y_batch[:n_samples]
|
406 |
# index_0 = [index for index in range(len(x)) if x[index] == 0]
|
407 |
# index_1 = [index for index in range(len(x)) if x[index] == 1]
|
408 |
|
@@ -414,7 +529,10 @@ def single_modal(model,dataset, downstream_model,params):
|
|
414 |
|
415 |
|
416 |
elif downstream_model == "Linear Regression":
|
417 |
-
|
|
|
|
|
|
|
418 |
model = TransformedTargetRegressor(regressor=regressor,
|
419 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
420 |
).fit(x_batch, y_batch)
|
@@ -431,7 +549,7 @@ def single_modal(model,dataset, downstream_model,params):
|
|
431 |
n_samples = np.minimum(1000, len(x_batch))
|
432 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
433 |
try:x = y_batch.values[:n_samples]
|
434 |
-
except:x = y_batch[:n_samples]
|
435 |
# index_0 = [index for index in range(len(x)) if x[index] == 0]
|
436 |
# index_1 = [index for index in range(len(x)) if x[index] == 1]
|
437 |
|
@@ -460,7 +578,7 @@ def single_modal(model,dataset, downstream_model,params):
|
|
460 |
n_samples = np.minimum(1000, len(x_batch))
|
461 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
462 |
try:x = y_batch.values[:n_samples]
|
463 |
-
except:x = y_batch[:n_samples]
|
464 |
# index_0 = [index for index in range(len(x)) if x[index] == 0]
|
465 |
# index_1 = [index for index in range(len(x)) if x[index] == 1]
|
466 |
|
@@ -469,10 +587,10 @@ def single_modal(model,dataset, downstream_model,params):
|
|
469 |
print("Generating latent plots : Done")
|
470 |
|
471 |
return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
|
|
|
472 |
|
473 |
-
|
474 |
-
|
475 |
-
print(model_list)
|
476 |
data = avail_datasets()
|
477 |
df = pd.DataFrame(data)
|
478 |
list(df["Dataset"].values)
|
@@ -480,7 +598,7 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
480 |
if dataset in list(df["Dataset"].values):
|
481 |
task = dataset
|
482 |
predefined = True
|
483 |
-
|
484 |
predefined = False
|
485 |
components = dataset.split(",")
|
486 |
train_data = pd.read_csv(components[0])[components[2]]
|
@@ -490,13 +608,18 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
490 |
y_batch_test = pd.read_csv(components[1])[components[3]]
|
491 |
|
492 |
print("Custom Dataset loaded")
|
493 |
-
|
|
|
|
|
|
|
|
|
|
|
494 |
|
495 |
data = avail_models(raw=True)
|
496 |
df = pd.DataFrame(data)
|
497 |
list(df["Name"].values)
|
498 |
|
499 |
-
alias = {"MHG-GED":"mhg", "SELFIES-TED": "bart", "MolFormer":"mol-xl", "SMI-TED":"smi-ted"}
|
500 |
#if set(model_list).issubset(list(df["Name"].values)):
|
501 |
if set(model_list).issubset(list(alias.keys())):
|
502 |
for i, model in enumerate(model_list):
|
@@ -507,7 +630,7 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
507 |
|
508 |
if i == 0:
|
509 |
if predefined:
|
510 |
-
with open(f"
|
511 |
x_batch, y_batch, x_batch_test, y_batch_test = pickle.load(f1)
|
512 |
print(f" Loaded representation/{task}_{model_type}.pkl")
|
513 |
else:
|
@@ -517,7 +640,7 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
517 |
|
518 |
else:
|
519 |
if predefined:
|
520 |
-
with open(f"
|
521 |
x_batch_1, y_batch_1, x_batch_test_1, y_batch_test_1 = pickle.load(f1)
|
522 |
print(f" Loaded representation/{task}_{model_type}.pkl")
|
523 |
else:
|
@@ -528,7 +651,6 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
528 |
x_batch = pd.concat([x_batch, x_batch_1], axis=1)
|
529 |
x_batch_test = pd.concat([x_batch_test, x_batch_test_1], axis=1)
|
530 |
|
531 |
-
|
532 |
else:
|
533 |
print("Model not available")
|
534 |
return
|
@@ -538,11 +660,31 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
538 |
|
539 |
num_columns = x_batch.shape[1]
|
540 |
x_batch.columns = [f'{i + 1}' for i in range(num_columns)]
|
541 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
542 |
|
543 |
print(f"Representations loaded successfully")
|
544 |
try:
|
545 |
-
with open(f"
|
546 |
class_0, class_1 = pickle.load(f1)
|
547 |
except:
|
548 |
print("Generating latent plots")
|
@@ -552,8 +694,8 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
552 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
553 |
|
554 |
if "Classifier" in downstream_model:
|
555 |
-
try:x = y_batch.values[:n_samples]
|
556 |
-
except:x = y_batch[:n_samples]
|
557 |
index_0 = [index for index in range(len(x)) if x[index] == 0]
|
558 |
index_1 = [index for index in range(len(x)) if x[index] == 1]
|
559 |
|
@@ -570,7 +712,10 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
570 |
|
571 |
|
572 |
if downstream_model == "XGBClassifier":
|
573 |
-
|
|
|
|
|
|
|
574 |
xgb_predict_concat.fit(x_batch, y_batch)
|
575 |
|
576 |
y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
|
@@ -608,21 +753,27 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
608 |
return result, roc_auc,fpr, tpr, class_0, class_1
|
609 |
|
610 |
elif downstream_model == "SVR":
|
611 |
-
|
|
|
|
|
|
|
612 |
model = TransformedTargetRegressor(regressor= regressor,
|
613 |
transformer = MinMaxScaler(feature_range=(-1, 1))
|
614 |
).fit(x_batch,y_batch)
|
615 |
-
|
616 |
y_prob = model.predict(x_batch_test)
|
617 |
RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
|
618 |
-
|
619 |
print(f"RMSE Score: {RMSE_score:.4f}")
|
620 |
result = f"RMSE Score: {RMSE_score:.4f}"
|
621 |
-
|
622 |
return result, RMSE_score,y_batch_test, y_prob, class_0, class_1
|
623 |
|
624 |
elif downstream_model == "Linear Regression":
|
625 |
-
|
|
|
|
|
|
|
626 |
model = TransformedTargetRegressor(regressor=regressor,
|
627 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
628 |
).fit(x_batch, y_batch)
|
@@ -636,7 +787,10 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
636 |
return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
|
637 |
|
638 |
elif downstream_model == "Kernel Ridge":
|
639 |
-
|
|
|
|
|
|
|
640 |
model = TransformedTargetRegressor(regressor=regressor,
|
641 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
642 |
).fit(x_batch, y_batch)
|
@@ -665,6 +819,144 @@ def multi_modal(model_list,dataset, downstream_model,params):
|
|
665 |
|
666 |
|
667 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
668 |
|
669 |
|
670 |
|
|
|
25 |
import torch
|
26 |
from transformers import AutoTokenizer, AutoModel
|
27 |
|
28 |
+
import sys
|
29 |
+
sys.path.append("models/")
|
30 |
+
|
31 |
+
from models.selfies_ted.load import SELFIES as bart
|
32 |
+
from models.mhg_model import load as mhg
|
33 |
+
from models.smi_ted.smi_ted_light.load import load_smi_ted
|
34 |
+
|
35 |
+
import mordred
|
36 |
+
from mordred import Calculator, descriptors
|
37 |
+
from rdkit import Chem
|
38 |
+
from rdkit.Chem import AllChem
|
39 |
|
40 |
datasets = {}
|
41 |
models = {}
|
|
|
56 |
|
57 |
|
58 |
models = [{"Name": "bart","Model Name": "SELFIES-TED","Description": "BART model for string based SELFIES modality", "Timestamp": "2024-06-21 12:32:20"},
|
59 |
+
{"Name": "mol-xl","Model Name": "MolFormer", "Description": "MolFormer model for string based SMILES modality", "Timestamp": "2024-06-21 12:35:56"},
|
60 |
{"Name": "mhg", "Model Name": "MHG-GED","Description": "Molecular hypergraph model", "Timestamp": "2024-07-10 00:09:42"},
|
61 |
{"Name": "smi-ted", "Model Name": "SMI-TED","Description": "SMILES based encoder decoder model", "Timestamp": "2024-07-10 00:09:42"}]
|
62 |
|
|
|
66 |
|
67 |
models = [{"Name": "smi-ted", "Model Name": "SMI-TED","Description": "SMILES based encoder decoder model"},
|
68 |
{"Name": "bart","Model Name": "SELFIES-TED","Description": "BART model for string based SELFIES modality"},
|
69 |
+
{"Name": "mol-xl","Model Name": "MolFormer", "Description": "MolFormer model for string based SMILES modality"},
|
70 |
{"Name": "mhg", "Model Name": "MHG-GED","Description": "Molecular hypergraph model"},
|
71 |
+
{"Name": "Mordred", "Model Name": "Mordred","Description": "Baseline: A descriptor-calculation software application that can calculate more than 1800 two- and three-dimensional descriptors"},
|
72 |
+
{"Name": "MorganFingerprint", "Model Name": "MorganFingerprint","Description": "Baseline: Circular atom environments based descriptor"}
|
73 |
]
|
74 |
|
75 |
|
|
|
80 |
|
81 |
return models
|
82 |
|
83 |
+
def avail_downstream_models(raw=False):
    """Return the catalogue of downstream models offered by this module.

    Each call rebuilds the catalogue and stores it in the module-level
    ``downstream_models`` global before returning it.

    Args:
        raw: If true, return the catalogue as a list of dicts. Otherwise
            (the default) return it wrapped in a ``pandas.DataFrame``.

    Returns:
        Either a list of ``{"Name": ..., "Task Type": ...}`` dicts or a
        DataFrame with the columns ``Name`` and ``Task Type``.
    """
    global downstream_models

    # NOTE(review): "Classfication" is misspelled, but it is a runtime value
    # that callers may compare against, so it is kept as-is here -- confirm
    # with the callers before correcting the spelling.
    downstream_models = [
        {"Name": "XGBClassifier", "Task Type": "Classfication"},
        {"Name": "DefaultClassifier", "Task Type": "Classfication"},
        {"Name": "SVR", "Task Type": "Regression"},
        {"Name": "Kernel Ridge", "Task Type": "Regression"},
        {"Name": "Linear Regression", "Task Type": "Regression"},
        {"Name": "DefaultRegressor", "Task Type": "Regression"},
    ]

    if raw:
        return downstream_models
    return pd.DataFrame(downstream_models)
|
97 |
+
|
98 |
+
|
99 |
|
100 |
def avail_datasets():
|
101 |
global datasets
|
|
|
198 |
|
199 |
avail_models_data()
|
200 |
|
201 |
+
|
202 |
+
|
203 |
def get_representation(train_data,test_data,model_type, return_tensor=True):
|
204 |
alias = {"MHG-GED": "mhg", "SELFIES-TED": "bart", "MolFormer": "mol-xl", "Molformer": "mol-xl", "SMI-TED": "smi-ted"}
|
205 |
if model_type in alias.keys():
|
206 |
model_type = alias[model_type]
|
207 |
|
208 |
if model_type == "mhg":
|
209 |
+
model = mhg.load("../models/mhg_model/pickles/mhggnn_pretrained_model_0724_2023.pickle")
|
210 |
with torch.no_grad():
|
211 |
train_emb = model.encode(train_data)
|
212 |
x_batch = torch.stack(train_emb)
|
|
|
218 |
x_batch_test = pd.DataFrame(x_batch_test)
|
219 |
|
220 |
|
|
|
221 |
elif model_type == "bart":
|
222 |
model = bart()
|
223 |
model.load()
|
|
|
225 |
x_batch_test = model.encode(test_data, return_tensor=return_tensor)
|
226 |
|
227 |
elif model_type == "smi-ted":
|
228 |
+
model = load_smi_ted(folder='../models/smi_ted/smi_ted_light', ckpt_filename='smi-ted-Light_40.pt')
|
229 |
with torch.no_grad():
|
230 |
x_batch = model.encode(train_data, return_torch=return_tensor)
|
231 |
x_batch_test = model.encode(test_data, return_torch=return_tensor)
|
|
|
258 |
if not return_tensor:
|
259 |
x_batch = pd.DataFrame(x_batch)
|
260 |
x_batch_test = pd.DataFrame(x_batch_test)
|
261 |
+
|
262 |
+
elif model_type == 'Mordred':
|
263 |
+
all_data = train_data + test_data
|
264 |
+
calc = Calculator(descriptors, ignore_3D=True)
|
265 |
+
mol_list = [Chem.MolFromSmiles(sm) for sm in all_data]
|
266 |
+
x_all = calc.pandas(mol_list)
|
267 |
+
print (f'original mordred fv dim: {x_all.shape}')
|
268 |
+
|
269 |
+
for j in x_all.columns:
|
270 |
+
for k in range(len(x_all[j])):
|
271 |
+
i = x_all.loc[k, j]
|
272 |
+
if type(i) is mordred.error.Missing or type(i) is mordred.error.Error:
|
273 |
+
x_all.loc[k, j] = np.nan
|
274 |
+
|
275 |
+
x_all.dropna(how="any", axis = 1, inplace=True)
|
276 |
+
print (f'Nan excluded mordred fv dim: {x_all.shape}')
|
277 |
+
|
278 |
+
x_batch = x_all.iloc[:len(train_data)]
|
279 |
+
x_batch_test = x_all.iloc[len(train_data):]
|
280 |
+
# print(f'x_batch: {len(x_batch)}, x_batch_test: {len(x_batch_test)}')
|
281 |
+
|
282 |
+
elif model_type == 'MorganFingerprint':
|
283 |
+
params = {'radius':2, 'nBits':1024}
|
284 |
+
|
285 |
+
mol_train = [Chem.MolFromSmiles(sm) for sm in train_data]
|
286 |
+
mol_test = [Chem.MolFromSmiles(sm) for sm in test_data]
|
287 |
+
|
288 |
+
x_batch = []
|
289 |
+
for mol in mol_train:
|
290 |
+
info = {}
|
291 |
+
fp = AllChem.GetMorganFingerprintAsBitVect(mol, **params, bitInfo=info)
|
292 |
+
vector = list(fp)
|
293 |
+
x_batch.append(vector)
|
294 |
+
x_batch = pd.DataFrame(x_batch)
|
295 |
+
|
296 |
+
x_batch_test = []
|
297 |
+
for mol in mol_test:
|
298 |
+
info = {}
|
299 |
+
fp = AllChem.GetMorganFingerprintAsBitVect(mol, **params, bitInfo=info)
|
300 |
+
vector = list(fp)
|
301 |
+
x_batch_test.append(vector)
|
302 |
+
x_batch_test = pd.DataFrame(x_batch_test)
|
303 |
|
304 |
return x_batch, x_batch_test
|
305 |
|
306 |
+
def single_modal(model,dataset=None, downstream_model=None, params=None, x_train=None, x_test=None, y_train=None, y_test=None):
|
307 |
print(model)
|
308 |
+
alias = {"MHG-GED":"mhg", "SELFIES-TED": "bart", "MolFormer":"mol-xl", "Molformer": "mol-xl", "SMI-TED": "smi-ted"}
|
309 |
data = avail_models(raw=True)
|
310 |
df = pd.DataFrame(data)
|
311 |
+
#print(list(df["Name"].values))
|
312 |
+
|
313 |
+
if model in list(df["Name"].values):
|
314 |
+
model_type = model
|
315 |
+
elif alias[model] in list(df["Name"].values):
|
316 |
model_type = alias[model]
|
|
|
|
|
317 |
else:
|
318 |
print("Model not available")
|
319 |
return
|
320 |
+
|
321 |
|
322 |
data = avail_datasets()
|
323 |
df = pd.DataFrame(data)
|
324 |
+
#print(list(df["Dataset"].values))
|
325 |
|
326 |
if dataset in list(df["Dataset"].values):
|
327 |
task = dataset
|
328 |
+
with open(f"representation/{task}_{model_type}.pkl", "rb") as f1:
|
329 |
x_batch, y_batch, x_batch_test, y_batch_test = pickle.load(f1)
|
330 |
print(f" Representation loaded successfully")
|
331 |
+
|
332 |
+
elif x_train==None:
|
333 |
|
334 |
print("Custom Dataset")
|
335 |
#return
|
|
|
347 |
|
348 |
print(f" Representation loaded successfully")
|
349 |
|
350 |
+
else:
|
351 |
|
352 |
+
y_batch = y_train
|
353 |
+
y_batch_test = y_test
|
354 |
+
x_batch, x_batch_test = get_representation(x_train, x_test, model_type)
|
355 |
+
|
356 |
+
# exclude row containing Nan value
|
357 |
+
if isinstance(x_batch, torch.Tensor):
|
358 |
+
x_batch = pd.DataFrame(x_batch)
|
359 |
+
nan_indices = x_batch.index[x_batch.isna().any(axis=1)]
|
360 |
+
if len(nan_indices) > 0:
|
361 |
+
x_batch.dropna(inplace = True)
|
362 |
+
for index in sorted(nan_indices, reverse=True):
|
363 |
+
del y_batch[index]
|
364 |
+
print(f'x_batch Nan index: {nan_indices}')
|
365 |
+
print(f'x_batch shape: {x_batch.shape}, y_batch len: {len(y_batch)}')
|
366 |
+
|
367 |
+
if isinstance(x_batch_test, torch.Tensor):
|
368 |
+
x_batch_test = pd.DataFrame(x_batch_test)
|
369 |
+
nan_indices = x_batch_test.index[x_batch_test.isna().any(axis=1)]
|
370 |
+
if len(nan_indices) > 0:
|
371 |
+
x_batch_test.dropna(inplace = True)
|
372 |
+
for index in sorted(nan_indices, reverse=True):
|
373 |
+
del y_batch_test[index]
|
374 |
+
print(f'x_batch_test Nan index: {nan_indices}')
|
375 |
+
print(f'x_batch_test shape: {x_batch_test.shape}, y_batch_test len: {len(y_batch_test)}')
|
376 |
|
377 |
print(f" Calculating ROC AUC Score ...")
|
378 |
|
379 |
if downstream_model == "XGBClassifier":
|
380 |
+
if params == None:
|
381 |
+
xgb_predict_concat = XGBClassifier()
|
382 |
+
else:
|
383 |
+
xgb_predict_concat = XGBClassifier(**params) # n_estimators=5000, learning_rate=0.01, max_depth=10
|
384 |
xgb_predict_concat.fit(x_batch, y_batch)
|
385 |
|
386 |
y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
|
|
|
390 |
print(f"ROC-AUC Score: {roc_auc:.4f}")
|
391 |
|
392 |
try:
|
393 |
+
with open(f"plot_emb/{task}_{model_type}.pkl", "rb") as f1:
|
394 |
class_0,class_1 = pickle.load(f1)
|
395 |
except:
|
396 |
print("Generating latent plots")
|
397 |
reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
|
398 |
verbose=False)
|
399 |
n_samples = np.minimum(1000, len(x_batch))
|
400 |
+
|
401 |
try:x = y_batch.values[:n_samples]
|
402 |
except: x = y_batch[:n_samples]
|
403 |
index_0 = [index for index in range(len(x)) if x[index] == 0]
|
404 |
index_1 = [index for index in range(len(x)) if x[index] == 1]
|
405 |
|
406 |
+
try:
|
407 |
+
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
408 |
+
class_0 = features_umap[index_0]
|
409 |
+
class_1 = features_umap[index_1]
|
410 |
+
except:
|
411 |
+
class_0 = []
|
412 |
+
class_1 = []
|
413 |
print("Generating latent plots : Done")
|
414 |
|
415 |
#vizualize(roc_auc,fpr, tpr, x_batch, y_batch )
|
|
|
429 |
print(f"ROC-AUC Score: {roc_auc:.4f}")
|
430 |
|
431 |
try:
|
432 |
+
with open(f"plot_emb/{task}_{model_type}.pkl", "rb") as f1:
|
433 |
class_0,class_1 = pickle.load(f1)
|
434 |
except:
|
435 |
print("Generating latent plots")
|
436 |
reducer = umap.UMAP(metric='euclidean', n_neighbors= 10, n_components=2, low_memory=True, min_dist=0.1, verbose=False)
|
437 |
n_samples = np.minimum(1000,len(x_batch))
|
|
|
|
|
|
|
|
|
|
|
438 |
|
439 |
+
try:
|
440 |
+
x = y_batch.values[:n_samples]
|
441 |
+
except:
|
442 |
+
x = y_batch[:n_samples]
|
443 |
+
|
444 |
+
try:
|
445 |
+
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
446 |
+
index_0 = [index for index in range(len(x)) if x[index] == 0]
|
447 |
+
index_1 = [index for index in range(len(x)) if x[index] == 1]
|
448 |
+
|
449 |
+
class_0 = features_umap[index_0]
|
450 |
+
class_1 = features_umap[index_1]
|
451 |
+
except:
|
452 |
+
class_0 = []
|
453 |
+
class_1 = []
|
454 |
+
|
455 |
print("Generating latent plots : Done")
|
456 |
|
457 |
#vizualize(roc_auc,fpr, tpr, x_batch, y_batch )
|
|
|
459 |
result = f"ROC-AUC Score: {roc_auc:.4f}"
|
460 |
|
461 |
return result, roc_auc,fpr, tpr, class_0, class_1
|
462 |
+
|
463 |
elif downstream_model == "SVR":
|
464 |
+
if params == None:
|
465 |
+
regressor = SVR()
|
466 |
+
else:
|
467 |
+
regressor = SVR(**params)
|
468 |
model = TransformedTargetRegressor(regressor= regressor,
|
469 |
transformer = MinMaxScaler(feature_range=(-1, 1))
|
470 |
).fit(x_batch,y_batch)
|
471 |
+
|
472 |
y_prob = model.predict(x_batch_test)
|
473 |
RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
|
474 |
+
|
475 |
print(f"RMSE Score: {RMSE_score:.4f}")
|
476 |
result = f"RMSE Score: {RMSE_score:.4f}"
|
477 |
|
|
|
479 |
reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
|
480 |
verbose=False)
|
481 |
n_samples = np.minimum(1000, len(x_batch))
|
482 |
+
|
483 |
+
try: x = y_batch.values[:n_samples]
|
484 |
+
except: x = y_batch[:n_samples]
|
485 |
#index_0 = [index for index in range(len(x)) if x[index] == 0]
|
486 |
#index_1 = [index for index in range(len(x)) if x[index] == 1]
|
487 |
|
488 |
+
try:
|
489 |
+
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
490 |
+
class_0 = features_umap#[index_0]
|
491 |
+
class_1 = features_umap#[index_1]
|
492 |
+
except:
|
493 |
+
class_0 = []
|
494 |
+
class_1 = []
|
495 |
print("Generating latent plots : Done")
|
496 |
+
|
497 |
return result, RMSE_score,y_batch_test, y_prob, class_0, class_1
|
498 |
|
499 |
elif downstream_model == "Kernel Ridge":
|
500 |
+
if params == None:
|
501 |
+
regressor = KernelRidge()
|
502 |
+
else:
|
503 |
+
regressor = KernelRidge(**params)
|
504 |
model = TransformedTargetRegressor(regressor=regressor,
|
505 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
506 |
).fit(x_batch, y_batch)
|
|
|
516 |
verbose=False)
|
517 |
n_samples = np.minimum(1000, len(x_batch))
|
518 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
519 |
+
try: x = y_batch.values[:n_samples]
|
520 |
+
except: x = y_batch[:n_samples]
|
521 |
# index_0 = [index for index in range(len(x)) if x[index] == 0]
|
522 |
# index_1 = [index for index in range(len(x)) if x[index] == 1]
|
523 |
|
|
|
529 |
|
530 |
|
531 |
elif downstream_model == "Linear Regression":
|
532 |
+
if params == None:
|
533 |
+
regressor = LinearRegression()
|
534 |
+
else:
|
535 |
+
regressor = LinearRegression(**params)
|
536 |
model = TransformedTargetRegressor(regressor=regressor,
|
537 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
538 |
).fit(x_batch, y_batch)
|
|
|
549 |
n_samples = np.minimum(1000, len(x_batch))
|
550 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
551 |
try:x = y_batch.values[:n_samples]
|
552 |
+
except: x = y_batch[:n_samples]
|
553 |
# index_0 = [index for index in range(len(x)) if x[index] == 0]
|
554 |
# index_1 = [index for index in range(len(x)) if x[index] == 1]
|
555 |
|
|
|
578 |
n_samples = np.minimum(1000, len(x_batch))
|
579 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
580 |
try:x = y_batch.values[:n_samples]
|
581 |
+
except: x = y_batch[:n_samples]
|
582 |
# index_0 = [index for index in range(len(x)) if x[index] == 0]
|
583 |
# index_1 = [index for index in range(len(x)) if x[index] == 1]
|
584 |
|
|
|
587 |
print("Generating latent plots : Done")
|
588 |
|
589 |
return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
|
590 |
+
|
591 |
|
592 |
+
def multi_modal(model_list,dataset=None, downstream_model=None,params=None, x_train=None, x_test=None, y_train=None, y_test=None):
|
593 |
+
#print(model_list)
|
|
|
594 |
data = avail_datasets()
|
595 |
df = pd.DataFrame(data)
|
596 |
list(df["Dataset"].values)
|
|
|
598 |
if dataset in list(df["Dataset"].values):
|
599 |
task = dataset
|
600 |
predefined = True
|
601 |
+
elif x_train==None:
|
602 |
predefined = False
|
603 |
components = dataset.split(",")
|
604 |
train_data = pd.read_csv(components[0])[components[2]]
|
|
|
608 |
y_batch_test = pd.read_csv(components[1])[components[3]]
|
609 |
|
610 |
print("Custom Dataset loaded")
|
611 |
+
else:
|
612 |
+
predefined = False
|
613 |
+
y_batch = y_train
|
614 |
+
y_batch_test = y_test
|
615 |
+
train_data = x_train
|
616 |
+
test_data = x_test
|
617 |
|
618 |
data = avail_models(raw=True)
|
619 |
df = pd.DataFrame(data)
|
620 |
list(df["Name"].values)
|
621 |
|
622 |
+
alias = {"MHG-GED":"mhg", "SELFIES-TED": "bart", "MolFormer":"mol-xl", "Molformer": "mol-xl","SMI-TED":"smi-ted", "Mordred": "Mordred", "MorganFingerprint": "MorganFingerprint"}
|
623 |
#if set(model_list).issubset(list(df["Name"].values)):
|
624 |
if set(model_list).issubset(list(alias.keys())):
|
625 |
for i, model in enumerate(model_list):
|
|
|
630 |
|
631 |
if i == 0:
|
632 |
if predefined:
|
633 |
+
with open(f"representation/{task}_{model_type}.pkl", "rb") as f1:
|
634 |
x_batch, y_batch, x_batch_test, y_batch_test = pickle.load(f1)
|
635 |
print(f" Loaded representation/{task}_{model_type}.pkl")
|
636 |
else:
|
|
|
640 |
|
641 |
else:
|
642 |
if predefined:
|
643 |
+
with open(f"representation/{task}_{model_type}.pkl", "rb") as f1:
|
644 |
x_batch_1, y_batch_1, x_batch_test_1, y_batch_test_1 = pickle.load(f1)
|
645 |
print(f" Loaded representation/{task}_{model_type}.pkl")
|
646 |
else:
|
|
|
651 |
x_batch = pd.concat([x_batch, x_batch_1], axis=1)
|
652 |
x_batch_test = pd.concat([x_batch_test, x_batch_test_1], axis=1)
|
653 |
|
|
|
654 |
else:
|
655 |
print("Model not available")
|
656 |
return
|
|
|
660 |
|
661 |
num_columns = x_batch.shape[1]
|
662 |
x_batch.columns = [f'{i + 1}' for i in range(num_columns)]
|
663 |
+
|
664 |
+
# exclude row containing Nan value
|
665 |
+
if isinstance(x_batch, torch.Tensor):
|
666 |
+
x_batch = pd.DataFrame(x_batch)
|
667 |
+
nan_indices = x_batch.index[x_batch.isna().any(axis=1)]
|
668 |
+
if len(nan_indices) > 0:
|
669 |
+
x_batch.dropna(inplace = True)
|
670 |
+
for index in sorted(nan_indices, reverse=True):
|
671 |
+
del y_batch[index]
|
672 |
+
print(f'x_batch Nan index: {nan_indices}')
|
673 |
+
print(f'x_batch shape: {x_batch.shape}, y_batch len: {len(y_batch)}')
|
674 |
+
|
675 |
+
if isinstance(x_batch_test, torch.Tensor):
|
676 |
+
x_batch_test = pd.DataFrame(x_batch_test)
|
677 |
+
nan_indices = x_batch_test.index[x_batch_test.isna().any(axis=1)]
|
678 |
+
if len(nan_indices) > 0:
|
679 |
+
x_batch_test.dropna(inplace = True)
|
680 |
+
for index in sorted(nan_indices, reverse=True):
|
681 |
+
del y_batch_test[index]
|
682 |
+
print(f'x_batch_test Nan index: {nan_indices}')
|
683 |
+
print(f'x_batch_test shape: {x_batch_test.shape}, y_batch_test len: {len(y_batch_test)}')
|
684 |
|
685 |
print(f"Representations loaded successfully")
|
686 |
try:
|
687 |
+
with open(f"plot_emb/{task}_multi.pkl", "rb") as f1:
|
688 |
class_0, class_1 = pickle.load(f1)
|
689 |
except:
|
690 |
print("Generating latent plots")
|
|
|
694 |
features_umap = reducer.fit_transform(x_batch[:n_samples])
|
695 |
|
696 |
if "Classifier" in downstream_model:
|
697 |
+
try: x = y_batch.values[:n_samples]
|
698 |
+
except: x = y_batch[:n_samples]
|
699 |
index_0 = [index for index in range(len(x)) if x[index] == 0]
|
700 |
index_1 = [index for index in range(len(x)) if x[index] == 1]
|
701 |
|
|
|
712 |
|
713 |
|
714 |
if downstream_model == "XGBClassifier":
|
715 |
+
if params == None:
|
716 |
+
xgb_predict_concat = XGBClassifier()
|
717 |
+
else:
|
718 |
+
xgb_predict_concat = XGBClassifier(**params)#n_estimators=5000, learning_rate=0.01, max_depth=10)
|
719 |
xgb_predict_concat.fit(x_batch, y_batch)
|
720 |
|
721 |
y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
|
|
|
753 |
return result, roc_auc,fpr, tpr, class_0, class_1
|
754 |
|
755 |
elif downstream_model == "SVR":
|
756 |
+
if params == None:
|
757 |
+
regressor = SVR()
|
758 |
+
else:
|
759 |
+
regressor = SVR(**params)
|
760 |
model = TransformedTargetRegressor(regressor= regressor,
|
761 |
transformer = MinMaxScaler(feature_range=(-1, 1))
|
762 |
).fit(x_batch,y_batch)
|
763 |
+
|
764 |
y_prob = model.predict(x_batch_test)
|
765 |
RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
|
766 |
+
|
767 |
print(f"RMSE Score: {RMSE_score:.4f}")
|
768 |
result = f"RMSE Score: {RMSE_score:.4f}"
|
769 |
+
|
770 |
return result, RMSE_score,y_batch_test, y_prob, class_0, class_1
|
771 |
|
772 |
elif downstream_model == "Linear Regression":
|
773 |
+
if params == None:
|
774 |
+
regressor = LinearRegression()
|
775 |
+
else:
|
776 |
+
regressor = LinearRegression(**params)
|
777 |
model = TransformedTargetRegressor(regressor=regressor,
|
778 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
779 |
).fit(x_batch, y_batch)
|
|
|
787 |
return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
|
788 |
|
789 |
elif downstream_model == "Kernel Ridge":
|
790 |
+
if params == None:
|
791 |
+
regressor = KernelRidge()
|
792 |
+
else:
|
793 |
+
regressor = KernelRidge(**params)
|
794 |
model = TransformedTargetRegressor(regressor=regressor,
|
795 |
transformer=MinMaxScaler(feature_range=(-1, 1))
|
796 |
).fit(x_batch, y_batch)
|
|
|
819 |
|
820 |
|
821 |
|
822 |
+
def finetune_optuna(x_batch,y_batch, x_batch_test, y_test ):
    """Prepare an Optuna objective that tunes XGBoost and scores it by ROC AUC.

    The four inputs are converted to arrays through their ``.values``
    attribute, so pandas objects are expected.

    NOTE(review): the nested ``objective`` is only defined here. Nothing in
    this function passes it to an Optuna study, and the function returns
    ``None`` -- confirm whether the study creation was meant to live here.

    Args:
        x_batch: training features (object exposing ``.values``).
        y_batch: training labels (object exposing ``.values``).
        x_batch_test: test features (object exposing ``.values``).
        y_test: test labels (object exposing ``.values``).
    """
    print(f" Finetuning with Optuna and calculating ROC AUC Score ...")
    X_train = x_batch.values
    y_train = y_batch.values
    X_test = x_batch_test.values
    y_test = y_test.values

    def objective(trial):
        """Train one booster with sampled parameters and return its test ROC AUC."""
        # The native ``xgb.train`` API does not read 'n_estimators' from the
        # params dict; the number of boosting rounds has to be passed as
        # ``num_boost_round``. The trial parameter keeps its original name.
        num_boost_round = trial.suggest_int('n_estimators', 1000, 10000)

        # Define parameters to be optimized.
        # No 'objective' entry is set, so XGBoost's default objective is used
        # and the raw predictions serve as ranking scores for the AUC.
        # Other candidates that were considered but left disabled: booster,
        # lambda, eta, gamma, grow_policy, subsample, colsample_bytree.
        params = {
            'eval_metric': 'auc',
            'verbosity': 0,
            # suggest_float(..., log=True) replaces the deprecated
            # suggest_loguniform with the same sampling distribution.
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
            'max_depth': trial.suggest_int('max_depth', 1, 12),
        }

        # Train XGBoost model
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)

        model = xgb.train(params, dtrain, num_boost_round=num_boost_round)

        # Predict scores for the test set
        y_pred = model.predict(dtest)

        # Calculate ROC AUC score
        roc_auc = roc_auc_score(y_test, y_pred)
        print("ROC_AUC : ", roc_auc)

        return roc_auc
|
860 |
+
|
861 |
+
def add_new_model():
    """Show a widget form that registers a new model entry in ``models.json``.

    The form has Name / Description / Path text fields, a Browse button that
    opens a native file dialog to fill the Path field, and a Submit button.
    On submit, a record with the name, description and current timestamp is
    appended to the list returned by ``avail_models(raw=True)``, the whole
    list is written to ``models.json``, and ``list_models()`` is called.

    NOTE(review): the selected path is collected but not stored in the record
    (the ``"path"`` key is disabled below) -- confirm this is intended.
    """
    models = avail_models(raw=True)

    # Function to display models
    def display_models():
        for model in models:
            model_display = f"Name: {model['Name']}, Description: {model['Description']}, Timestamp: {model['Timestamp']}"
            print(model_display)

    # Function to update models
    def update_models(new_name, new_description, new_path):
        new_model = {
            "Name": new_name,
            "Description": new_description,
            "Timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            #"path": new_path
        }
        models.append(new_model)
        with open("models.json", "w") as outfile:
            json.dump(models, outfile)

        print("Model uploaded and updated successfully!")
        list_models()

    # Widgets
    name_text = widgets.Text(description="Name:", layout=Layout(width='50%'))
    description_text = widgets.Text(description="Description:", layout=Layout(width='50%'))
    path_text = widgets.Text(description="Path:", layout=Layout(width='50%'))

    def browse_callback(b):
        root = tk.Tk()
        try:
            root.withdraw()  # Hide the main window
            file_path = filedialog.askopenfilename(title="Select a Model File")
        finally:
            # Each click creates a fresh Tk root; destroy it so hidden
            # root windows do not accumulate across clicks.
            root.destroy()
        if file_path:
            path_text.value = file_path

    browse_button = widgets.Button(description="Browse")
    browse_button.on_click(browse_callback)

    def submit_callback(b):
        update_models(name_text.value, description_text.value, path_text.value)

    submit_button = widgets.Button(description="Submit")
    submit_button.on_click(submit_callback)

    # Display widgets
    display(VBox([name_text, description_text, path_text, browse_button, submit_button]))
|
909 |
+
|
910 |
+
|
911 |
+
def add_new_dataset():
    """Show a widget form that registers a new dataset entry in ``datasets.json``.

    The form has Dataset / Input / Output / Path text fields, a Browse button
    that opens a native file dialog to fill the Path field, and a Submit
    button. On submit, a record is appended to the list returned by
    ``avail_datasets()``, the whole list is written to ``datasets.json``, and
    ``list_data()`` is called. Only the base name of the chosen path is stored.
    """
    datasets = avail_datasets()

    # Function to display datasets
    def display_datasets():
        for dataset in datasets:
            dataset_display = f"Name: {dataset['Dataset']}, Input: {dataset['Input']},Output: {dataset['Output']},Path: {dataset['Path']}, Timestamp: {dataset['Timestamp']}"
            # The line was built but never shown; print it, matching the
            # equivalent helper used for models.
            print(dataset_display)

    # Function to update datasets
    def update_datasets(new_dataset, new_input, new_output, new_path):
        new_entry = {
            "Dataset": new_dataset,
            "Input": new_input,
            "Output": new_output,
            "Timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "Path": os.path.basename(new_path)
        }
        datasets.append(new_entry)
        with open("datasets.json", "w") as outfile:
            json.dump(datasets, outfile)

        print("Dataset uploaded and updated successfully!")
        list_data()

    # Widgets
    dataset_text = widgets.Text(description="Dataset:", layout=Layout(width='50%'))
    input_text = widgets.Text(description="Input:", layout=Layout(width='50%'))
    output_text = widgets.Text(description="Output:", layout=Layout(width='50%'))
    path_text = widgets.Text(description="Path:", layout=Layout(width='50%'))

    def browse_callback(b):
        root = tk.Tk()
        try:
            root.withdraw()  # Hide the main window
            file_path = filedialog.askopenfilename(title="Select a Dataset File")
        finally:
            # Each click creates a fresh Tk root; destroy it so hidden
            # root windows do not accumulate across clicks.
            root.destroy()
        if file_path:
            path_text.value = file_path

    browse_button = widgets.Button(description="Browse")
    browse_button.on_click(browse_callback)

    def submit_callback(b):
        update_datasets(dataset_text.value, input_text.value, output_text.value, path_text.value)

    submit_button = widgets.Button(description="Submit")
    submit_button.on_click(submit_callback)

    display(VBox([dataset_text, input_text, output_text, path_text, browse_button, submit_button]))
|
960 |
|
961 |
|
962 |
|
models/mhg_model/README.md
CHANGED
@@ -27,7 +27,7 @@ In addition, the decoder inherits the theoretical guarantee of MHG on always gen
|
|
27 |
|
28 |
### Pretrained Models and Training Logs
|
29 |
|
30 |
-
We provide checkpoints of the MHG-GNN model pre-trained on a dataset of ~1.34M molecules curated from PubChem. (later) For model weights: [HuggingFace Link]()
|
31 |
|
32 |
Add the MHG-GNN `pre-trained weights.pt` to the `models/` directory according to your needs.
|
33 |
|
|
|
27 |
|
28 |
### Pretrained Models and Training Logs
|
29 |
|
30 |
+
We provide checkpoints of the MHG-GNN model pre-trained on a dataset of ~1.34M molecules curated from PubChem. For model weights: [HuggingFace Link](https://huggingface.co/ibm/materials.mhg-ged/blob/main/mhggnn_pretrained_model_0724_2023.pickle)
|
31 |
|
32 |
Add the MHG-GNN `pre-trained weights.pt` to the `models/` directory according to your needs.
|
33 |
|
models/mhg_model/images/mhg_example.png
CHANGED
Git LFS Details
|
models/mhg_model/images/mhg_example1.png
CHANGED
Git LFS Details
|
models/mhg_model/images/mhg_example2.png
CHANGED
Git LFS Details
|
models/mhg_model/load.py
CHANGED
@@ -17,6 +17,7 @@ from typing_extensions import Self
|
|
17 |
|
18 |
from .graph_grammar.io.smi import hg_to_mol
|
19 |
from .models.mhgvae import GrammarGINVAE
|
|
|
20 |
from huggingface_hub import hf_hub_download
|
21 |
|
22 |
|
@@ -73,12 +74,30 @@ class PretrainedModelWrapper:
|
|
73 |
return output
|
74 |
|
75 |
|
76 |
-
def load(model_name: str = "
|
77 |
PretrainedModelWrapper]:
|
|
|
78 |
repo_id = "ibm/materials.mhg-ged"
|
79 |
-
filename = "mhggnn_pretrained_model_0724_2023.pickle"
|
80 |
file_path = hf_hub_download(repo_id=repo_id, filename=filename)
|
81 |
with open(file_path, "rb") as f:
|
82 |
-
model_dict =
|
83 |
return PretrainedModelWrapper(model_dict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
return None
|
|
|
17 |
|
18 |
from .graph_grammar.io.smi import hg_to_mol
|
19 |
from .models.mhgvae import GrammarGINVAE
|
20 |
+
|
21 |
from huggingface_hub import hf_hub_download
|
22 |
|
23 |
|
|
|
74 |
return output
|
75 |
|
76 |
|
77 |
+
def load(model_name: str = "mhg_model/pickles/mhggnn_pretrained_model_0724_2023.pickle") -> Optional[
    PretrainedModelWrapper]:
    """Download the pretrained MHG checkpoint from the Hugging Face Hub and wrap it.

    The checkpoint is always ``pytorch_model.bin`` from the
    ``ibm/materials.mhg-ged`` repository.

    Args:
        model_name: kept for backward compatibility; it is not used by the
            current implementation (it was only read by the local-file
            loading path, which had been disabled and was unreachable).

    Returns:
        A ``PretrainedModelWrapper`` built from the loaded checkpoint object.
    """
    repo_id = "ibm/materials.mhg-ged"
    filename = "pytorch_model.bin"  # previously "mhggnn_pretrained_model_0724_2023.pickle"
    file_path = hf_hub_download(repo_id=repo_id, filename=filename)
    with open(file_path, "rb") as f:
        # NOTE(review): torch.load unpickles the file contents; this is only
        # acceptable because the source is a fixed, trusted Hub repository.
        model_dict = torch.load(f)
    return PretrainedModelWrapper(model_dict)
|
models/mhg_model/paper/MHG-GNN_Combination of Molecular Hypergraph Grammar with Graph Neural Network.pdf
CHANGED
Binary files a/models/mhg_model/paper/MHG-GNN_Combination of Molecular Hypergraph Grammar with Graph Neural Network.pdf and b/models/mhg_model/paper/MHG-GNN_Combination of Molecular Hypergraph Grammar with Graph Neural Network.pdf differ
|
|
models/selfies_model/selfies-ted.png
CHANGED
Git LFS Details
|
models/selfies_ted/README.md
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: apache-2.0
|
3 |
+
library_name: transformers
|
4 |
+
pipeline_tag: feature-extraction
|
5 |
+
tags:
|
6 |
+
- chemistry
|
7 |
+
---
|
8 |
+
|
9 |
+
# selfies-ted
|
10 |
+
|
11 |
+
selfies-ted is a project for encoding SMILES (Simplified Molecular Input Line Entry System) into SELFIES (SELF-referencing Embedded Strings) and generating embeddings for molecular representations.
|
12 |
+
|
13 |
+
![selfies-ted](selfies-ted.png)
|
14 |
+
## Model Architecture
|
15 |
+
|
16 |
+
Configuration details
|
17 |
+
|
18 |
+
Encoder and Decoder FFN dimensions: 256
|
19 |
+
Number of attention heads: 4
|
20 |
+
Number of encoder and decoder layers: 2
|
21 |
+
Total number of hidden layers: 6
|
22 |
+
Maximum position embeddings: 128
|
23 |
+
Model dimension (d_model): 256
|
24 |
+
|
25 |
+
## Pretrained Models and Training Logs
|
26 |
+
We provide checkpoints of the selfies-ted model pre-trained on a dataset of molecules curated from PubChem. The pre-trained model shows competitive performance on molecular representation tasks. For model weights: [HuggingFace link](https://huggingface.co/ibm/materials.selfies-ted).
|
27 |
+
|
28 |
+
To install and use the pre-trained model:
|
29 |
+
|
30 |
+
Download the selfies_ted_model.pkl file from the "HuggingFace link".
|
31 |
+
Add the selfies-ted selfies_ted_model.pkl to the models/ directory. The directory structure should look like the following:
|
32 |
+
|
33 |
+
```
|
34 |
+
models/
|
35 |
+
└── selfies_ted_model.pkl
|
36 |
+
```
|
37 |
+
|
38 |
+
## Installation
|
39 |
+
|
40 |
+
To use this project, you'll need to install the required dependencies. We recommend using a virtual environment:
|
41 |
+
|
42 |
+
```bash
|
43 |
+
python -m venv venv
|
44 |
+
source venv/bin/activate # On Windows use `venv\Scripts\activate`
|
45 |
+
```
|
46 |
+
|
47 |
+
Install the required dependencies
|
48 |
+
|
49 |
+
```
|
50 |
+
pip install -r requirements.txt
|
51 |
+
```
|
52 |
+
|
53 |
+
|
54 |
+
## Usage
|
55 |
+
|
56 |
+
### Import
|
57 |
+
|
58 |
+
```
|
59 |
+
import load
|
60 |
+
```
|
61 |
+
### Training the Model
|
62 |
+
|
63 |
+
To train the model, use the train.py script:
|
64 |
+
|
65 |
+
```
|
66 |
+
python train.py -f <path_to_your_data_file>
|
67 |
+
```
|
68 |
+
|
69 |
+
|
70 |
+
Note: The actual usage may depend on the specific implementation in load.py. Please refer to the source code for detailed functionality.
|
71 |
+
|
72 |
+
### Load the model and tokenizer
|
73 |
+
```
|
74 |
+
load.load("path/to/checkpoint.pkl")
|
75 |
+
```
|
76 |
+
### Encode SMILES strings
|
77 |
+
```
|
78 |
+
smiles_list = ["COC", "CCO"]
|
79 |
+
```
|
80 |
+
```
|
81 |
+
embeddings = load.encode(smiles_list)
|
82 |
+
```
|
83 |
+
|
84 |
+
|
85 |
+
## Example Notebook
|
86 |
+
|
87 |
+
An example notebook for this project is available in `selfies-ted-example.ipynb`.
|
models/selfies_ted/load.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import torch
|
4 |
+
import selfies as sf # selfies>=2.1.1
|
5 |
+
import pickle
|
6 |
+
import pandas as pd
|
7 |
+
import numpy as np
|
8 |
+
from datasets import Dataset
|
9 |
+
from rdkit import Chem
|
10 |
+
from transformers import AutoTokenizer, AutoModel
|
11 |
+
|
12 |
+
|
13 |
+
class SELFIES(torch.nn.Module):
    """Encode SMILES strings into embeddings via SELFIES and a pretrained model.

    Call ``load()`` once to fetch the tokenizer and model, then ``encode()``
    to turn a list of SMILES strings into one embedding row per input.
    """

    def __init__(self):
        super().__init__()
        self.model = None
        self.tokenizer = None
        # Positions (within the last processed input) that could not be
        # converted to SELFIES.
        self.invalid = []

    def get_selfies(self, smiles_list):
        """Convert SMILES strings to space-separated SELFIES token strings.

        Each input is first encoded directly; on failure it is round-tripped
        through RDKit and encoded again. Inputs that still fail are replaced
        by the placeholder ``"[]"`` and their position is recorded in
        ``self.invalid`` (which is reset on every call).

        Returns:
            A list with one SELFIES string per input, tokens separated by a
            single space.
        """
        self.invalid = []
        spaced_selfies_batch = []
        for i, smiles in enumerate(smiles_list):
            try:
                selfies = sf.encoder(smiles.rstrip())
            except Exception:
                # Catch Exception rather than everything so that
                # KeyboardInterrupt / SystemExit are not swallowed.
                try:
                    smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles.rstrip()))
                    selfies = sf.encoder(smiles)
                except Exception:
                    selfies = "[]"
                    self.invalid.append(i)

            spaced_selfies_batch.append(selfies.replace('][', '] ['))

        return spaced_selfies_batch

    def get_embedding(self, selfies):
        """Compute mask-weighted mean embeddings for a batch of SELFIES strings.

        Args:
            selfies: a batch mapping with a ``"selfies"`` entry, as passed by
                ``datasets.Dataset.map(..., batched=True)`` in ``encode``.

        Returns:
            The tokenizer output with ``input_ids`` / ``attention_mask``
            removed and an ``"embedding"`` entry added.
        """
        encoding = self.tokenizer(selfies["selfies"], return_tensors='pt', max_length=128, truncation=True, padding='max_length')
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']

        # Inference only: skip autograd bookkeeping to save memory.
        with torch.no_grad():
            outputs = self.model.encoder(input_ids=input_ids, attention_mask=attention_mask)
            model_output = outputs.last_hidden_state

            # Average the token states, counting only non-padding positions.
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
            sum_embeddings = torch.sum(model_output * input_mask_expanded, 1)
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            model_output = sum_embeddings / sum_mask

        del encoding['input_ids']
        del encoding['attention_mask']

        encoding["embedding"] = model_output

        return encoding

    def load(self, checkpoint="bart-2908.pickle"):
        """Load the tokenizer and model from ``ibm/materials.selfies-ted``.

        Args:
            checkpoint: accepted for backward compatibility; it is not used,
                the weights always come from the Hugging Face repository.
        """
        self.tokenizer = AutoTokenizer.from_pretrained("ibm/materials.selfies-ted")
        self.model = AutoModel.from_pretrained("ibm/materials.selfies-ted")

    # TODO: remove `use_gpu` argument in validation pipeline
    def encode(self, smiles_list=None, use_gpu=False, return_tensor=False):
        """Embed a collection of SMILES strings.

        Args:
            smiles_list: iterable of SMILES strings; ``None`` means empty.
            use_gpu: unused (see TODO above).
            return_tensor: return a ``torch.Tensor`` instead of a DataFrame.

        Returns:
            One embedding row per input, as a ``pandas.DataFrame`` or a
            ``torch.Tensor``. Rows for inputs that could not be converted to
            SELFIES are filled with NaN.
        """
        # Avoid a shared mutable default, and materialise the input so the
        # positional indices stored in self.invalid can be used to look items
        # up below (a pandas Series would do label-based lookup instead).
        smiles_list = [] if smiles_list is None else list(smiles_list)

        selfies = self.get_selfies(smiles_list)
        selfies_df = pd.DataFrame(selfies,columns=["selfies"])
        data = Dataset.from_pandas(selfies_df)
        embedding = data.map(self.get_embedding, batched=True, num_proc=1, batch_size=128)
        emb = np.asarray(embedding["embedding"].copy())

        for idx in self.invalid:
            emb[idx] = np.nan
            print("Cannot encode {0} to selfies and embedding replaced by NaN".format(smiles_list[idx]))

        if return_tensor:
            return torch.tensor(emb)
        return pd.DataFrame(emb)
|
models/selfies_ted/requirements.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch>=2.1.0
|
2 |
+
transformers>=4.38
|
3 |
+
numpy>=1.26.1
|
4 |
+
datasets>=2.13.1
|
5 |
+
evaluate>=0.4.0
|
6 |
+
selfies>=2.1.0
|
7 |
+
scikit-learn>=1.2.1
|
8 |
+
pyarrow>=14.0.1
|
9 |
+
requests>=2.31.0
|
10 |
+
urllib3>=2.0.7
|
11 |
+
aiohttp>=3.9.0
|
12 |
+
zipp>=3.17.0
|
models/selfies_ted/selfies-ted-example.ipynb
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "9d9b6eb8-9edb-44bd-9e5a-3a6ea67f5117",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"### Import library"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "code",
|
13 |
+
"execution_count": 1,
|
14 |
+
"id": "c3ac4418",
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [],
|
17 |
+
"source": [
|
18 |
+
"from load import SELFIES"
|
19 |
+
]
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"cell_type": "markdown",
|
23 |
+
"id": "790061cf-5470-4564-987e-aa2e492337db",
|
24 |
+
"metadata": {},
|
25 |
+
"source": [
|
26 |
+
"### Initialize and load"
|
27 |
+
]
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"cell_type": "code",
|
31 |
+
"execution_count": 2,
|
32 |
+
"id": "85847f26-e2f4-475a-a88e-41fd9cccfc0f",
|
33 |
+
"metadata": {},
|
34 |
+
"outputs": [],
|
35 |
+
"source": [
|
36 |
+
"model = SELFIES()"
|
37 |
+
]
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"cell_type": "code",
|
41 |
+
"execution_count": 3,
|
42 |
+
"id": "095e864c",
|
43 |
+
"metadata": {
|
44 |
+
"scrolled": true
|
45 |
+
},
|
46 |
+
"outputs": [],
|
47 |
+
"source": [
|
48 |
+
"model.load(checkpoint=\"bart-2908.pickle\")"
|
49 |
+
]
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"cell_type": "markdown",
|
53 |
+
"id": "55f1a68c-c462-4dee-9139-9befb469f176",
|
54 |
+
"metadata": {},
|
55 |
+
"source": [
|
56 |
+
"### Example to get embeddings"
|
57 |
+
]
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"cell_type": "code",
|
61 |
+
"execution_count": 4,
|
62 |
+
"id": "2357ef0a",
|
63 |
+
"metadata": {},
|
64 |
+
"outputs": [
|
65 |
+
{
|
66 |
+
"data": {
|
67 |
+
"application/vnd.jupyter.widget-view+json": {
|
68 |
+
"model_id": "b494cbf9878a4f5c8f4093e38fb82fd5",
|
69 |
+
"version_major": 2,
|
70 |
+
"version_minor": 0
|
71 |
+
},
|
72 |
+
"text/plain": [
|
73 |
+
"Map: 0%| | 0/3 [00:00<?, ? examples/s]"
|
74 |
+
]
|
75 |
+
},
|
76 |
+
"metadata": {},
|
77 |
+
"output_type": "display_data"
|
78 |
+
}
|
79 |
+
],
|
80 |
+
"source": [
|
81 |
+
"smiles_list = [\"CCO\", \"O=C=O\", \"OC(=O)c1ccccc1C(=O)O\"]\n",
|
82 |
+
"embeddings = model.encode(smiles_list)"
|
83 |
+
]
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"cell_type": "code",
|
87 |
+
"execution_count": 5,
|
88 |
+
"id": "3871c513-d0a9-4e70-9c18-3f0b491e07b2",
|
89 |
+
"metadata": {},
|
90 |
+
"outputs": [
|
91 |
+
{
|
92 |
+
"data": {
|
93 |
+
"text/plain": [
|
94 |
+
"(3, 1024)"
|
95 |
+
]
|
96 |
+
},
|
97 |
+
"execution_count": 5,
|
98 |
+
"metadata": {},
|
99 |
+
"output_type": "execute_result"
|
100 |
+
}
|
101 |
+
],
|
102 |
+
"source": [
|
103 |
+
"embeddings.shape"
|
104 |
+
]
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"cell_type": "code",
|
108 |
+
"execution_count": null,
|
109 |
+
"id": "289a8795-d6d8-4828-b2b2-b4d4a97a4604",
|
110 |
+
"metadata": {},
|
111 |
+
"outputs": [],
|
112 |
+
"source": []
|
113 |
+
}
|
114 |
+
],
|
115 |
+
"metadata": {
|
116 |
+
"kernelspec": {
|
117 |
+
"display_name": "Python 3 (ipykernel)",
|
118 |
+
"language": "python",
|
119 |
+
"name": "python3"
|
120 |
+
},
|
121 |
+
"language_info": {
|
122 |
+
"codemirror_mode": {
|
123 |
+
"name": "ipython",
|
124 |
+
"version": 3
|
125 |
+
},
|
126 |
+
"file_extension": ".py",
|
127 |
+
"mimetype": "text/x-python",
|
128 |
+
"name": "python",
|
129 |
+
"nbconvert_exporter": "python",
|
130 |
+
"pygments_lexer": "ipython3",
|
131 |
+
"version": "3.10.8"
|
132 |
+
}
|
133 |
+
},
|
134 |
+
"nbformat": 4,
|
135 |
+
"nbformat_minor": 5
|
136 |
+
}
|
models/selfies_ted/selfies-ted.png
ADDED
Git LFS Details
|
models/smi_ted/.gitignore
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Model weights
|
2 |
+
inference/smi_ted_light/smi-ted-Light_40.pt
|
3 |
+
|
4 |
+
# pyenv
|
5 |
+
.python-version
|
6 |
+
|
7 |
+
# Environments
|
8 |
+
.env
|
9 |
+
.venv
|
10 |
+
env/
|
11 |
+
venv/
|
12 |
+
ENV/
|
13 |
+
env.bak/
|
14 |
+
venv.bak/
|
15 |
+
|
16 |
+
# editor files
|
17 |
+
.vscode/
|
18 |
+
.DS_Store
|
models/smi_ted/README.md
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# SMILES-based Transformer Encoder-Decoder (SMI-TED)
|
2 |
+
|
3 |
+
This repository provides PyTorch source code associated with our publication, "A Large Encoder-Decoder Family of Foundation Models for Chemical Language".
|
4 |
+
|
5 |
+
**Paper:** [Arxiv Link](https://arxiv.org/abs/2407.20267)
|
6 |
+
|
7 |
+
**HuggingFace:** [HuggingFace Link](https://huggingface.co/ibm/materials.smi-ted)
|
8 |
+
|
9 |
+
For more information contact: [email protected] or [email protected].
|
10 |
+
|
11 |
+
![ted-smi](images/smi-ted.png)
|
12 |
+
|
13 |
+
## Introduction
|
14 |
+
|
15 |
+
We present a large encoder-decoder chemical foundation model, SMILES-based Transformer Encoder-Decoder (SMI-TED), pre-trained on a curated dataset of 91 million SMILES samples sourced from PubChem, equivalent to 4 billion molecular tokens. SMI-TED supports various complex tasks, including quantum property prediction, with two main variants ($289M$ and $8 \times 289M$). Our experiments across multiple benchmark datasets demonstrate state-of-the-art performance for various tasks. Model weights are available at: [HuggingFace Link](https://huggingface.co/ibm/materials.smi-ted).
|
16 |
+
|
17 |
+
## Table of Contents
|
18 |
+
|
19 |
+
1. [Getting Started](#getting-started)
|
20 |
+
1. [Pretrained Models and Training Logs](#pretrained-models-and-training-logs)
|
21 |
+
2. [Replicating Conda Environment](#replicating-conda-environment)
|
22 |
+
2. [Pretraining](#pretraining)
|
23 |
+
3. [Finetuning](#finetuning)
|
24 |
+
4. [Feature Extraction](#feature-extraction)
|
25 |
+
5. [Citations](#citations)
|
26 |
+
|
27 |
+
## Getting Started
|
28 |
+
|
29 |
+
**This code and environment have been tested on Nvidia V100s and Nvidia A100s**
|
30 |
+
|
31 |
+
### Pretrained Models and Training Logs
|
32 |
+
|
33 |
+
We provide checkpoints of the SMI-TED model pre-trained on a dataset of ~91M molecules curated from PubChem. The pre-trained model shows competitive performance on classification and regression benchmarks from MoleculeNet. For model weights: [HuggingFace Link](https://huggingface.co/ibm/materials.smi-ted)
|
34 |
+
|
35 |
+
Add the SMI-TED `pre-trained weights.pt` to the `inference/` or `finetune/` directory according to your needs. The directory structure should look like the following:
|
36 |
+
|
37 |
+
```
|
38 |
+
inference/
|
39 |
+
├── smi_ted_light
|
40 |
+
│ ├── smi_ted_light.pt
|
41 |
+
│ ├── bert_vocab_curated.txt
|
42 |
+
│ └── load.py
|
43 |
+
```
|
44 |
+
and/or:
|
45 |
+
|
46 |
+
```
|
47 |
+
finetune/
|
48 |
+
├── smi_ted_light
|
49 |
+
│ ├── smi_ted_light.pt
|
50 |
+
│ ├── bert_vocab_curated.txt
|
51 |
+
│ └── load.py
|
52 |
+
```
|
53 |
+
|
54 |
+
### Replicating Conda Environment
|
55 |
+
|
56 |
+
Follow these steps to replicate our Conda environment and install the necessary libraries:
|
57 |
+
|
58 |
+
#### Create and Activate Conda Environment
|
59 |
+
|
60 |
+
```
|
61 |
+
conda create --name smi-ted-env python=3.10
|
62 |
+
conda activate smi-ted-env
|
63 |
+
```
|
64 |
+
|
65 |
+
#### Install Packages with Conda
|
66 |
+
|
67 |
+
```
|
68 |
+
conda install pytorch=2.1.0 pytorch-cuda=11.8 -c pytorch -c nvidia
|
69 |
+
```
|
70 |
+
|
71 |
+
#### Install Packages with Pip
|
72 |
+
|
73 |
+
```
|
74 |
+
pip install -r requirements.txt
|
75 |
+
pip install pytorch-fast-transformers
|
76 |
+
```
|
77 |
+
|
78 |
+
## Pretraining
|
79 |
+
|
80 |
+
For pretraining, we use two strategies: the masked language model method to train the encoder part and an encoder-decoder strategy to refine SMILES reconstruction and improve the generated latent space.
|
81 |
+
|
82 |
+
SMI-TED is pre-trained on canonicalized and curated 91M SMILES from PubChem with the following constraints:
|
83 |
+
|
84 |
+
- Compounds are filtered to a maximum length of 202 tokens during preprocessing.
|
85 |
+
- A 95/5/0 split is used for encoder training, with 5% of the data for decoder pretraining.
|
86 |
+
- A 100/0/0 split is also used to train the encoder and decoder directly, enhancing model performance.
|
87 |
+
|
88 |
+
The pretraining code provides examples of data processing and model training on a smaller dataset, requiring 8 A100 GPUs.
|
89 |
+
|
90 |
+
To pre-train the two variants of the SMI-TED model, run:
|
91 |
+
|
92 |
+
```
|
93 |
+
bash training/run_model_light_training.sh
|
94 |
+
```
|
95 |
+
or
|
96 |
+
```
|
97 |
+
bash training/run_model_large_training.sh
|
98 |
+
```
|
99 |
+
|
100 |
+
Use `train_model_D.py` to train only the decoder or `train_model_ED.py` to train both the encoder and decoder.
|
101 |
+
|
102 |
+
## Finetuning
|
103 |
+
|
104 |
+
The finetuning datasets and environment can be found in the [finetune](finetune/) directory. After setting up the environment, you can run a finetuning task with:
|
105 |
+
|
106 |
+
```
|
107 |
+
bash finetune/smi_ted_light/esol/run_finetune_esol.sh
|
108 |
+
```
|
109 |
+
|
110 |
+
Finetuning training/checkpointing resources will be available in directories named `checkpoint_<measure_name>`.
|
111 |
+
|
112 |
+
## Feature Extraction
|
113 |
+
|
114 |
+
The example notebook [smi_ted_encoder_decoder_example.ipynb](notebooks/smi_ted_encoder_decoder_example.ipynb) contains code to load checkpoint files and use the pre-trained model for encoder and decoder tasks. It also includes examples of classification and regression tasks. For model weights: [HuggingFace Link](https://huggingface.co/ibm/materials.smi-ted)
|
115 |
+
|
116 |
+
To load smi-ted, you can simply use:
|
117 |
+
|
118 |
+
```python
|
119 |
+
model = load_smi_ted(
|
120 |
+
folder='../inference/smi_ted_light',
|
121 |
+
ckpt_filename='smi_ted_light.pt'
|
122 |
+
)
|
123 |
+
```
|
124 |
+
|
125 |
+
To encode SMILES into embeddings, you can use:
|
126 |
+
|
127 |
+
```python
|
128 |
+
with torch.no_grad():
|
129 |
+
encoded_embeddings = model.encode(df['SMILES'], return_torch=True)
|
130 |
+
```
|
131 |
+
To decode, use the `decode` function, which converts embeddings back into SMILES strings:
|
132 |
+
|
133 |
+
```python
|
134 |
+
with torch.no_grad():
|
135 |
+
decoded_smiles = model.decode(encoded_embeddings)
|
136 |
+
```
|
137 |
+
|
138 |
+
|
models/smi_ted/finetune/args.py
ADDED
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
|
3 |
+
|
4 |
+
def _parse_fast_dev_run(value):
    """Convert the ``--fast_dev_run`` command-line string to a bool or an int.

    Without a converter argparse would hand back the raw string, and any
    non-empty string (including "False" and "0") is truthy.

    Args:
        value: Raw command-line text.

    Returns:
        ``True``/``False`` for true/false/yes/no (case-insensitive),
        otherwise the text parsed as an ``int``.

    Raises:
        argparse.ArgumentTypeError: If the text is neither a boolean word
            nor an integer.
    """
    text = value.strip().lower()
    if text in ("true", "yes"):
        return True
    if text in ("false", "no"):
        return False
    try:
        return int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"expected an integer or true/false, got {value!r}"
        ) from None


def get_parser(parser=None):
    """Build the command-line parser shared by the finetuning scripts.

    Args:
        parser: Optional existing ``argparse.ArgumentParser`` to extend.
            A new parser is created when ``None``.

    Returns:
        The parser with every option registered. ``--checkpoints_folder``
        is the only required option.
    """
    if parser is None:
        parser = argparse.ArgumentParser()

    # Model
    parser.add_argument("--n_head", type=int, default=8, help="GPT number of heads")
    parser.add_argument("--n_layer", type=int, default=12, help="GPT number of layers")
    parser.add_argument(
        "--q_dropout", type=float, default=0.5, help="Encoder layers dropout"
    )
    parser.add_argument(
        "--d_dropout", type=float, default=0.1, help="Decoder layers dropout"
    )
    parser.add_argument(
        "--n_embd", type=int, default=768, help="Latent vector dimensionality"
    )
    parser.add_argument(
        "--fc_h", type=int, default=512, help="Fully connected hidden dimensionality"
    )
    parser.add_argument("--n_output", type=int, default=1)

    # Train
    parser.add_argument("--n_batch", type=int, default=512, help="Batch size")
    parser.add_argument(
        "--unlike_alpha", type=float, default=1.0, help="unlikelihood loss alpha weight"
    )
    parser.add_argument(
        "--from_scratch",
        action="store_true",
        help="train on qm9 from scratch",
    )
    parser.add_argument(
        "--unlikelihood",
        action="store_true",
        help="use unlikelihood loss with gpt pretrain",
    )
    parser.add_argument(
        "--grad_acc",
        type=int,
        default=1,
        help="number of batches to accumulate gradients",
    )
    parser.add_argument(
        "--checkpoint_every",
        type=int,
        default=1000,
        help="save checkpoint every x iterations",
    )
    parser.add_argument(
        "--clip_grad", type=int, default=50, help="Clip gradients to this value"
    )
    parser.add_argument(
        "--lr_start", type=float, default=3 * 1e-4, help="Initial lr value"
    )
    parser.add_argument(
        "--lr_end", type=float, default=3 * 1e-4, help="Maximum lr weight value"
    )
    parser.add_argument(
        "--lr_multiplier", type=int, default=1, help="lr weight multiplier"
    )
    parser.add_argument(
        "--n_last", type=int, default=1000, help="Number of iters to smooth loss calc"
    )
    parser.add_argument("--n_jobs", type=int, default=1, help="Number of threads")
    parser.add_argument(
        "--accelerator",
        type=str,
        default="ddp",
        help="The accelerator backend to use (previously known as distributed_backend)",
    )
    parser.add_argument(
        "--num_nodes",
        type=int,
        default=1,
        help="number of GPU nodes for distributed training",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        help='Device to run: "cpu" or "cuda:<device number>"',
    )
    parser.add_argument("--seed", type=int, default=12345, help="Seed")
    parser.add_argument(
        "--init_params_from",
        type=str,
        default="",
        help="Path to a ckpt used to initialize the parameters if no restart_path is provided",
    )
    parser.add_argument(
        "--train_decoder_every",
        type=int,
        default=10,
        help="Optimize decoder params every n batches",
    )
    parser.add_argument(
        "--lr_decoder", type=float, default=1e-4, help="Learning rate for decoder part"
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on gpus",
    )
    parser.add_argument("--gpu", default=None, type=int, help="GPU id to use.")
    # argparse exposes this one as ``args.dist_backend`` (dash -> underscore).
    parser.add_argument(
        "--dist-backend", default="nccl", type=str, help="distributed backend"
    )
    parser.add_argument(
        "--tensorboard_path", default="./runs/deepspeed", help="tensorboard log dir"
    )

    # Common
    parser.add_argument("--vocab_load", type=str, help="Where to load the vocab")
    parser.add_argument("--n_samples", type=int, help="Number of samples to sample")
    parser.add_argument(
        "--gen_save", type=str, help="Where to save the gen molecules"
    )
    parser.add_argument("--max_len", type=int, default=100, help="Max length of SMILES")
    parser.add_argument("--train_load", type=str, help="Where to load the model")
    parser.add_argument("--val_load", type=str, help="Where to load the model")
    parser.add_argument(
        "--n_workers",
        type=int,
        default=1,
        help="Number of workers",
    )

    # Beam search hyper parameters
    parser.add_argument(
        "--beam_size", type=int, default=0, help="Number of beams to generate"
    )
    parser.add_argument(
        "--num_seq_returned",
        type=int,
        default=0,
        help="number of beams to be returned (must be <= beam_size)",
    )
    parser.add_argument(
        "--min_len", type=int, default=1, help="minimum length to be generated"
    )
    parser.add_argument(
        "--nucleus_thresh", type=float, default=0.9, help="nucleus sampling threshold"
    )
    parser.add_argument(
        "--finetune_path",
        type=str,
        default="",
        help="path to trainer file to continue training",
    )
    parser.add_argument(
        "--restart_path",
        type=str,
        default="",
        help="path to trainer file to continue training",
    )
    parser.add_argument(
        "--data_path", type=str, default="", help="path to pubchem file"
    )
    parser.add_argument(
        "--pretext_size", type=int, default=0, help="number of k-mers to pretext"
    )
    parser.add_argument(
        "--model_save_dir",
        type=str,
        default="./models_dump/",
        help="Where to save the models/log/config/vocab",
    )
    parser.add_argument(
        "--model_save",
        type=str,
        default="model.pt",
        help="Where to save the model",
    )
    parser.add_argument(
        "--num_epoch", type=int, default=1, help="number of epochs to train"
    )
    parser.add_argument("--log_file", type=str, help="Where to save the log")
    parser.add_argument(
        "--tb_loc",
        type=str,
        help="Where to save the tensorflow location",
    )
    parser.add_argument("--config_save", type=str, help="Where to save the config")
    parser.add_argument("--vocab_save", type=str, help="Where to save the vocab")

    # Resume
    parser.add_argument(
        "--debug",
        action="store_true",
        help="do not erase cache at end of program",
    )
    # Parsed explicitly so that "--fast_dev_run false" / "--fast_dev_run 0"
    # are falsy instead of arriving as truthy strings.
    parser.add_argument(
        "--fast_dev_run",
        type=_parse_fast_dev_run,
        default=False,
        help="This flag runs a “unit test” by running n if set to n (int) else 1 if set to True training and validation batch(es).",
    )
    parser.add_argument(
        "--freeze_model",
        action="store_true",
        help="freeze weights of bert model during fine tuning",
    )
    parser.add_argument(
        "--resume", action="store_true", help="Resume from a saved model"
    )
    parser.add_argument(
        "--rotate",
        action="store_true",
        help="use rotational relative embedding",
    )
    parser.add_argument("--model_load", type=str, help="Where to load the model")
    parser.add_argument(
        "--root_dir", type=str, default=".", help="location of root dir"
    )
    parser.add_argument("--config_load", type=str, help="Where to load the config")
    parser.add_argument(
        "--gpus", type=int, default=1, help="number of gpus to use"
    )
    parser.add_argument(
        "--model_arch",
        type=str,
        help="used to track model arch in params",
    )
    parser.add_argument(
        "--eval_every",
        type=int,
        default=50000,
        help="run evaluation every x iterations",
    )
    parser.add_argument(
        "--num_feats",
        type=int,
        default=32,
        help="number of random features for FAVOR+",
    )
    parser.add_argument(
        "--max_epochs", type=int, default=1, help="max number of epochs"
    )

    # Fine tuning
    parser.add_argument(
        "--mode", type=str, default="cls", help="type of pooling to use"
    )
    parser.add_argument("--dataset_length", type=int, default=None)
    parser.add_argument("--num_workers", type=int, default=0)
    parser.add_argument("--dropout", type=float, default=0.1)
    # NOTE(review): this default is an absolute path on a specific filesystem;
    # callers elsewhere will need to override it.
    parser.add_argument(
        "--smiles_embedding",
        type=str,
        default="/dccstor/medscan7/smallmolecule/runs/ba-predictor/small-data/embeddings/protein/ba_embeddings_tanh_512_2986138_2.pt",
    )
    parser.add_argument("--dataset_name", type=str, default="sol")
    parser.add_argument("--measure_name", type=str, default="measure")
    parser.add_argument("--checkpoints_folder", type=str, required=True)
    parser.add_argument("--model_path", type=str, default="./smi_ted/")
    parser.add_argument("--ckpt_filename", type=str, default="smi_ted_Light_40.pt")
    parser.add_argument("--restart_filename", type=str, default="")
    parser.add_argument("--save_every_epoch", type=int, default=0)
    parser.add_argument("--save_ckpt", type=int, default=1)
    parser.add_argument("--start_seed", type=int, default=0)
    parser.add_argument("--smi_ted_version", type=str, default="v1")
    parser.add_argument("--train_decoder", type=int, default=1)
    parser.add_argument("--target_metric", type=str, default="rmse")
    parser.add_argument("--loss_fn", type=str, default="mae")

    # NOTE(review): same as --smiles_embedding, a site-specific absolute path.
    parser.add_argument(
        "--data_root",
        type=str,
        default="/dccstor/medscan7/smallmolecule/runs/ba-predictor/small-data/affinity",
    )
    parser.add_argument("--use_linear", type=int, default=0)

    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--batch_size", type=int, default=64)

    return parser
|
332 |
+
|
333 |
+
|
334 |
+
def parse_args():
    """Parse the process command line with the parser from ``get_parser``.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    return get_parser().parse_args()
|
models/smi_ted/finetune/finetune_classification.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Deep learning
|
2 |
+
import torch
|
3 |
+
import torch.nn as nn
|
4 |
+
from torch import optim
|
5 |
+
from trainers import TrainerClassifier
|
6 |
+
from utils import get_optim_groups
|
7 |
+
|
8 |
+
# Data
|
9 |
+
import pandas as pd
|
10 |
+
import numpy as np
|
11 |
+
|
12 |
+
# Standard library
|
13 |
+
import args
|
14 |
+
import os
|
15 |
+
|
16 |
+
|
17 |
+
def main(config):
    """Finetune SMI-TED on a single-target classification dataset.

    Reads ``train.csv``, ``valid.csv`` and ``test.csv`` from
    ``config.data_root``, loads the checkpoint named by ``config.model_path``
    and ``config.ckpt_filename``, then trains and evaluates through
    ``TrainerClassifier``.

    Args:
        config: Parsed arguments (see ``args.get_parser``).

    Raises:
        ValueError: If ``config.loss_fn`` is not ``'crossentropy'`` or
            ``config.smi_ted_version`` is not ``'v1'`` or ``'v2'``.
    """
    # Fail fast on unsupported options. Previously these surfaced as an
    # UnboundLocalError only after the data and model had been loaded.
    if config.loss_fn != 'crossentropy':
        raise ValueError(
            f"Unsupported loss_fn {config.loss_fn!r} for classification; "
            "expected 'crossentropy'."
        )
    if config.smi_ted_version not in ('v1', 'v2'):
        raise ValueError(
            f"Unsupported smi_ted_version {config.smi_ted_version!r}; "
            "expected 'v1' or 'v2'."
        )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # load dataset
    df_train = pd.read_csv(f"{config.data_root}/train.csv")
    df_valid = pd.read_csv(f"{config.data_root}/valid.csv")
    df_test = pd.read_csv(f"{config.data_root}/test.csv")

    # load model: 'v1' uses the smi_ted_light package, 'v2' uses smi_ted_large
    if config.smi_ted_version == 'v1':
        from smi_ted_light.load import load_smi_ted
    else:
        from smi_ted_large.load import load_smi_ted

    model = load_smi_ted(folder=config.model_path, ckpt_filename=config.ckpt_filename, n_output=config.n_output, eval=False)
    # Applies the model's own initializer to the `net` submodule
    # (presumably the prediction head - TODO confirm in load.py).
    model.net.apply(model._init_weights)
    print(model.net)

    # The learning rate comes from --lr_start/--lr_multiplier; --lr is not used here.
    lr = config.lr_start*config.lr_multiplier
    optim_groups = get_optim_groups(model, keep_decoder=bool(config.train_decoder))
    loss_function = nn.CrossEntropyLoss()

    # init trainer (batch size is taken from --n_batch, not --batch_size)
    trainer = TrainerClassifier(
        raw_data=(df_train, df_valid, df_test),
        dataset_name=config.dataset_name,
        target=config.measure_name,
        batch_size=config.n_batch,
        hparams=config,
        target_metric=config.target_metric,
        seed=config.start_seed,
        smi_ted_version=config.smi_ted_version,
        checkpoints_folder=config.checkpoints_folder,
        restart_filename=config.restart_filename,
        device=device,
        save_every_epoch=bool(config.save_every_epoch),
        save_ckpt=bool(config.save_ckpt)
    )
    trainer.compile(
        model=model,
        optimizer=optim.AdamW(optim_groups, lr=lr, betas=(0.9, 0.99)),
        loss_fn=loss_function
    )
    trainer.fit(max_epochs=config.max_epochs)
    trainer.evaluate()


if __name__ == '__main__':
    parser = args.get_parser()
    config = parser.parse_args()
    main(config)
|
models/smi_ted/finetune/finetune_classification_multitask.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Deep learning
|
2 |
+
import torch
|
3 |
+
import torch.nn as nn
|
4 |
+
from torch import optim
|
5 |
+
from trainers import TrainerClassifierMultitask
|
6 |
+
from utils import get_optim_groups
|
7 |
+
|
8 |
+
# Data
|
9 |
+
import pandas as pd
|
10 |
+
import numpy as np
|
11 |
+
|
12 |
+
# Standard library
|
13 |
+
import args
|
14 |
+
import os
|
15 |
+
|
16 |
+
|
17 |
+
# Target columns for each supported multitask dataset, keyed by --dataset_name.
_MULTITASK_TARGETS = {
    'tox21': [
        'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
        'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'
    ],
    'clintox': ['FDA_APPROVED', 'CT_TOX'],
    'sider': [
        'Hepatobiliary disorders', 'Metabolism and nutrition disorders',
        'Product issues', 'Eye disorders', 'Investigations',
        'Musculoskeletal and connective tissue disorders',
        'Gastrointestinal disorders', 'Social circumstances',
        'Immune system disorders', 'Reproductive system and breast disorders',
        'Neoplasms benign, malignant and unspecified (incl cysts and polyps)',
        'General disorders and administration site conditions',
        'Endocrine disorders', 'Surgical and medical procedures',
        'Vascular disorders', 'Blood and lymphatic system disorders',
        'Skin and subcutaneous tissue disorders',
        'Congenital, familial and genetic disorders', 'Infections and infestations',
        'Respiratory, thoracic and mediastinal disorders', 'Psychiatric disorders',
        'Renal and urinary disorders',
        'Pregnancy, puerperium and perinatal conditions',
        'Ear and labyrinth disorders', 'Cardiac disorders',
        'Nervous system disorders', 'Injury, poisoning and procedural complications'
    ],
    'muv': [
        'MUV-466', 'MUV-548', 'MUV-600', 'MUV-644', 'MUV-652', 'MUV-689',
        'MUV-692', 'MUV-712', 'MUV-713', 'MUV-733', 'MUV-737', 'MUV-810',
        'MUV-832', 'MUV-846', 'MUV-852', 'MUV-858', 'MUV-859'
    ],
}


def main(config):
    """Finetune SMI-TED on a multitask classification dataset.

    Picks the target columns for ``config.dataset_name``, reads
    ``train.csv``, ``valid.csv`` and ``test.csv`` from ``config.data_root``,
    loads the checkpoint with one output per target, then trains and
    evaluates through ``TrainerClassifierMultitask``.

    Args:
        config: Parsed arguments (see ``args.get_parser``).
            ``config.n_output`` is overwritten with the number of targets.

    Raises:
        ValueError: If ``config.dataset_name`` is not one of the supported
            datasets, ``config.loss_fn`` is not ``'bceloss'``, or
            ``config.smi_ted_version`` is not ``'v1'`` or ``'v2'``.
    """
    # Fail fast on unsupported options. Previously these surfaced as an
    # UnboundLocalError (for `targets`, `load_smi_ted` or `loss_function`).
    if config.dataset_name not in _MULTITASK_TARGETS:
        raise ValueError(
            f"Unsupported dataset_name {config.dataset_name!r}; "
            f"expected one of {sorted(_MULTITASK_TARGETS)}."
        )
    if config.loss_fn != 'bceloss':
        raise ValueError(
            f"Unsupported loss_fn {config.loss_fn!r} for multitask "
            "classification; expected 'bceloss'."
        )
    if config.smi_ted_version not in ('v1', 'v2'):
        raise ValueError(
            f"Unsupported smi_ted_version {config.smi_ted_version!r}; "
            "expected 'v1' or 'v2'."
        )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Define Target and Causal Features
    targets = list(_MULTITASK_TARGETS[config.dataset_name])
    config.n_output = len(targets)

    # load dataset
    df_train = pd.read_csv(f"{config.data_root}/train.csv")
    df_valid = pd.read_csv(f"{config.data_root}/valid.csv")
    df_test = pd.read_csv(f"{config.data_root}/test.csv")

    # load model: 'v1' uses the smi_ted_light package, 'v2' uses smi_ted_large
    if config.smi_ted_version == 'v1':
        from smi_ted_light.load import load_smi_ted
    else:
        from smi_ted_large.load import load_smi_ted

    model = load_smi_ted(folder=config.model_path, ckpt_filename=config.ckpt_filename, n_output=len(targets), eval=False)
    # Applies the model's own initializer to the `net` submodule
    # (presumably the prediction head - TODO confirm in load.py).
    model.net.apply(model._init_weights)
    print(model.net)

    # The learning rate comes from --lr_start/--lr_multiplier; --lr is not used here.
    lr = config.lr_start*config.lr_multiplier
    optim_groups = get_optim_groups(model, keep_decoder=bool(config.train_decoder))
    loss_function = nn.BCELoss()

    # init trainer (batch size is taken from --n_batch, not --batch_size)
    trainer = TrainerClassifierMultitask(
        raw_data=(df_train, df_valid, df_test),
        dataset_name=config.dataset_name,
        target=targets,
        batch_size=config.n_batch,
        hparams=config,
        target_metric=config.target_metric,
        seed=config.start_seed,
        smi_ted_version=config.smi_ted_version,
        checkpoints_folder=config.checkpoints_folder,
        restart_filename=config.restart_filename,
        device=device,
        save_every_epoch=bool(config.save_every_epoch),
        save_ckpt=bool(config.save_ckpt)
    )
    trainer.compile(
        model=model,
        optimizer=optim.AdamW(optim_groups, lr=lr, betas=(0.9, 0.99)),
        loss_fn=loss_function
    )
    trainer.fit(max_epochs=config.max_epochs)
    trainer.evaluate()


if __name__ == '__main__':
    parser = args.get_parser()
    config = parser.parse_args()
    main(config)
|
models/smi_ted/finetune/finetune_regression.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Deep learning
|
2 |
+
import torch
|
3 |
+
import torch.nn as nn
|
4 |
+
from torch import optim
|
5 |
+
from trainers import TrainerRegressor
|
6 |
+
from utils import RMSELoss, get_optim_groups
|
7 |
+
|
8 |
+
# Data
|
9 |
+
import pandas as pd
|
10 |
+
import numpy as np
|
11 |
+
|
12 |
+
# Standard library
|
13 |
+
import args
|
14 |
+
import os
|
15 |
+
|
16 |
+
|
17 |
+
def main(config):
    """Finetune SMI-TED on a regression dataset.

    Reads ``train.csv``, ``valid.csv`` and ``test.csv`` from
    ``config.data_root``, loads the checkpoint named by ``config.model_path``
    and ``config.ckpt_filename``, then trains and evaluates through
    ``TrainerRegressor``.

    Args:
        config: Parsed arguments (see ``args.get_parser``).

    Raises:
        ValueError: If ``config.loss_fn`` is not ``'rmse'`` or ``'mae'``, or
            ``config.smi_ted_version`` is not ``'v1'`` or ``'v2'``.
    """
    # Fail fast on unsupported options. Previously these surfaced as an
    # UnboundLocalError only after the data and model had been loaded.
    if config.loss_fn not in ('rmse', 'mae'):
        raise ValueError(
            f"Unsupported loss_fn {config.loss_fn!r} for regression; "
            "expected 'rmse' or 'mae'."
        )
    if config.smi_ted_version not in ('v1', 'v2'):
        raise ValueError(
            f"Unsupported smi_ted_version {config.smi_ted_version!r}; "
            "expected 'v1' or 'v2'."
        )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # load dataset
    df_train = pd.read_csv(f"{config.data_root}/train.csv")
    df_valid = pd.read_csv(f"{config.data_root}/valid.csv")
    df_test = pd.read_csv(f"{config.data_root}/test.csv")

    # load model: 'v1' uses the smi_ted_light package, 'v2' uses smi_ted_large
    if config.smi_ted_version == 'v1':
        from smi_ted_light.load import load_smi_ted
    else:
        from smi_ted_large.load import load_smi_ted

    model = load_smi_ted(folder=config.model_path, ckpt_filename=config.ckpt_filename, n_output=config.n_output, eval=False)
    # Applies the model's own initializer to the `net` submodule
    # (presumably the prediction head - TODO confirm in load.py).
    model.net.apply(model._init_weights)
    print(model.net)

    # The learning rate comes from --lr_start/--lr_multiplier; --lr is not used here.
    lr = config.lr_start*config.lr_multiplier
    optim_groups = get_optim_groups(model, keep_decoder=bool(config.train_decoder))
    if config.loss_fn == 'rmse':
        loss_function = RMSELoss()
    else:
        loss_function = nn.L1Loss()

    # init trainer (batch size is taken from --n_batch, not --batch_size)
    trainer = TrainerRegressor(
        raw_data=(df_train, df_valid, df_test),
        dataset_name=config.dataset_name,
        target=config.measure_name,
        batch_size=config.n_batch,
        hparams=config,
        target_metric=config.target_metric,
        seed=config.start_seed,
        smi_ted_version=config.smi_ted_version,
        checkpoints_folder=config.checkpoints_folder,
        restart_filename=config.restart_filename,
        device=device,
        save_every_epoch=bool(config.save_every_epoch),
        save_ckpt=bool(config.save_ckpt)
    )
    trainer.compile(
        model=model,
        optimizer=optim.AdamW(optim_groups, lr=lr, betas=(0.9, 0.99)),
        loss_fn=loss_function
    )
    trainer.fit(max_epochs=config.max_epochs)
    trainer.evaluate()


if __name__ == '__main__':
    parser = args.get_parser()
    config = parser.parse_args()
    main(config)
|
models/smi_ted/finetune/moleculenet/bace/test.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e3af97c680375dd09349c63b4779b35166212302e79e4fc7a1752ef5d71cf35b
|
3 |
+
size 400436
|
models/smi_ted/finetune/moleculenet/bace/train.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b5b3426e84dc7e2f40f2cf9d15d4d38328126c07f49c215cfb4fb657f69200de
|
3 |
+
size 3109699
|
models/smi_ted/finetune/moleculenet/bace/valid.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:813c8f2af5a1058568cf60b7021b8b2cd818a17944afd0b09f9d838e36ee985d
|
3 |
+
size 397085
|
models/smi_ted/finetune/moleculenet/bbbp/test.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cca4161c44535fd0f8ff917cc68d26703da7fbce19ddecb7dc5f7ae4b4d241a6
|
3 |
+
size 14874
|
models/smi_ted/finetune/moleculenet/bbbp/train.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7300807bf21ea1177efd81c218e43275ed00b6c3006b5dae7625f774edb6b1a6
|
3 |
+
size 115549
|
models/smi_ted/finetune/moleculenet/bbbp/valid.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:af39cc3735a356010a072e1e196a64eca6e0d88f0b2a023d4dc1adba7030ce40
|
3 |
+
size 15655
|
models/smi_ted/finetune/moleculenet/biodegradability/biodeg_example.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c98992c1c22ae7468a41fb7bc86c775ccc30fa29e50053bb148ffc2f2d95551e
|
3 |
+
size 6352
|
models/smi_ted/finetune/moleculenet/biodegradability/biodegradability.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8ec61887444a0e8925b16cca48433c3b3bff1ac5cf08f448d6b64bbdbc14a318
|
3 |
+
size 416181
|
models/smi_ted/finetune/moleculenet/biodegradability/test.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:86c2f7f39add0fff77358454c0f1b289a233e4a78d50b7f005ec2dc1c632d473
|
3 |
+
size 84488
|
models/smi_ted/finetune/moleculenet/biodegradability/train.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1a4a94ae0f8c134ce10f2d853eced84d031a4e7b394662344a9141e7567b3eb2
|
3 |
+
size 252230
|
models/smi_ted/finetune/moleculenet/biodegradability/valid.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:09e827ee7e55544f5b327d5e2ef2d9fe09e3f62024e1316b6e71d1fc9be275a1
|
3 |
+
size 85290
|
models/smi_ted/finetune/moleculenet/clintox/test.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:963a05e8eeaaa38fd3688f448dfc28cd0917ea280b1b9cb5b4297244f7f68fe2
|
3 |
+
size 10219
|
models/smi_ted/finetune/moleculenet/clintox/train.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:04bbee4a0d7fb4942292c9581f318909d06508d529a4a3a76590e6749417c1a7
|
3 |
+
size 74357
|
models/smi_ted/finetune/moleculenet/clintox/valid.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f3e2b9ab566ffc184c0590002bfbd6a42e6522209e6d6271968262844dde2905
|
3 |
+
size 10255
|
models/smi_ted/finetune/moleculenet/esol/test.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7da41a7eab447fdfd163292b4a5eb8ef09a747fc82b0f1cc5c468e46b1b2ef5a
|
3 |
+
size 9999
|
models/smi_ted/finetune/moleculenet/esol/train.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:784ba31de05a43ecab98260c94a47e2c807f4d65c0f93d9a88fbd962515976c5
|
3 |
+
size 77154
|
models/smi_ted/finetune/moleculenet/esol/valid.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bc30e7fa1f774e27ed56de7cfd77e21f07a5a2c38fcc6d928c0084a9a99181e5
|
3 |
+
size 9892
|
models/smi_ted/finetune/moleculenet/freesolv/test.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c8212c391ccbff3722a11d1bd3752b3a9dd187f2a7b33f8b9d2d594950b188d7
|
3 |
+
size 3223
|
models/smi_ted/finetune/moleculenet/freesolv/train.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f3b781e5d03dbd7d272347288161f92e8e66c628da50e3e2bc06de12225de22d
|
3 |
+
size 25053
|
models/smi_ted/finetune/moleculenet/freesolv/valid.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3b35d9c13a02291eefe85bd4b048ccc28f5326a3b018beb937aba12067b072d2
|
3 |
+
size 3151
|
models/smi_ted/finetune/moleculenet/hiv/test.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6e86ca708a331966f6e7b06621a2e221a9f6ce45f0141e6cbe919fd64ec50fc7
|
3 |
+
size 213176
|
models/smi_ted/finetune/moleculenet/hiv/train.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c289700d093d7ccbe55a583ad5cb3a670df931a19283ea66880413ed398358ff
|
3 |
+
size 1685863
|