# materials.smi-TED - INFERENCE (Regression)

In [None]:
# Install the extra third-party packages for this notebook (only needed once per environment).
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# xgboost provides the regressor used further down; seaborn is not imported in the
# cells visible here -- NOTE(review): confirm it is still needed.
%pip install seaborn xgboost

In [1]:
import sys

# Make the sibling `inference` directory importable so that the
# `smi_ted_light` package can be imported below. The path is relative, so the
# notebook must be run from its own directory. The membership check keeps the
# cell idempotent: re-running it does not pile up duplicate sys.path entries.
if '../inference' not in sys.path:
    sys.path.append('../inference')

In [2]:
# materials.smi-ted (smi-ted)
from smi_ted_light.load import load_smi_ted

# Data
import torch
import pandas as pd
import numpy as np

# Chemistry
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
PandasTools.RenderImagesInAllDataFrames(True)

In [3]:
# function to canonicalize SMILES
def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return a normalized SMILES string for ``smi``, or ``None`` on failure.

    Args:
        smi: SMILES string to normalize.
        canonical: Forwarded to ``Chem.MolToSmiles`` as ``canonical``.
        isomeric: Forwarded to ``Chem.MolToSmiles`` as ``isomericSmiles``.

    Returns:
        The SMILES string produced by RDKit, or ``None`` when ``smi`` cannot
        be parsed or converted (for example an invalid string or a
        non-string value such as NaN).
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        # An unparseable SMILES gives no molecule; report it explicitly
        # instead of relying on MolToSmiles failing on a None argument.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=isomeric)
    except Exception:
        # Best-effort: any conversion error maps to None so callers can drop
        # the row. `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long `.apply` can still be interrupted.
        return None

### Import smi-ted

In [4]:
# Load the smi-ted Light model from the checkpoint file in the local inference folder.
model_smi_ted = load_smi_ted(folder='../inference/smi_ted_light', ckpt_filename='smi-ted-Light_40.pt')

Random Seed: 12345
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Vocab size: 2393
[INFERENCE MODE - smi-ted-Light]


## Lipophilicity Dataset

### Experiments - Data Load

In [5]:
# Read the lipophilicity train and test splits (train first, then test).
df_train, df_test = map(
    pd.read_csv,
    (
        "../finetune/moleculenet/lipophilicity/train.csv",
        "../finetune/moleculenet/lipophilicity/test.csv",
    ),
)

### SMILES canonicalization

In [6]:
# Add a normalized-SMILES column; normalize_smiles returns None for strings it cannot convert.
df_train['norm_smiles'] = df_train['smiles'].apply(func=normalize_smiles)

# Keep only complete rows: a missing value in any column removes the row.
df_train_normalized = df_train.dropna(axis=0, how='any')

print(df_train_normalized.shape)
df_train_normalized.head(5)

(3360, 3)


Unnamed: 0,smiles,y,norm_smiles
0,Nc1ncnc2c1c(COc3cccc(Cl)c3)nn2C4CCOCC4,0.814313,Nc1ncnc2c1c(COc1cccc(Cl)c1)nn2C1CCOCC1
1,COc1cc(cc2cnc(Nc3ccc(cc3)[C@@H](C)NC(=O)C)nc12...,0.446346,COc1cc(-c2ccncc2)cc2cnc(Nc3ccc(C(C)NC(C)=O)cc3...
2,CC(=O)Nc1ccc2ccn(c3cc(Nc4ccn(C)n4)n5ncc(C#N)c5...,1.148828,CC(=O)Nc1ccc2ccn(-c3cc(Nc4ccn(C)n4)n4ncc(C#N)c...
3,Oc1ccc(CCNCCS(=O)(=O)CCCOCCSc2ccccc2)c3sc(O)nc13,0.404532,O=S(=O)(CCCOCCSc1ccccc1)CCNCCc1ccc(O)c2nc(O)sc12
4,Clc1ccc2C(=O)C3=C(Nc2c1)C(=O)NN(Cc4cc5ccccc5s4...,-0.164144,O=c1[nH]n(Cc2cc3ccccc3s2)c(=O)c2c(=O)c3ccc(Cl)...


In [7]:
# Add a normalized-SMILES column; normalize_smiles returns None for strings it cannot convert.
df_test['norm_smiles'] = df_test['smiles'].apply(func=normalize_smiles)

# Keep only complete rows: a missing value in any column removes the row.
df_test_normalized = df_test.dropna(axis=0, how='any')

print(df_test_normalized.shape)
df_test_normalized.head(5)

(420, 3)


Unnamed: 0,smiles,y,norm_smiles
0,N(c1ccccc1)c2ccnc3ccccc23,0.488161,c1ccc(Nc2ccnc3ccccc23)cc1
1,Clc1ccc2Oc3ccccc3N=C(N4CCNCC4)c2c1,0.070017,Clc1ccc2c(c1)C(N1CCNCC1)=Nc1ccccc1O2
2,NC1(CCC1)c2ccc(cc2)c3ncc4cccnc4c3c5ccccc5,-0.41503,NC1(c2ccc(-c3ncc4cccnc4c3-c3ccccc3)cc2)CCC1
3,OC[C@H](O)CN1C(=O)[C@@H](Cc2ccccc12)NC(=O)c3cc...,0.897942,O=C(NC1Cc2ccccc2N(CC(O)CO)C1=O)c1cc2cc(Cl)sc2[...
4,NS(=O)(=O)c1nc2ccccc2s1,-0.707731,NS(=O)(=O)c1nc2ccccc2s1


### Embeddings extraction 

#### smi-ted embeddings extraction

In [8]:
# Encode the normalized training SMILES with smi-ted. Gradient tracking is
# switched off because the embeddings are only used as input features below.
with torch.no_grad():
    df_embeddings_train = model_smi_ted.encode(
        df_train_normalized.loc[:, 'norm_smiles']
    )
df_embeddings_train.head(5)

100%|██████████| 33/33 [00:38<00:00, 1.15s/it]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.367646,-0.504889,0.040485,0.385314,0.564923,-0.684497,1.160397,0.071218,0.799428,0.181323,...,-1.379994,-0.167221,0.104886,0.239571,-0.74439,0.590423,-0.808946,0.792584,0.550898,-0.176831
1,0.455316,-0.485554,0.062206,0.387994,0.56759,-0.713285,1.144267,-0.057046,0.753016,0.11218,...,-1.332142,-0.096662,0.221944,0.327923,-0.739358,0.659803,-0.775723,0.745837,0.56633,-0.111946
2,0.442309,-0.484732,0.084945,0.384787,0.564752,-0.70413,1.159491,0.021168,0.846539,0.118463,...,-1.324177,-0.110403,0.207824,0.281665,-0.780818,0.693484,-0.832626,0.763095,0.53246,-0.196708
3,0.527961,-0.519151,0.091635,0.353518,0.421795,-0.72422,1.093752,0.148574,0.804047,0.194627,...,-1.358414,-0.111483,0.151692,0.186741,-0.601867,0.641591,-0.747422,0.794239,0.640765,-0.239649
4,0.464432,-0.51109,0.038785,0.346217,0.492919,-0.619387,1.048157,0.09591,0.738604,0.11927,...,-1.223927,-0.109863,0.15128,0.244834,-0.68661,0.759327,-0.756338,0.766427,0.610454,-0.197345


In [9]:
# Encode the normalized test SMILES with smi-ted. Gradient tracking is
# switched off because the embeddings are only used as input features below.
with torch.no_grad():
    df_embeddings_test = model_smi_ted.encode(
        df_test_normalized.loc[:, 'norm_smiles']
    )
df_embeddings_test.head(5)

100%|██████████| 4/4 [00:05<00:00, 1.46s/it]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.392252,-0.504846,0.056791,0.356297,0.475918,-0.648899,1.157862,-0.022914,0.70324,0.192023,...,-1.208714,-0.094441,0.128845,0.403995,-0.782782,0.541907,-0.707272,0.901041,0.629461,-0.02063
1,0.387422,-0.481142,0.049675,0.353058,0.60117,-0.646099,1.142392,0.060092,0.763799,0.110331,...,-1.248282,-0.13979,0.075585,0.202242,-0.729794,0.705914,-0.771751,0.843173,0.61885,-0.213584
2,0.390975,-0.510056,0.070656,0.380695,0.601486,-0.595827,1.182193,0.011085,0.688093,0.056453,...,-1.294595,-0.164846,0.194435,0.240742,-0.773443,0.608631,-0.747181,0.791911,0.611874,-0.125455
3,0.423924,-0.557325,0.08381,0.328703,0.399589,-0.622818,1.079945,0.097611,0.72403,0.135976,...,-1.41206,-0.106541,0.153314,0.209962,-0.69969,0.648061,-0.716241,0.757986,0.615963,-0.258693
4,0.335576,-0.559591,0.119437,0.364141,0.375474,-0.639833,1.144707,0.077512,0.791759,0.164201,...,-1.279041,-0.186733,0.106963,0.254949,-0.651694,0.594167,-0.680426,0.887482,0.651587,-0.144996


### Experiments - Lipophilicity prediction using smi-ted latent spaces

#### XGBoost prediction using the whole Latent Space

In [10]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [11]:
# Fit a gradient-boosted regressor on the full smi-ted embedding of each
# training molecule, with the dataset's `y` column as the target.
xgb_predict = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.05,
    max_depth=4,
)
xgb_predict.fit(X=df_embeddings_train, y=df_train_normalized['y'])

In [12]:
# Predict the target for every test molecule from its smi-ted embedding.
y_pred = xgb_predict.predict(X=df_embeddings_test)

In [13]:
# Root-mean-squared error between the held-out targets and the predictions.
mse = mean_squared_error(y_true=df_test_normalized["y"], y_pred=y_pred)
rmse = np.sqrt(mse)
print(f"RMSE Score: {rmse:.4f}")

RMSE Score: 0.6485
