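# NOTE(review): the next six lines look like web-page header residue captured
# together with the file rather than program text; kept as comments so the
# module parses.
# Saicharan
# improvements
# c36a10c
# raw
# history blame
# 6.47 kB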
# Third-party dependencies: pandas reads the CSV, evaluate supplies the metric.
import evaluate
import pandas as pd

# Read the CSV from the hard-coded path below.
df = pd.read_csv('/Users/saicharan/chembl_10000.csv')

# Load the metric identified by "saicharan2804/molgenevalmetric".
molgenevalmetric = evaluate.load("saicharan2804/molgenevalmetric")

# Pull the SMILES column out as a plain Python list.
ls = df['SMILES'].tolist()

# The first 5000 entries are passed as the "generated" set and the following
# 5000 as the "training" set.
ls_gen = ls[:5000]
ls_train = ls[5000:10000]

print('computing')
results = molgenevalmetric.compute(gensmi=ls_gen, trainsmi=ls_train)
print(results)
# import evaluate
# from evaluate.utils import launch_gradio_widget
# import gradio as gr
# module = evaluate.load("saicharan2804/molgenevalmetric")
# # launch_gradio_widget(module)
# iface = gr.Interface(
# fn = module,
# inputs=[
# gr.File(label="Generated SMILES"),
# gr.File(label="Training Data", value=None),
# ],
# outputs="text"
# )
# iface.launch()
# import pandas as pd
# df = pd.read_csv('/home/saicharan/Downloads/chembl.csv')
# df = df.rename(columns={'canonical_smiles': 'SMILES'})
# df = df[0:10000]
# print(df[['SMILES']].to_csv('/home/saicharan/Downloads/chembl_10000.csv'))
# from SCScore import SCScorer
# '''
# This is a standalone, importable SCScorer model. It does not have tensorflow as a
# dependency and is a more attractive option for deployment. The calculations are
# fast enough that there is no real reason to use GPUs (via tf) instead of CPUs (via np)
# '''
# import numpy as np
# import time
# import rdkit.Chem as Chem
# import rdkit.Chem.AllChem as AllChem
# import json
# import gzip
# import six
# import os
# project_root = os.path.dirname(os.path.dirname(__file__))
# score_scale = 5.0
# min_separation = 0.25
# FP_len = 1024
# FP_rad = 2
# def sigmoid(x):
# return 1 / (1 + np.exp(-x))
# class SCScorer():
# def __init__(self, score_scale=score_scale):
# self.vars = []
# self.score_scale = score_scale
# self._restored = False
# def restore(self, weight_path=os.path.join('model.ckpt-10654.as_numpy.json.gz'), FP_rad=FP_rad, FP_len=FP_len):
# self.FP_len = FP_len; self.FP_rad = FP_rad
# self._load_vars(weight_path)
# # print('Restored variables from {}'.format(weight_path))
# if 'uint8' in weight_path or 'counts' in weight_path:
# def mol_to_fp(self, mol):
# if mol is None:
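# # NOTE(review): np.array((n,)) builds a 1-element array holding n, not n zeros;
# # the non-count branch below uses np.zeros -- confirm before re-enabling this code.
# return np.array((self.FP_len,), dtype=np.uint8)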
# fp = AllChem.GetMorganFingerprint(mol, self.FP_rad, useChirality=True) # UIntSparseIntVect
# fp_folded = np.zeros((self.FP_len,), dtype=np.uint8)
# for k, v in six.iteritems(fp.GetNonzeroElements()):
# fp_folded[k % self.FP_len] += v
# return np.array(fp_folded)
# else:
# def mol_to_fp(self, mol):
# if mol is None:
# return np.zeros((self.FP_len,), dtype=np.float32)
# return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, self.FP_rad, nBits=self.FP_len,
# useChirality=True), dtype=np.bool_)
# self.mol_to_fp = mol_to_fp
# self._restored = True
# return self
# def smi_to_fp(self, smi):
# if not smi:
# return np.zeros((self.FP_len,), dtype=np.float32)
# return self.mol_to_fp(self, Chem.MolFromSmiles(smi))
# def apply(self, x):
# if not self._restored:
# raise ValueError('Must restore model weights!')
# # Each pair of vars is a weight and bias term
# for i in range(0, len(self.vars), 2):
# last_layer = (i == len(self.vars)-2)
# W = self.vars[i]
# b = self.vars[i+1]
# x = np.matmul(x, W) + b
# if not last_layer:
# x = x * (x > 0) # ReLU
# x = 1 + (score_scale - 1) * sigmoid(x)
# return x
# def get_score_from_smi(self, smi='', v=False):
# if not smi:
# return ('', 0.)
# fp = np.array((self.smi_to_fp(smi)), dtype=np.float32)
# if sum(fp) == 0:
# if v: print('Could not get fingerprint?')
# cur_score = 0.
# else:
# # Run
# cur_score = self.apply(fp)
# if v: print('Score: {}'.format(cur_score))
# mol = Chem.MolFromSmiles(smi)
# if mol:
# smi = Chem.MolToSmiles(mol, isomericSmiles=True, kekuleSmiles=True)
# else:
# smi = ''
# return (smi, cur_score)
# def get_avg_score(self, smis):
# """
# Compute the average score for a list of SMILES strings.
# Args:
# smis (list of str): A list of SMILES strings.
# Returns:
# float: The average score of the given SMILES strings.
# """
# if not smis: # Check if the list is empty
# return 0.0
# total_score = 0.0
# valid_smiles_count = 0
# for smi in smis:
# _, score = self.get_score_from_smi(smi)
# if score > 0: # Assuming only positive scores are valid
# total_score += score
# valid_smiles_count += 1
# # Avoid division by zero
# if valid_smiles_count == 0:
# return 0.0
# else:
# return total_score / valid_smiles_count
# def _load_vars(self, weight_path):
# if weight_path.endswith('pickle'):
# import pickle
# with open(weight_path, 'rb') as fid:
# self.vars = pickle.load(fid)
# self.vars = [x.tolist() for x in self.vars]
# elif weight_path.endswith('json.gz'):
# with gzip.GzipFile(weight_path, 'r') as fin: # 4. gzip
# json_bytes = fin.read() # 3. bytes (i.e. UTF-8)
# json_str = json_bytes.decode('utf-8') # 2. string (i.e. JSON)
# self.vars = json.loads(json_str)
# self.vars = [np.array(x) for x in self.vars]
# from myscscore.SCScore import SCScorer
# import pandas as pd
# model = SCScorer()
# model.restore()
# # import evaluate
# # molgenevalmetric = evaluate.load("saicharan2804/molgenevalmetric")
# df = pd.read_csv('/home/saicharan/Downloads/chembl_10000.csv')
# ls= df['SMILES'].tolist()
# ls_gen = ls[0:5000]
# ls_train = ls[5000:10000]
# print('computing')
# average_score = model.get_avg_score(ls_gen)
# # Print the average score
# print('Average score:', average_score)
# # print(molgenevalmetric.compute(gensmi = ls_gen, trainsmi = ls_train))