Spaces:
Sleeping
Sleeping
import evaluate | |
import datasets | |
# import moses | |
from moses import metrics | |
import pandas as pd | |
from tdc import Evaluator | |
from tdc import Oracle | |
_DESCRIPTION = """ | |
Comprehensive suite of metrics designed to assess the performance of molecular generation models, for understanding how well a model can produce novel, chemically valid molecules that are relevant to specific research objectives. | |
""" | |
_KWARGS_DESCRIPTION = """ | |
Args: | |
generated_smiles (`list` of `string`): A collection of SMILES (Simplified Molecular Input Line Entry System) strings generated by the model, ideally encompassing more than 30,000 samples. | |
train_smiles (`list` of `string`): The dataset of SMILES strings used to train the model, serving as a reference to evaluate the novelty and diversity of the generated molecules. | |
Returns: | |
Dectionary item containing various metrics to evaluate model performance | |
""" | |
_CITATION = """ | |
@article{DBLP:journals/corr/abs-1811-12823, | |
author = {Daniil Polykovskiy and | |
Alexander Zhebrak and | |
Benjam{\'{\i}}n S{\'{a}}nchez{-}Lengeling and | |
Sergey Golovanov and | |
Oktai Tatanov and | |
Stanislav Belyaev and | |
Rauf Kurbanov and | |
Aleksey Artamonov and | |
Vladimir Aladinskiy and | |
Mark Veselov and | |
Artur Kadurin and | |
Sergey I. Nikolenko and | |
Al{\'{a}}n Aspuru{-}Guzik and | |
Alex Zhavoronkov}, | |
title = {Molecular Sets {(MOSES):} {A} Benchmarking Platform for Molecular | |
Generation Models}, | |
journal = {CoRR}, | |
volume = {abs/1811.12823}, | |
year = {2018}, | |
url = {http://arxiv.org/abs/1811.12823}, | |
eprinttype = {arXiv}, | |
eprint = {1811.12823}, | |
timestamp = {Fri, 26 Nov 2021 15:34:30 +0100}, | |
biburl = {https://dblp.org/rec/journals/corr/abs-1811-12823.bib}, | |
bibsource = {dblp computer science bibliography, https://dblp.org} | |
} | |
""" | |
class molgenevalmetric(evaluate.Metric):
    """Metric module for scoring molecules produced by a generative model.

    The report merges three sources: the dictionary returned by
    ``moses.metrics.get_all_metrics``, the TDC ``KL_Divergence`` evaluator,
    and the scores of a fixed list of TDC oracles.
    """

    # Oracles queried on every call, in the order they are added to the report.
    _ORACLE_NAMES = (
        "QED", "SA", "MPO", "GSK3B", "JNK3",
        "DRD2", "LogP", "Rediscovery", "Similarity",
        "Median", "Isomers", "Valsartan_SMARTS", "Hop",
    )

    # Oracles whose result is averaged per key when it comes back as a dict
    # (presumably TDC oracle groups that cover several targets -- TODO confirm).
    _GROUPED_ORACLES = frozenset(
        {"Rediscovery", "MPO", "Similarity", "Median", "Isomers", "Hop"}
    )

    # Entries removed from the MOSES output before the report is returned.
    _DROPPED_KEYS = ("FCD/TestSF", "SNN/TestSF", "Frag/TestSF", "Scaf/TestSF")

    def _info(self):
        """Return the ``evaluate.MetricInfo`` describing this metric.

        Both inputs are declared as a sequence of strings per example when the
        module is loaded with the ``"multilabel"`` config name, and as a single
        string per example otherwise.
        """
        if self.config_name == "multilabel":
            feature_spec = {
                "generated_smiles": datasets.Sequence(datasets.Value("string")),
                "train_smiles": datasets.Sequence(datasets.Value("string")),
            }
        else:
            feature_spec = {
                "generated_smiles": datasets.Value("string"),
                "train_smiles": datasets.Value("string"),
            }

        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(feature_spec),
            reference_urls=[
                "https://github.com/molecularsets/moses",
                "https://tdcommons.ai/functions/oracles/",
            ],
        )

    def _compute(self, generated_smiles, train_smiles=None):
        """Compute the full metric report for a set of generated molecules.

        Args:
            generated_smiles: SMILES strings produced by the model.
            train_smiles: SMILES strings the model was trained on. When
                ``None``, the ``KL_Divergence`` entry is omitted from the
                report instead of handing ``None`` to the evaluator.

        Returns:
            A dict with the single key ``"results"`` mapping metric names to
            values. Oracle scores returned as a list are reduced to their
            mean; for the grouped oracles a dict result is reduced to the
            mean of each entry.
        """
        results = metrics.get_all_metrics(gen=generated_smiles, train=train_smiles)

        # The evaluator compares the generated set against the training set,
        # so it can only run when a training set was actually supplied.
        if train_smiles is not None:
            kl_evaluator = Evaluator(name="KL_Divergence")
            results["KL_Divergence"] = kl_evaluator(generated_smiles, train_smiles)

        for oracle_name in self._ORACLE_NAMES:
            # Query each oracle exactly once, then reduce per-molecule scores.
            score = Oracle(name=oracle_name)(generated_smiles)
            if oracle_name in self._GROUPED_ORACLES:
                if isinstance(score, dict):
                    score = {
                        key: sum(values) / len(values)
                        for key, values in score.items()
                    }
            elif isinstance(score, list):
                score = sum(score) / len(score)
            results[oracle_name] = score

        # Missing keys are ignored so a MOSES version without them still works.
        for key in self._DROPPED_KEYS:
            results.pop(key, None)

        return {"results": results}