{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", "import logging\n", "\n", "sys.path.append(\"..\")\n", "\n", "# Set logging level to warning\n", "logging.basicConfig(level=logging.WARNING)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.10/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", " from pandas import MultiIndex, Int64Index\n" ] } ], "source": [ "import protac_degradation_predictor as pdp\n", "\n", "import pandas as pd\n", "from tqdm.notebook import tqdm" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Uniprot | \n", "Cell Line Identifier | \n", "Smiles | \n", "E3 Ligase | \n", "DC50 (nM) | \n", "Dmax (%) | \n", "Active | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
1 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
2 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
3 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
4 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "53.0 | \n", "100.0 | \n", "True | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
2136 | \n", "O60885 | \n", "HEK293 | \n", "Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O... | \n", "VHL | \n", "63.1 | \n", "NaN | \n", "NaN | \n", "
2137 | \n", "Q05397 | \n", "A549 Cas9 | \n", "CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCCOCCOCCO... | \n", "VHL | \n", "125.9 | \n", "NaN | \n", "NaN | \n", "
2138 | \n", "Q05397 | \n", "A549 Cas9 | \n", "CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCCOCC(=O)... | \n", "VHL | \n", "158.5 | \n", "NaN | \n", "NaN | \n", "
2139 | \n", "Q05397 | \n", "A549 Cas9 | \n", "CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCC(=O)N[C... | \n", "VHL | \n", "31.6 | \n", "NaN | \n", "NaN | \n", "
2140 | \n", "Q05397 | \n", "A549 Cas9 | \n", "CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCC(=O)N[C@H... | \n", "VHL | \n", "39.8 | \n", "NaN | \n", "NaN | \n", "
2141 rows × 7 columns
\n", "\n", " | Uniprot | \n", "Cell Line Identifier | \n", "Smiles | \n", "E3 Ligase | \n", "DC50 (nM) | \n", "Dmax (%) | \n", "Active | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
1 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
2 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
3 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
2136 | \n", "O60885 | \n", "HEK293 | \n", "Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O... | \n", "VHL | \n", "63.1 | \n", "NaN | \n", "NaN | \n", "
2137 | \n", "Q05397 | \n", "A549 Cas9 | \n", "CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCCOCCOCCO... | \n", "VHL | \n", "125.9 | \n", "NaN | \n", "NaN | \n", "
2138 | \n", "Q05397 | \n", "A549 Cas9 | \n", "CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCCOCC(=O)... | \n", "VHL | \n", "158.5 | \n", "NaN | \n", "NaN | \n", "
2139 | \n", "Q05397 | \n", "A549 Cas9 | \n", "CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCC(=O)N[C... | \n", "VHL | \n", "31.6 | \n", "NaN | \n", "NaN | \n", "
2140 | \n", "Q05397 | \n", "A549 Cas9 | \n", "CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCC(=O)N[C@H... | \n", "VHL | \n", "39.8 | \n", "NaN | \n", "NaN | \n", "
1284 rows × 7 columns
\n", "\n", " | Uniprot | \n", "Cell Line Identifier | \n", "Smiles | \n", "E3 Ligase | \n", "mean_active_prob | \n", "majority_vote_active | \n", "
---|---|---|---|---|---|---|
0 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "0.661548 | \n", "True | \n", "
1 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "0.664419 | \n", "True | \n", "
2 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "0.671933 | \n", "True | \n", "
3 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "0.662499 | \n", "True | \n", "
5 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "0.649880 | \n", "True | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
2136 | \n", "O60885 | \n", "HEK293 | \n", "Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O... | \n", "VHL | \n", "0.596114 | \n", "True | \n", "
2137 | \n", "Q05397 | \n", "A549 Cas9 | \n", "CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCCOCCOCCO... | \n", "VHL | \n", "0.652442 | \n", "True | \n", "
2138 | \n", "Q05397 | \n", "A549 Cas9 | \n", "CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCCOCC(=O)... | \n", "VHL | \n", "0.645811 | \n", "True | \n", "
2139 | \n", "Q05397 | \n", "A549 Cas9 | \n", "CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCC(=O)N[C... | \n", "VHL | \n", "0.667549 | \n", "True | \n", "
2140 | \n", "Q05397 | \n", "A549 Cas9 | \n", "CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCC(=O)N[C@H... | \n", "VHL | \n", "0.654636 | \n", "True | \n", "
1284 rows × 6 columns
\n", "\n", " | Dmax (%) | \n", "pDC50 | \n", "Uniprot | \n", "Cell Line Identifier | \n", "Smiles | \n", "E3 Ligase | \n", "mean_active_prob | \n", "majority_vote_active | \n", "
---|---|---|---|---|---|---|---|---|
143 | \n", "NaN | \n", "6.000000 | \n", "Q9UHD2 | \n", "NaN | \n", "Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O... | \n", "VHL | \n", "0.881667 | \n", "True | \n", "
144 | \n", "NaN | \n", "6.000000 | \n", "Q9UHD2 | \n", "NaN | \n", "Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O... | \n", "VHL | \n", "0.862472 | \n", "True | \n", "
145 | \n", "NaN | \n", "6.000000 | \n", "Q9UHD2 | \n", "NaN | \n", "Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O... | \n", "VHL | \n", "0.854805 | \n", "True | \n", "
146 | \n", "NaN | \n", "6.000000 | \n", "Q9UHD2 | \n", "NaN | \n", "Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O... | \n", "VHL | \n", "0.853683 | \n", "True | \n", "
147 | \n", "NaN | \n", "6.000000 | \n", "Q9UHD2 | \n", "NaN | \n", "Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O... | \n", "VHL | \n", "0.844864 | \n", "True | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1865 | \n", "61.0 | \n", "NaN | \n", "Q13547 | \n", "HCT116-53BPI(+/-) | \n", "Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O... | \n", "VHL | \n", "0.153555 | \n", "False | \n", "
2019 | \n", "NaN | \n", "6.920819 | \n", "P40763 | \n", "MOLM-16 | \n", "NC(=O)CC[C@H](NC(=O)[C@@H]1CC[C@@H]2CCN(C(=O)C... | \n", "CRBN | \n", "0.140929 | \n", "False | \n", "
2020 | \n", "NaN | \n", "7.769551 | \n", "P40763 | \n", "SU-DHL-1 | \n", "NC(=O)CC[C@H](NC(=O)[C@@H]1CC[C@@H]2CCN(C(=O)C... | \n", "CRBN | \n", "0.135795 | \n", "False | \n", "
2096 | \n", "NaN | \n", "6.153663 | \n", "Q9BY41 | \n", "Jurkat | \n", "O=C1CCC(N2C(=O)c3cccc(NCCCCCCCCCCCNC(=O)c4cccc... | \n", "CRBN | \n", "0.162395 | \n", "False | \n", "
2116 | \n", "NaN | \n", "8.105130 | \n", "P10636 | \n", "NaN | \n", "CC(=O)N1CCCN(c2nc(NCCc3cccs3)nc(N(C)CC(=O)NCCO... | \n", "VHL | \n", "0.141287 | \n", "False | \n", "
154 rows × 8 columns
\n", "