{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wWSzEAjavzrb", "outputId": "3c8987e4-5537-46ed-fc4b-0988e03b376c" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: bunch in /usr/local/lib/python3.8/dist-packages (1.0.1)\n" ] } ], "source": [ "\n", "pip install bunch" ] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PXp_V--04b4N", "outputId": "35b31fa0-1cde-4ddc-b82d-af62805a5c76" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "7mgq8uVbyW4d" }, "source": [ "## Utility Codes" ] }, { "cell_type": "markdown", "metadata": { "id": "oyGzbJOiyau5" }, "source": [ "### Parameter Setting" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "FSYvCcgkvxtK" }, "outputs": [], "source": [ "import os\n", "import time\n", "import json\n", "from bunch import Bunch\n", "\n", "import pandas as pd\n", "\n", "df= pd.read_csv('vocab.csv')\n", "vocab = df['Smiles'].tolist()\n", "\n", "def get_config_from_json(json_file):\n", " with open(json_file, 'r') as config_file:\n", " config_dict = json.load(config_file)\n", " config = Bunch(config_dict)\n", " return config\n", "\n", "\n", "def process_config(json_file):\n", " config = get_config_from_json(json_file)\n", " config.config_file = json_file\n", " config.exp_dir = os.path.join(\n", " 'experiments', time.strftime('%Y-%m-%d/', time.localtime()),\n", " config.exp_name)\n", " config.tensorboard_log_dir = os.path.join(\n", " 'experiments', time.strftime('%Y-%m-%d/', time.localtime()),\n", " config.exp_name, 'logs/')\n", " config.checkpoint_dir = os.path.join(\n", " 'experiments', time.strftime('%Y-%m-%d/', time.localtime()),\n", " config.exp_name, 'checkpoints/')\n", " return config" ] }, { "cell_type": "markdown", "metadata": { "id": "tJp7vxoyyhej" }, "source": [ "### Creating Directory" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Bx9GeQ9qwEXy" }, "outputs": [], "source": [ "import os\n", "import sys\n", "\n", "\n", "def create_dirs(dirs):\n", " try:\n", " for dir_ in dirs:\n", " if not os.path.exists(dir_):\n", " os.makedirs(dir_)\n", " except Exception as err:\n", " print(f'Creating directories error: {err}')\n", " sys.exit()" ] }, { "cell_type": "markdown", "metadata": { "id": "7ME9uPedQj00" }, "source": [ "## Building Function Vocabulary" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "5MokMxOEQiaO", "outputId": "d4eb662d-400e-41f5-ed34-f1278ab6520b" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Unnamed: 0 Len Functional grp name \\\n", "0 0 7 Acid anhydrides \n", "1 1 8 Diacyl peroxides \n", "2 2 15 Phenyl esters \n", "3 3 5 Isocyanates \n", "4 4 8 β-Lactams \n", "... ... ... ... \n", "1240 1240 12 SA46 \n", "1241 1241 11 SA47 \n", "1242 1242 19 SA48 \n", "1243 1243 15 SA49 \n", "1244 1244 6 Primary amines (strict) \n", "\n", " SMARTS Smiles \\\n", "0 [CX3](=[OX1])[OX2][CX3](=[OX1]) O=COC=O \n", "1 [CX3](=[OX1])[OX2][OX2][CX3](=[OX1]) O=COOC=O \n", "2 [cR1]1([OX2][CX3](=[OX1])[#6])[cR1][cR1][cR1][... CC(=O)Oc1ccccc1 \n", "3 [NX2]=[CX2]=[OX1] N=C=O \n", "4 [NX3]1[CX4][CX4][CX3]1(=[OX1]) O=C1CCN1 \n", "... ... ... \n", "1240 [#6]-1-2-[#6]=[#6]-[#6](-[#6]-1)-[#6]-[#6]-2 C1=CC2CCC1C2 \n", "1241 [#6]-1-2-[#6]-[#6]-[#6](-[#6]-1)-[#6]-[#6]-2 C1CC2CCC1C2 \n", "1242 c1(c(cccc1)-[#8]-c2ccccc2)-[Br] Brc1ccccc1Oc1ccccc1 \n", "1243 c1-2c(cccc1)-[#6]-[#6]-[#6]-2 c1ccc2c(c1)CCC2 \n", "1244 [NX3H2][CX4&!$([CX4]([NH2])[O,N,S,P])] C[NH2] \n", "\n", " Validity \n", "0 Valid \n", "1 Valid \n", "2 Valid \n", "3 Valid \n", "4 Valid \n", "... ... \n", "1240 Valid \n", "1241 Valid \n", "1242 Valid \n", "1243 Valid \n", "1244 Valid \n", "\n", "[1245 rows x 6 columns]" ], "text/html": [ "\n", "
\n", " | Unnamed: 0 | \n", "Len | \n", "Functional grp name | \n", "SMARTS | \n", "Smiles | \n", "Validity | \n", "
---|---|---|---|---|---|---|
0 | \n", "0 | \n", "7 | \n", "Acid anhydrides | \n", "[CX3](=[OX1])[OX2][CX3](=[OX1]) | \n", "O=COC=O | \n", "Valid | \n", "
1 | \n", "1 | \n", "8 | \n", "Diacyl peroxides | \n", "[CX3](=[OX1])[OX2][OX2][CX3](=[OX1]) | \n", "O=COOC=O | \n", "Valid | \n", "
2 | \n", "2 | \n", "15 | \n", "Phenyl esters | \n", "[cR1]1([OX2][CX3](=[OX1])[#6])[cR1][cR1][cR1][... | \n", "CC(=O)Oc1ccccc1 | \n", "Valid | \n", "
3 | \n", "3 | \n", "5 | \n", "Isocyanates | \n", "[NX2]=[CX2]=[OX1] | \n", "N=C=O | \n", "Valid | \n", "
4 | \n", "4 | \n", "8 | \n", "β-Lactams | \n", "[NX3]1[CX4][CX4][CX3]1(=[OX1]) | \n", "O=C1CCN1 | \n", "Valid | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1240 | \n", "1240 | \n", "12 | \n", "SA46 | \n", "[#6]-1-2-[#6]=[#6]-[#6](-[#6]-1)-[#6]-[#6]-2 | \n", "C1=CC2CCC1C2 | \n", "Valid | \n", "
1241 | \n", "1241 | \n", "11 | \n", "SA47 | \n", "[#6]-1-2-[#6]-[#6]-[#6](-[#6]-1)-[#6]-[#6]-2 | \n", "C1CC2CCC1C2 | \n", "Valid | \n", "
1242 | \n", "1242 | \n", "19 | \n", "SA48 | \n", "c1(c(cccc1)-[#8]-c2ccccc2)-[Br] | \n", "Brc1ccccc1Oc1ccccc1 | \n", "Valid | \n", "
1243 | \n", "1243 | \n", "15 | \n", "SA49 | \n", "c1-2c(cccc1)-[#6]-[#6]-[#6]-2 | \n", "c1ccc2c(c1)CCC2 | \n", "Valid | \n", "
1244 | \n", "1244 | \n", "6 | \n", "Primary amines (strict) | \n", "[NX3H2][CX4&!$([CX4]([NH2])[O,N,S,P])] | \n", "C[NH2] | \n", "Valid | \n", "
1245 rows × 6 columns
\n", "