{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "4058fc54", "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "execution": { "iopub.execute_input": "2023-07-08T12:37:04.657833Z", "iopub.status.busy": "2023-07-08T12:37:04.657370Z", "iopub.status.idle": "2023-07-08T12:37:04.671182Z", "shell.execute_reply": "2023-07-08T12:37:04.669941Z" }, "papermill": { "duration": 0.028753, "end_time": "2023-07-08T12:37:04.673965", "exception": false, "start_time": "2023-07-08T12:37:04.645212", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv\n", "/kaggle/input/icr-identify-age-related-conditions/greeks.csv\n", "/kaggle/input/icr-identify-age-related-conditions/train.csv\n", "/kaggle/input/icr-identify-age-related-conditions/test.csv\n" ] } ], "source": [ "# This Python 3 environment comes with many helpful analytics libraries installed\n", "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n", "# For example, here's several helpful packages to load\n", "\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "\n", "# Input data files are available in the read-only \"../input/\" directory\n", "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", "\n", "import os\n", "for dirname, _, filenames in os.walk('/kaggle/input'):\n", " for filename in filenames:\n", " print(os.path.join(dirname, filename))\n", "\n", "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n", "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session" ] }, { "cell_type": "markdown", "id": "3b574567", "metadata": { "papermill": { "duration": 0.008144, "end_time": "2023-07-08T12:37:04.690217", "exception": false, "start_time": "2023-07-08T12:37:04.682073", "status": "completed" }, "tags": [] }, "source": [ "# 1. Import the Libraries" ] }, { "cell_type": "code", "execution_count": 2, "id": "82ba3941", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:04.706780Z", "iopub.status.busy": "2023-07-08T12:37:04.706409Z", "iopub.status.idle": "2023-07-08T12:37:11.980200Z", "shell.execute_reply": "2023-07-08T12:37:11.979425Z" }, "papermill": { "duration": 7.284593, "end_time": "2023-07-08T12:37:11.982471", "exception": false, "start_time": "2023-07-08T12:37:04.697878", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "import torch\n", "from torch import tensor\n", "from fastai.data.transforms import RandomSplitter\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.model_selection import StratifiedKFold, train_test_split\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder, StandardScaler\n", "import lightgbm as lgb\n", "from sklearn.metrics import log_loss, confusion_matrix, roc_curve, roc_auc_score\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.linear_model import LogisticRegression" ] }, { "cell_type": "code", "execution_count": 3, "id": "10789ecd", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.001266Z", "iopub.status.busy": "2023-07-08T12:37:12.000101Z", "iopub.status.idle": "2023-07-08T12:37:12.005279Z", "shell.execute_reply": "2023-07-08T12:37:12.004195Z" }, "papermill": { "duration": 0.016583, "end_time": "2023-07-08T12:37:12.007353", "exception": false, "start_time": "2023-07-08T12:37:11.990770", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.max_rows', None)" ] }, { "cell_type": "markdown", "id": "e8123568", "metadata": { "papermill": { "duration": 0.008009, "end_time": "2023-07-08T12:37:12.025078", "exception": false, "start_time": "2023-07-08T12:37:12.017069", "status": "completed" }, "tags": [] }, "source": [ "# 2. Load the Datasets" ] }, { "cell_type": "code", "execution_count": 4, "id": "255f98e5", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.044522Z", "iopub.status.busy": "2023-07-08T12:37:12.042918Z", "iopub.status.idle": "2023-07-08T12:37:12.098425Z", "shell.execute_reply": "2023-07-08T12:37:12.097103Z" }, "papermill": { "duration": 0.067882, "end_time": "2023-07-08T12:37:12.101115", "exception": false, "start_time": "2023-07-08T12:37:12.033233", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "train_df = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/train.csv\")\n", "test_df = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/test.csv\")\n", "greeks_df = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/greeks.csv\")\n", "sample_submission_df = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv\")" ] }, { "cell_type": "markdown", "id": "606bc79e", "metadata": { "papermill": { "duration": 0.00765, "end_time": "2023-07-08T12:37:12.116804", "exception": false, "start_time": "2023-07-08T12:37:12.109154", "status": "completed" }, "tags": [] }, "source": [ "# 3. View the Dataset" ] }, { "cell_type": "code", "execution_count": 5, "id": "acf04ce8", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.134314Z", "iopub.status.busy": "2023-07-08T12:37:12.133766Z", "iopub.status.idle": "2023-07-08T12:37:12.196768Z", "shell.execute_reply": "2023-07-08T12:37:12.195381Z" }, "papermill": { "duration": 0.074372, "end_time": "2023-07-08T12:37:12.198928", "exception": false, "start_time": "2023-07-08T12:37:12.124556", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdABAFAHAMARAXAYAZBCBDBNBPBQBRBZCBCCCDCFCHCLCRCSCUCWDADEDFDHDIDLDNDUDVDYEBEEEGEHEJELEPEUFCFDFEFIFLFRFSGBGEGFGHGIGLClass
0000ff2bfdfe90.2093773109.0332985.20014722.3944078.1386880.6998610.0255789.8122145.5556344126.5873122.5984175.638726152.707705823.928241257.43237747.2233580.56348123.3876004.8519150.0234821.0502250.06922513.7841111.30201236.20595669.08340295.5705750.238680.28423289.24556084.3166429.6571045.3106901.7430723.1877047.2941761.9872831433.1667500.949104B30.87942078.5269683.82838413.39464010.2650739028.2919213.5834507.2981621.738550.09482211.33913872.6110632003.81031922.13622969.8349440.1203431
1007255e476980.145282978.7641685.20014736.9688898.1386883.6321900.02557813.5177901.2299005496.9282419.4205155.86803014.75472051.216883257.43237730.2843450.48471050.6282086.0850410.0314421.1138751.11780028.3109531.35718237.47656870.79836178.5531000.238680.363489110.58181575.7454837.5320000.0055181.7430717.2223284.9263960.8586031111.2871500.003042A109.12515995.41508652.26048017.1759840.2968506785.00347410.3589270.1732290.497060.5689329.29269872.61106327981.56275029.13543032.13199621.9780000
2013f2bd269f50.4700302635.1065485.20014732.3605538.1386886.7328400.02557812.8245701.2299005135.7802426.4825128.988531219.320160482.141594257.43237732.5637130.49585285.9553765.3764880.0362181.0502250.70035039.3647431.00961121.45964470.81970321.4266250.238680.210441120.05643865.4698428.0534641.2897391.7430736.8613527.8136748.1466511494.0764880.377208B109.12515978.5269685.390628224.2074248.7452018338.90618111.6269177.7095600.975561.19882137.07777288.60943713676.95781028.02285135.1926760.1969410
3043ac50845d50.2521073819.65177120.20161877.1122038.1386883.6853440.02557811.0537081.2299004169.6773823.6577237.28226411.050410661.518640257.43237715.2019140.71788288.1593602.3476520.0290541.4003000.63607541.1169600.72272721.53039247.27586196.6079850.238680.292431139.82457071.5712024.3548562.6553451.7430752.0038847.3860603.81332615691.5521800.614484B31.67435778.52696831.32337259.3019847.88433610965.76604014.8520226.1221620.497060.28446618.52958482.4168032094.26245239.94865690.4932480.1558290
4044fb8a146ec0.3802973733.0484485.20014714.1037388.1386883.9422550.0548103.396778102.1519805728.7341224.0108324.546318149.7171656074.859475257.43237782.2134950.53646772.64426430.5377220.0254721.0502250.69315031.7247260.82755034.41536074.06532200.1781600.238680.20770897.92012052.8388826.0199121.1449021.743079.0648567.3507203.4908461403.6563000.164268B109.12515991.99482551.14133629.1026404.27464016198.04959013.6667278.15305848.501340.12191416.408728146.1099438524.37050245.38131636.2626280.0966141
\n", "
" ], "text/plain": [ " Id AB AF AH AM AR \\\n", "0 000ff2bfdfe9 0.209377 3109.03329 85.200147 22.394407 8.138688 \n", "1 007255e47698 0.145282 978.76416 85.200147 36.968889 8.138688 \n", "2 013f2bd269f5 0.470030 2635.10654 85.200147 32.360553 8.138688 \n", "3 043ac50845d5 0.252107 3819.65177 120.201618 77.112203 8.138688 \n", "4 044fb8a146ec 0.380297 3733.04844 85.200147 14.103738 8.138688 \n", "\n", " AX AY AZ BC BD BN BP \\\n", "0 0.699861 0.025578 9.812214 5.555634 4126.58731 22.5984 175.638726 \n", "1 3.632190 0.025578 13.517790 1.229900 5496.92824 19.4205 155.868030 \n", "2 6.732840 0.025578 12.824570 1.229900 5135.78024 26.4825 128.988531 \n", "3 3.685344 0.025578 11.053708 1.229900 4169.67738 23.6577 237.282264 \n", "4 3.942255 0.054810 3.396778 102.151980 5728.73412 24.0108 324.546318 \n", "\n", " BQ BR BZ CB CC CD \\\n", "0 152.707705 823.928241 257.432377 47.223358 0.563481 23.387600 \n", "1 14.754720 51.216883 257.432377 30.284345 0.484710 50.628208 \n", "2 219.320160 482.141594 257.432377 32.563713 0.495852 85.955376 \n", "3 11.050410 661.518640 257.432377 15.201914 0.717882 88.159360 \n", "4 149.717165 6074.859475 257.432377 82.213495 0.536467 72.644264 \n", "\n", " CF CH CL CR CS CU CW \\\n", "0 4.851915 0.023482 1.050225 0.069225 13.784111 1.302012 36.205956 \n", "1 6.085041 0.031442 1.113875 1.117800 28.310953 1.357182 37.476568 \n", "2 5.376488 0.036218 1.050225 0.700350 39.364743 1.009611 21.459644 \n", "3 2.347652 0.029054 1.400300 0.636075 41.116960 0.722727 21.530392 \n", "4 30.537722 0.025472 1.050225 0.693150 31.724726 0.827550 34.415360 \n", "\n", " DA DE DF DH DI DL DN \\\n", "0 69.08340 295.570575 0.23868 0.284232 89.245560 84.31664 29.657104 \n", "1 70.79836 178.553100 0.23868 0.363489 110.581815 75.74548 37.532000 \n", "2 70.81970 321.426625 0.23868 0.210441 120.056438 65.46984 28.053464 \n", "3 47.27586 196.607985 0.23868 0.292431 139.824570 71.57120 24.354856 \n", "4 74.06532 200.178160 0.23868 0.207708 97.920120 52.83888 26.019912 \n", "\n", " DU DV DY EB EE EG EH \\\n", "0 5.310690 1.74307 23.187704 7.294176 1.987283 1433.166750 0.949104 \n", "1 0.005518 1.74307 17.222328 4.926396 0.858603 1111.287150 0.003042 \n", "2 1.289739 1.74307 36.861352 7.813674 8.146651 1494.076488 0.377208 \n", "3 2.655345 1.74307 52.003884 7.386060 3.813326 15691.552180 0.614484 \n", "4 1.144902 1.74307 9.064856 7.350720 3.490846 1403.656300 0.164268 \n", "\n", " EJ EL EP EU FC FD FE \\\n", "0 B 30.879420 78.526968 3.828384 13.394640 10.265073 9028.291921 \n", "1 A 109.125159 95.415086 52.260480 17.175984 0.296850 6785.003474 \n", "2 B 109.125159 78.526968 5.390628 224.207424 8.745201 8338.906181 \n", "3 B 31.674357 78.526968 31.323372 59.301984 7.884336 10965.766040 \n", "4 B 109.125159 91.994825 51.141336 29.102640 4.274640 16198.049590 \n", "\n", " FI FL FR FS GB GE \\\n", "0 3.583450 7.298162 1.73855 0.094822 11.339138 72.611063 \n", "1 10.358927 0.173229 0.49706 0.568932 9.292698 72.611063 \n", "2 11.626917 7.709560 0.97556 1.198821 37.077772 88.609437 \n", "3 14.852022 6.122162 0.49706 0.284466 18.529584 82.416803 \n", "4 13.666727 8.153058 48.50134 0.121914 16.408728 146.109943 \n", "\n", " GF GH GI GL Class \n", "0 2003.810319 22.136229 69.834944 0.120343 1 \n", "1 27981.562750 29.135430 32.131996 21.978000 0 \n", "2 13676.957810 28.022851 35.192676 0.196941 0 \n", "3 2094.262452 39.948656 90.493248 0.155829 0 \n", "4 8524.370502 45.381316 36.262628 0.096614 1 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df.head()" ] }, { "cell_type": "markdown", "id": "d59c5362", "metadata": { "papermill": { "duration": 0.008915, "end_time": "2023-07-08T12:37:12.216891", "exception": false, "start_time": "2023-07-08T12:37:12.207976", "status": "completed" }, "tags": [] }, "source": [ "# 4. Desciptive Statistics of train dataset" ] }, { "cell_type": "code", "execution_count": 6, "id": "823c6173", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.235375Z", "iopub.status.busy": "2023-07-08T12:37:12.234988Z", "iopub.status.idle": "2023-07-08T12:37:12.381088Z", "shell.execute_reply": "2023-07-08T12:37:12.379852Z" }, "papermill": { "duration": 0.15796, "end_time": "2023-07-08T12:37:12.383149", "exception": false, "start_time": "2023-07-08T12:37:12.225189", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABAFAHAMARAXAYAZBCBDBNBPBQBRBZCBCCCDCFCHCLCRCSCUCWDADEDFDHDIDLDNDUDVDYEBEEEGEHELEPEUFCFDFEFIFLFRFSGBGEGFGHGIGLClass
count617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000557.000000617.000000617.000000615.000000614.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000616.000000617.000000617.000000617.000000617.000000617.000000617.000000557.000000617.000000617.000000616.000000617.000000617.000000617.000000616.000000617.000000615.000000617.000000617.000000617.000000617.000000617.000000616.000000617.000000
mean0.4771493502.013221118.62451338.96855210.1282425.5455760.06032010.5664478.0530125350.38865521.419492231.32222398.3287371218.133238550.63252577.1041510.68880190.25173511.2410640.0306151.4037610.74226236.9175901.38379227.16565351.128326401.9012990.6338840.367002146.97209994.79537726.3705681.8029001.92483026.3889899.0727003.0647781731.2482150.30510769.582596105.06071269.11700571.3415266.93008610306.81073710.1110795.4331993.5339050.42150120.724856131.71498714679.59539831.48971650.5844378.5309610.175041
std0.4683882300.322717127.83895069.72822610.5188772.5516960.4168174.35064565.1669433021.3266413.478278183.99250596.4793717575.2937072076.371275159.0493020.26399451.58513013.5711330.0148081.9222100.28119517.2663470.53871714.64599321.210888317.7456231.9123840.11298986.08441928.2431878.0388259.0347211.48455518.1166796.2002812.0583441790.2274761.84749938.55570768.445620390.187057165.55154564.75426211331.2940512.93402511.49625750.1819481.3053659.991907144.18152419352.9593879.86423936.26625110.3270100.380310
min0.081187192.59328085.2001473.1775228.1386880.6998610.0255783.3967781.2299001693.6243209.88680072.9489511.33115551.216883257.43237712.4997600.17687423.3876000.5108880.0031841.0502250.06922513.7841110.1379257.0306406.90640035.9988950.2386800.04099560.23247010.3456006.3394960.0055181.7430700.8040684.9263960.286201185.5941000.0030425.39467578.5269683.8283847.5341280.2968501563.1366883.5834500.1732290.4970600.0677304.10218272.61106313.0388949.4327350.8976280.0011290.000000
25%0.2521072197.34548085.20014712.2703148.1386884.1282940.0255788.1295801.2299004155.70287019.420500156.84723927.834425424.990642257.43237723.3175670.56368864.7241925.0663060.0234821.0502250.58957529.7824671.0702987.03064037.942520188.8156900.2386800.295164102.70355378.23224020.8882640.0055181.74307014.7157925.9653921.6486791111.1606250.00304230.92746878.5269684.32465625.8153840.2968505164.6662608.5230980.1732290.4970600.06773014.03671872.6110632798.99258425.03488823.0116840.1243920.000000
50%0.3546593120.31896085.20014720.5331108.1386885.0319120.02557810.4613201.2299004997.96073021.186000193.90881661.642115627.417402257.43237742.5543300.65871579.8191049.1230000.0278601.0502250.73080034.8351301.35166536.01910449.180940307.5095950.2386800.358023130.05063096.26496025.2488000.2517411.74307021.6424568.1494042.6161191493.8174130.08517671.94930678.52696822.64114436.3940081.8701557345.1434249.9454523.0281411.1310000.25060118.77143672.6110637838.27361030.60894641.0079680.3378270.000000
75%0.5597634361.637390113.73954039.1398868.1386886.4316340.03684512.9695165.0812446035.88570023.657700247.803462134.009015975.649259257.43237777.3100970.77220699.81352013.5659010.0344271.2284450.85935040.5294011.66061737.93583261.408760507.8962000.2386800.426348165.836955110.64068030.5442241.0586901.74307034.05834410.5030483.9100701905.7014750.237276109.125159112.76665449.08535256.7144484.88021410647.95165011.5166576.2388141.5120600.53506725.608406127.59167119035.70924036.86394767.93166421.9780000.000000
max6.16166628688.1876601910.123198630.518230178.94363438.27088010.31585138.9715681463.69344853060.59924029.3073002447.810550344.644105179250.25290050092.4593002271.4361674.103032633.534408200.9675260.22407431.6881533.039675267.9428234.95150764.521624210.3309202103.40519037.8950131.0604041049.168078326.23620062.808096161.35531525.192930152.35516494.95858018.32492630243.75878042.569748109.1251591063.5945786501.2644803030.6558241578.654237143224.68230035.851039137.9327391244.22702031.365763135.7812941497.351958143790.07120081.210825191.19476421.9780001.000000
\n", "
" ], "text/plain": [ " AB AF AH AM AR \\\n", "count 617.000000 617.000000 617.000000 617.000000 617.000000 \n", "mean 0.477149 3502.013221 118.624513 38.968552 10.128242 \n", "std 0.468388 2300.322717 127.838950 69.728226 10.518877 \n", "min 0.081187 192.593280 85.200147 3.177522 8.138688 \n", "25% 0.252107 2197.345480 85.200147 12.270314 8.138688 \n", "50% 0.354659 3120.318960 85.200147 20.533110 8.138688 \n", "75% 0.559763 4361.637390 113.739540 39.139886 8.138688 \n", "max 6.161666 28688.187660 1910.123198 630.518230 178.943634 \n", "\n", " AX AY AZ BC BD \\\n", "count 617.000000 617.000000 617.000000 617.000000 617.000000 \n", "mean 5.545576 0.060320 10.566447 8.053012 5350.388655 \n", "std 2.551696 0.416817 4.350645 65.166943 3021.326641 \n", "min 0.699861 0.025578 3.396778 1.229900 1693.624320 \n", "25% 4.128294 0.025578 8.129580 1.229900 4155.702870 \n", "50% 5.031912 0.025578 10.461320 1.229900 4997.960730 \n", "75% 6.431634 0.036845 12.969516 5.081244 6035.885700 \n", "max 38.270880 10.315851 38.971568 1463.693448 53060.599240 \n", "\n", " BN BP BQ BR BZ \\\n", "count 617.000000 617.000000 557.000000 617.000000 617.000000 \n", "mean 21.419492 231.322223 98.328737 1218.133238 550.632525 \n", "std 3.478278 183.992505 96.479371 7575.293707 2076.371275 \n", "min 9.886800 72.948951 1.331155 51.216883 257.432377 \n", "25% 19.420500 156.847239 27.834425 424.990642 257.432377 \n", "50% 21.186000 193.908816 61.642115 627.417402 257.432377 \n", "75% 23.657700 247.803462 134.009015 975.649259 257.432377 \n", "max 29.307300 2447.810550 344.644105 179250.252900 50092.459300 \n", "\n", " CB CC CD CF CH \\\n", "count 615.000000 614.000000 617.000000 617.000000 617.000000 \n", "mean 77.104151 0.688801 90.251735 11.241064 0.030615 \n", "std 159.049302 0.263994 51.585130 13.571133 0.014808 \n", "min 12.499760 0.176874 23.387600 0.510888 0.003184 \n", "25% 23.317567 0.563688 64.724192 5.066306 0.023482 \n", "50% 42.554330 0.658715 79.819104 9.123000 0.027860 \n", "75% 77.310097 0.772206 99.813520 13.565901 0.034427 \n", "max 2271.436167 4.103032 633.534408 200.967526 0.224074 \n", "\n", " CL CR CS CU CW DA \\\n", "count 617.000000 617.000000 617.000000 617.000000 617.000000 617.000000 \n", "mean 1.403761 0.742262 36.917590 1.383792 27.165653 51.128326 \n", "std 1.922210 0.281195 17.266347 0.538717 14.645993 21.210888 \n", "min 1.050225 0.069225 13.784111 0.137925 7.030640 6.906400 \n", "25% 1.050225 0.589575 29.782467 1.070298 7.030640 37.942520 \n", "50% 1.050225 0.730800 34.835130 1.351665 36.019104 49.180940 \n", "75% 1.228445 0.859350 40.529401 1.660617 37.935832 61.408760 \n", "max 31.688153 3.039675 267.942823 4.951507 64.521624 210.330920 \n", "\n", " DE DF DH DI DL \\\n", "count 617.000000 617.000000 617.000000 617.000000 617.000000 \n", "mean 401.901299 0.633884 0.367002 146.972099 94.795377 \n", "std 317.745623 1.912384 0.112989 86.084419 28.243187 \n", "min 35.998895 0.238680 0.040995 60.232470 10.345600 \n", "25% 188.815690 0.238680 0.295164 102.703553 78.232240 \n", "50% 307.509595 0.238680 0.358023 130.050630 96.264960 \n", "75% 507.896200 0.238680 0.426348 165.836955 110.640680 \n", "max 2103.405190 37.895013 1.060404 1049.168078 326.236200 \n", "\n", " DN DU DV DY EB EE \\\n", "count 617.000000 616.000000 617.000000 617.000000 617.000000 617.000000 \n", "mean 26.370568 1.802900 1.924830 26.388989 9.072700 3.064778 \n", "std 8.038825 9.034721 1.484555 18.116679 6.200281 2.058344 \n", "min 6.339496 0.005518 1.743070 0.804068 4.926396 0.286201 \n", "25% 20.888264 0.005518 1.743070 14.715792 5.965392 1.648679 \n", "50% 25.248800 0.251741 1.743070 21.642456 8.149404 2.616119 \n", "75% 30.544224 1.058690 1.743070 34.058344 10.503048 3.910070 \n", "max 62.808096 161.355315 25.192930 152.355164 94.958580 18.324926 \n", "\n", " EG EH EL EP EU \\\n", "count 617.000000 617.000000 557.000000 617.000000 617.000000 \n", "mean 1731.248215 0.305107 69.582596 105.060712 69.117005 \n", "std 1790.227476 1.847499 38.555707 68.445620 390.187057 \n", "min 185.594100 0.003042 5.394675 78.526968 3.828384 \n", "25% 1111.160625 0.003042 30.927468 78.526968 4.324656 \n", "50% 1493.817413 0.085176 71.949306 78.526968 22.641144 \n", "75% 1905.701475 0.237276 109.125159 112.766654 49.085352 \n", "max 30243.758780 42.569748 109.125159 1063.594578 6501.264480 \n", "\n", " FC FD FE FI FL \\\n", "count 616.000000 617.000000 617.000000 617.000000 616.000000 \n", "mean 71.341526 6.930086 10306.810737 10.111079 5.433199 \n", "std 165.551545 64.754262 11331.294051 2.934025 11.496257 \n", "min 7.534128 0.296850 1563.136688 3.583450 0.173229 \n", "25% 25.815384 0.296850 5164.666260 8.523098 0.173229 \n", "50% 36.394008 1.870155 7345.143424 9.945452 3.028141 \n", "75% 56.714448 4.880214 10647.951650 11.516657 6.238814 \n", "max 3030.655824 1578.654237 143224.682300 35.851039 137.932739 \n", "\n", " FR FS GB GE GF \\\n", "count 617.000000 615.000000 617.000000 617.000000 617.000000 \n", "mean 3.533905 0.421501 20.724856 131.714987 14679.595398 \n", "std 50.181948 1.305365 9.991907 144.181524 19352.959387 \n", "min 0.497060 0.067730 4.102182 72.611063 13.038894 \n", "25% 0.497060 0.067730 14.036718 72.611063 2798.992584 \n", "50% 1.131000 0.250601 18.771436 72.611063 7838.273610 \n", "75% 1.512060 0.535067 25.608406 127.591671 19035.709240 \n", "max 1244.227020 31.365763 135.781294 1497.351958 143790.071200 \n", "\n", " GH GI GL Class \n", "count 617.000000 617.000000 616.000000 617.000000 \n", "mean 31.489716 50.584437 8.530961 0.175041 \n", "std 9.864239 36.266251 10.327010 0.380310 \n", "min 9.432735 0.897628 0.001129 0.000000 \n", "25% 25.034888 23.011684 0.124392 0.000000 \n", "50% 30.608946 41.007968 0.337827 0.000000 \n", "75% 36.863947 67.931664 21.978000 0.000000 \n", "max 81.210825 191.194764 21.978000 1.000000 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df.describe()" ] }, { "cell_type": "markdown", "id": "83668022", "metadata": { "papermill": { "duration": 0.008702, "end_time": "2023-07-08T12:37:12.400982", "exception": false, "start_time": "2023-07-08T12:37:12.392280", "status": "completed" }, "tags": [] }, "source": [ "# 5. Encoding" ] }, { "cell_type": "code", "execution_count": 7, "id": "38227807", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.420754Z", "iopub.status.busy": "2023-07-08T12:37:12.420359Z", "iopub.status.idle": "2023-07-08T12:37:12.426175Z", "shell.execute_reply": "2023-07-08T12:37:12.425023Z" }, "papermill": { "duration": 0.018785, "end_time": "2023-07-08T12:37:12.428835", "exception": false, "start_time": "2023-07-08T12:37:12.410050", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "def encode(dataframe):\n", " le = LabelEncoder()\n", " obj = list(dataframe.loc[:, dataframe.dtypes == 'object'].columns)\n", " for i in obj:\n", " if i not in ['id', 'Epsilon']:\n", " dataframe[i] = le.fit_transform(dataframe[i])\n", " return dataframe" ] }, { "cell_type": "code", "execution_count": 8, "id": "0e25d090", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.448556Z", "iopub.status.busy": "2023-07-08T12:37:12.448219Z", "iopub.status.idle": "2023-07-08T12:37:12.461925Z", "shell.execute_reply": "2023-07-08T12:37:12.460920Z" }, "papermill": { "duration": 0.026491, "end_time": "2023-07-08T12:37:12.464641", "exception": false, "start_time": "2023-07-08T12:37:12.438150", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "df = encode(train_df)\n", "test_df = encode(test_df)" ] }, { "cell_type": "code", "execution_count": 9, "id": "d8d92abb", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.484887Z", "iopub.status.busy": "2023-07-08T12:37:12.484487Z", "iopub.status.idle": "2023-07-08T12:37:12.492017Z", "shell.execute_reply": "2023-07-08T12:37:12.490682Z" }, "papermill": { "duration": 0.020058, "end_time": "2023-07-08T12:37:12.494463", "exception": false, "start_time": "2023-07-08T12:37:12.474405", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "Index(['Id', 'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN',\n", " 'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS',\n", " 'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',\n", " 'EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI',\n", " 'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL', 'Class'],\n", " dtype='object')" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "cell_type": "markdown", "id": "8c002e33", "metadata": { "papermill": { "duration": 0.008727, "end_time": "2023-07-08T12:37:12.512640", "exception": false, "start_time": "2023-07-08T12:37:12.503913", "status": "completed" }, "tags": [] }, "source": [ "# 6. Separate into Dependent and Independent " ] }, { "cell_type": "code", "execution_count": 10, "id": "4a189550", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.532567Z", "iopub.status.busy": "2023-07-08T12:37:12.532205Z", "iopub.status.idle": "2023-07-08T12:37:12.538788Z", "shell.execute_reply": "2023-07-08T12:37:12.537366Z" }, "papermill": { "duration": 0.018953, "end_time": "2023-07-08T12:37:12.540718", "exception": false, "start_time": "2023-07-08T12:37:12.521765", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "indep_cols = ['Id', 'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN',\n", " 'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS',\n", " 'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',\n", " 'EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI',\n", " 'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL']\n", "\n", "dep_cols = ['Class']" ] }, { "cell_type": "markdown", "id": "d4fc397f", "metadata": { "papermill": { "duration": 0.008825, "end_time": "2023-07-08T12:37:12.558875", "exception": false, "start_time": "2023-07-08T12:37:12.550050", "status": "completed" }, "tags": [] }, "source": [ "# 7. Simple Imputer" ] }, { "cell_type": "code", "execution_count": 11, "id": "4021c983", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.579038Z", "iopub.status.busy": "2023-07-08T12:37:12.578626Z", "iopub.status.idle": "2023-07-08T12:37:12.615548Z", "shell.execute_reply": "2023-07-08T12:37:12.614038Z" }, "papermill": { "duration": 0.050168, "end_time": "2023-07-08T12:37:12.618109", "exception": false, "start_time": "2023-07-08T12:37:12.567941", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "imputer = SimpleImputer(strategy = 'mean')\n", "df[indep_cols] = imputer.fit_transform(df[indep_cols])\n", "test_df[indep_cols] = imputer.fit_transform(test_df[indep_cols])" ] }, { "cell_type": "code", "execution_count": 12, "id": "c5748b87", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.638761Z", "iopub.status.busy": "2023-07-08T12:37:12.638334Z", "iopub.status.idle": "2023-07-08T12:37:12.645764Z", "shell.execute_reply": "2023-07-08T12:37:12.644599Z" }, "papermill": { "duration": 0.019942, "end_time": "2023-07-08T12:37:12.647818", "exception": false, "start_time": "2023-07-08T12:37:12.627876", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "[2.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0,\n", " 0.0]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(imputer.statistics_)" ] }, { "cell_type": "markdown", "id": "ccb402f0", "metadata": { "papermill": { "duration": 0.009508, "end_time": "2023-07-08T12:37:12.666898", "exception": false, "start_time": "2023-07-08T12:37:12.657390", "status": "completed" }, "tags": [] }, "source": [ "# 8. MinMax Scaler " ] }, { "cell_type": "code", "execution_count": 13, "id": "1bc5c2a2", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.687784Z", "iopub.status.busy": "2023-07-08T12:37:12.687364Z", "iopub.status.idle": "2023-07-08T12:37:12.692087Z", "shell.execute_reply": "2023-07-08T12:37:12.691246Z" }, "papermill": { "duration": 0.01743, "end_time": "2023-07-08T12:37:12.693923", "exception": false, "start_time": "2023-07-08T12:37:12.676493", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "scaler = MinMaxScaler()" ] }, { "cell_type": "code", "execution_count": 14, "id": "7f9feb8b", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.714293Z", "iopub.status.busy": "2023-07-08T12:37:12.713907Z", "iopub.status.idle": "2023-07-08T12:37:12.731282Z", "shell.execute_reply": "2023-07-08T12:37:12.729918Z" }, "papermill": { "duration": 0.030037, "end_time": "2023-07-08T12:37:12.733256", "exception": false, "start_time": "2023-07-08T12:37:12.703219", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
MinMaxScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "MinMaxScaler()" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scaler.fit(df[indep_cols])" ] }, { "cell_type": "markdown", "id": "b4b22656", "metadata": { "papermill": { "duration": 0.008936, "end_time": "2023-07-08T12:37:12.751338", "exception": false, "start_time": "2023-07-08T12:37:12.742402", "status": "completed" }, "tags": [] }, "source": [ "**Minimum**" ] }, { "cell_type": "code", "execution_count": 15, "id": "dcc126ed", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.771383Z", "iopub.status.busy": "2023-07-08T12:37:12.771043Z", "iopub.status.idle": "2023-07-08T12:37:12.778332Z", "shell.execute_reply": "2023-07-08T12:37:12.777449Z" }, "papermill": { "duration": 0.020392, "end_time": "2023-07-08T12:37:12.780961", "exception": false, "start_time": "2023-07-08T12:37:12.760569", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Minimum:\n" ] }, { "data": { "text/plain": [ "[0.0,\n", " 0.081187,\n", " 192.59328,\n", " 85.200147,\n", " 3.177522,\n", " 8.138688,\n", " 0.699861,\n", " 0.025578,\n", " 3.396778,\n", " 1.2299,\n", " 1693.62432,\n", " 9.8868,\n", " 72.948951,\n", " 1.331155,\n", " 51.216883,\n", " 257.432377,\n", " 12.49976,\n", " 0.17687412,\n", " 23.3876,\n", " 0.510888,\n", " 0.003184,\n", " 1.050225,\n", " 0.069225,\n", " 13.784111,\n", " 0.137925,\n", " 7.03064,\n", " 6.9064,\n", " 35.998895,\n", " 0.23868,\n", " 0.040995,\n", " 60.23247,\n", " 10.3456,\n", " 6.339496,\n", " 0.0055176,\n", " 1.74307,\n", " 0.804068,\n", " 4.926396,\n", " 0.286201,\n", " 185.5941,\n", " 0.003042,\n", " 0.0,\n", " 5.394675,\n", " 78.526968,\n", " 3.828384,\n", " 7.534128,\n", " 0.29685,\n", " 1563.136688,\n", " 3.58345,\n", " 0.173229,\n", " 0.49706,\n", " 0.06773,\n", " 4.102182,\n", " 72.611063,\n", " 13.038894,\n", " 9.432735,\n", " 0.897628,\n", " 0.001129278]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print('Minimum:')\n", "list(scaler.data_min_)" ] }, { "cell_type": "markdown", "id": "7f74d791", "metadata": { "papermill": { "duration": 0.009625, "end_time": "2023-07-08T12:37:12.800557", "exception": false, "start_time": "2023-07-08T12:37:12.790932", "status": "completed" }, "tags": [] }, "source": [ "**Maximum**" ] }, { "cell_type": "code", "execution_count": 16, "id": "cb67e6d1", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.822334Z", "iopub.status.busy": "2023-07-08T12:37:12.821896Z", "iopub.status.idle": "2023-07-08T12:37:12.829448Z", "shell.execute_reply": "2023-07-08T12:37:12.827991Z" }, "papermill": { "duration": 0.020683, "end_time": "2023-07-08T12:37:12.831361", "exception": false, "start_time": "2023-07-08T12:37:12.810678", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Maximum:\n" ] }, { "data": { "text/plain": [ "[616.0,\n", " 6.161666,\n", " 28688.18766,\n", " 1910.123198,\n", " 630.51823,\n", " 178.943634,\n", " 38.27088,\n", " 10.315851,\n", " 38.971568,\n", " 1463.693448,\n", " 53060.59924,\n", " 29.3073,\n", " 2447.81055,\n", " 344.644105,\n", " 179250.2529,\n", " 50092.4593,\n", " 2271.436167,\n", " 4.1030316,\n", " 633.534408,\n", " 200.967526,\n", " 0.224074,\n", " 31.6881525,\n", " 3.039675,\n", " 267.9428235,\n", " 4.9515075,\n", " 64.521624,\n", " 210.33092,\n", " 2103.40519,\n", " 37.895013,\n", " 1.060404,\n", " 1049.168078,\n", " 326.2362,\n", " 62.808096,\n", " 161.355315,\n", " 25.19293,\n", " 152.355164,\n", " 94.95858,\n", " 18.324926,\n", " 30243.75878,\n", " 42.569748,\n", " 1.0,\n", " 109.125159,\n", " 1063.594578,\n", " 6501.26448,\n", " 3030.655824,\n", " 1578.654237,\n", " 143224.6823,\n", " 35.851039,\n", " 137.9327388,\n", " 1244.22702,\n", " 31.365763,\n", " 135.781294,\n", " 1497.351958,\n", " 143790.0712,\n", " 81.210825,\n", " 191.194764,\n", " 21.978]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print('Maximum:')\n", "list(scaler.data_max_)" ] }, { "cell_type": "code", "execution_count": 17, "id": "7e748f04", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.853646Z", "iopub.status.busy": "2023-07-08T12:37:12.853285Z", "iopub.status.idle": "2023-07-08T12:37:12.867262Z", "shell.execute_reply": "2023-07-08T12:37:12.865854Z" }, "papermill": { "duration": 0.027491, "end_time": "2023-07-08T12:37:12.869838", "exception": false, "start_time": "2023-07-08T12:37:12.842347", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "df[indep_cols] = scaler.transform(df[indep_cols])" ] }, { "cell_type": "code", "execution_count": 18, "id": "39a0433b", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:12.891609Z", "iopub.status.busy": "2023-07-08T12:37:12.891215Z", "iopub.status.idle": "2023-07-08T12:37:13.033360Z", "shell.execute_reply": "2023-07-08T12:37:13.032168Z" }, "papermill": { "duration": 0.155513, "end_time": "2023-07-08T12:37:13.035553", "exception": false, "start_time": "2023-07-08T12:37:12.880040", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdABAFAHAMARAXAYAZBCBDBNBPBQBRBZCBCCCDCFCHCLCRCSCUCWDADEDFDHDIDLDNDUDVDYEBEEEGEHEJELEPEUFCFDFEFIFLFRFSGBGEGFGHGIGL
count617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000617.000000
mean0.5000000.0651200.1161380.0183150.0570520.0116480.1289750.0033760.2015380.0046650.0711890.5938410.0666870.2825340.0065120.0058830.0285990.1303890.1095870.0535290.1241830.0115390.2265770.0910200.2588230.3502290.2173870.1769860.0104950.3198000.0877100.2673390.3547290.0111400.0077510.1688200.0460540.1540340.0514220.0070960.6401940.6187950.0269360.0100480.0211060.0042030.0617220.2022970.0381820.0024420.0113030.1262360.0414840.1020090.3072940.2611010.388128
std0.2893780.0770310.0807260.0700520.1111490.0615840.0679170.0405060.1222960.0445600.0588180.1791030.0774750.2669880.0422730.0416650.0702950.0670760.0845450.0677010.0670390.0627400.0946640.0679350.1119160.2547530.1042690.1536930.0507850.1108380.0870480.0894080.1423590.0559490.0633080.1195420.0688670.1141070.0595590.0434020.4803330.3531260.0694830.0600520.0547170.0410260.0799880.0909280.0833840.0403480.0416400.0758810.1011980.1346040.1374270.1905770.469522
min0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
25%0.2500000.0281100.0703530.0000000.0144940.0000000.0912520.0000000.1330380.0000000.0479310.4909090.0353280.0837850.0020860.0000000.0047890.0986810.0677490.0227250.0918920.0000000.1751750.0629460.1936960.0000000.1525680.0739170.0000000.2493300.0429460.2149060.2576440.0000000.0000000.0917960.0115400.0755310.0307930.0000000.0000000.2696860.0000000.0000760.0060560.0000000.0254230.1530840.0000000.0000000.0000000.0754450.0000000.0193770.2173670.1162080.005610
50%0.5000000.0449750.1027430.0000000.0276650.0000000.1153030.0000000.1985830.0000000.0643280.5818180.0509330.2096180.0032150.0000000.0133580.1230090.0924880.0429620.1117120.0000000.2227190.0828260.2521490.5042260.2078140.1313290.0000000.3109920.0705990.2719910.3348640.0015470.0000000.1375010.0357980.1291620.0435230.0019301.0000000.6187950.0000000.0028950.0095660.0009970.0408160.1971640.0207870.0005100.0060590.1114020.0000000.0544260.2950230.2107770.015393
75%0.7500000.0787070.1463050.0156390.0573250.0000000.1525580.0010950.2690880.0026330.0845340.7090910.0736270.3540780.0051590.0000000.0286700.1515570.1252580.0651260.1414410.0058170.2659950.1052310.3163320.5375660.2679240.2282560.0000000.3780160.1067860.3174990.4286400.0065910.0000000.2194260.0619410.2008940.0572260.0055031.0000001.0000000.0347590.0069650.0162790.0029040.0641300.2458570.0440190.0008160.0149320.1633230.0385900.1323070.3821670.3522601.000000
max1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
\n", "
" ], "text/plain": [ " Id AB AF AH AM AR \\\n", "count 617.000000 617.000000 617.000000 617.000000 617.000000 617.000000 \n", "mean 0.500000 0.065120 0.116138 0.018315 0.057052 0.011648 \n", "std 0.289378 0.077031 0.080726 0.070052 0.111149 0.061584 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.250000 0.028110 0.070353 0.000000 0.014494 0.000000 \n", "50% 0.500000 0.044975 0.102743 0.000000 0.027665 0.000000 \n", "75% 0.750000 0.078707 0.146305 0.015639 0.057325 0.000000 \n", "max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", "\n", " AX AY AZ BC BD BN \\\n", "count 617.000000 617.000000 617.000000 617.000000 617.000000 617.000000 \n", "mean 0.128975 0.003376 0.201538 0.004665 0.071189 0.593841 \n", "std 0.067917 0.040506 0.122296 0.044560 0.058818 0.179103 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.091252 0.000000 0.133038 0.000000 0.047931 0.490909 \n", "50% 0.115303 0.000000 0.198583 0.000000 0.064328 0.581818 \n", "75% 0.152558 0.001095 0.269088 0.002633 0.084534 0.709091 \n", "max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", "\n", " BP BQ BR BZ CB CC \\\n", "count 617.000000 617.000000 617.000000 617.000000 617.000000 617.000000 \n", "mean 0.066687 0.282534 0.006512 0.005883 0.028599 0.130389 \n", "std 0.077475 0.266988 0.042273 0.041665 0.070295 0.067076 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.035328 0.083785 0.002086 0.000000 0.004789 0.098681 \n", "50% 0.050933 0.209618 0.003215 0.000000 0.013358 0.123009 \n", "75% 0.073627 0.354078 0.005159 0.000000 0.028670 0.151557 \n", "max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", "\n", " CD CF CH CL CR CS \\\n", "count 617.000000 617.000000 617.000000 617.000000 617.000000 617.000000 \n", "mean 0.109587 0.053529 0.124183 0.011539 0.226577 0.091020 \n", "std 0.084545 0.067701 0.067039 0.062740 0.094664 0.067935 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.067749 0.022725 0.091892 0.000000 0.175175 0.062946 \n", "50% 0.092488 0.042962 0.111712 0.000000 0.222719 0.082826 \n", "75% 0.125258 0.065126 0.141441 0.005817 0.265995 0.105231 \n", "max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", "\n", " CU CW DA DE DF DH \\\n", "count 617.000000 617.000000 617.000000 617.000000 617.000000 617.000000 \n", "mean 0.258823 0.350229 0.217387 0.176986 0.010495 0.319800 \n", "std 0.111916 0.254753 0.104269 0.153693 0.050785 0.110838 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.193696 0.000000 0.152568 0.073917 0.000000 0.249330 \n", "50% 0.252149 0.504226 0.207814 0.131329 0.000000 0.310992 \n", "75% 0.316332 0.537566 0.267924 0.228256 0.000000 0.378016 \n", "max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", "\n", " DI DL DN DU DV DY \\\n", "count 617.000000 617.000000 617.000000 617.000000 617.000000 617.000000 \n", "mean 0.087710 0.267339 0.354729 0.011140 0.007751 0.168820 \n", "std 0.087048 0.089408 0.142359 0.055949 0.063308 0.119542 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.042946 0.214906 0.257644 0.000000 0.000000 0.091796 \n", "50% 0.070599 0.271991 0.334864 0.001547 0.000000 0.137501 \n", "75% 0.106786 0.317499 0.428640 0.006591 0.000000 0.219426 \n", "max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", "\n", " EB EE EG EH EJ EL \\\n", "count 617.000000 617.000000 617.000000 617.000000 617.000000 617.000000 \n", "mean 0.046054 0.154034 0.051422 0.007096 0.640194 0.618795 \n", "std 0.068867 0.114107 0.059559 0.043402 0.480333 0.353126 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.011540 0.075531 0.030793 0.000000 0.000000 0.269686 \n", "50% 0.035798 0.129162 0.043523 0.001930 1.000000 0.618795 \n", "75% 0.061941 0.200894 0.057226 0.005503 1.000000 1.000000 \n", "max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", "\n", " EP EU FC FD FE FI \\\n", "count 617.000000 617.000000 617.000000 617.000000 617.000000 617.000000 \n", "mean 0.026936 0.010048 0.021106 0.004203 0.061722 0.202297 \n", "std 0.069483 0.060052 0.054717 0.041026 0.079988 0.090928 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000076 0.006056 0.000000 0.025423 0.153084 \n", "50% 0.000000 0.002895 0.009566 0.000997 0.040816 0.197164 \n", "75% 0.034759 0.006965 0.016279 0.002904 0.064130 0.245857 \n", "max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", "\n", " FL FR FS GB GE GF \\\n", "count 617.000000 617.000000 617.000000 617.000000 617.000000 617.000000 \n", "mean 0.038182 0.002442 0.011303 0.126236 0.041484 0.102009 \n", "std 0.083384 0.040348 0.041640 0.075881 0.101198 0.134604 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 0.075445 0.000000 0.019377 \n", "50% 0.020787 0.000510 0.006059 0.111402 0.000000 0.054426 \n", "75% 0.044019 0.000816 0.014932 0.163323 0.038590 0.132307 \n", "max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", "\n", " GH GI GL \n", "count 617.000000 617.000000 617.000000 \n", "mean 0.307294 0.261101 0.388128 \n", "std 0.137427 0.190577 0.469522 \n", "min 0.000000 0.000000 0.000000 \n", "25% 0.217367 0.116208 0.005610 \n", "50% 0.295023 0.210777 0.015393 \n", "75% 0.382167 0.352260 1.000000 \n", "max 1.000000 1.000000 1.000000 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[indep_cols].describe()" ] }, { "cell_type": "markdown", "id": "2134f930", "metadata": { "papermill": { "duration": 0.010066, "end_time": "2023-07-08T12:37:13.056108", "exception": false, "start_time": "2023-07-08T12:37:13.046042", "status": "completed" }, "tags": [] }, "source": [ "# 9. Train the model" ] }, { "cell_type": "code", "execution_count": 19, "id": "7bd4d1c7", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:13.079638Z", "iopub.status.busy": "2023-07-08T12:37:13.078489Z", "iopub.status.idle": "2023-07-08T12:37:13.085119Z", "shell.execute_reply": "2023-07-08T12:37:13.084292Z" }, "papermill": { "duration": 0.020513, "end_time": "2023-07-08T12:37:13.087059", "exception": false, "start_time": "2023-07-08T12:37:13.066546", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "X = df[indep_cols]\n", "y = df[dep_cols]" ] }, { "cell_type": "code", "execution_count": 20, "id": "2e4439d4", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:13.111279Z", "iopub.status.busy": "2023-07-08T12:37:13.110273Z", "iopub.status.idle": "2023-07-08T12:37:13.114976Z", "shell.execute_reply": "2023-07-08T12:37:13.114298Z" }, "papermill": { "duration": 0.018764, "end_time": "2023-07-08T12:37:13.116778", "exception": false, "start_time": "2023-07-08T12:37:13.098014", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n", "scores = []" ] }, { "cell_type": "markdown", "id": "cbaf561e", "metadata": { "papermill": { "duration": 0.010193, "end_time": "2023-07-08T12:37:13.137558", "exception": false, "start_time": "2023-07-08T12:37:13.127365", "status": "completed" }, "tags": [] }, "source": [ "**Logistic Regression**" ] }, { "cell_type": "code", "execution_count": 21, "id": "545fab1a", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:13.161255Z", "iopub.status.busy": "2023-07-08T12:37:13.160534Z", "iopub.status.idle": "2023-07-08T12:37:13.271565Z", "shell.execute_reply": "2023-07-08T12:37:13.270303Z" }, "papermill": { "duration": 0.125613, "end_time": "2023-07-08T12:37:13.273760", "exception": false, "start_time": "2023-07-08T12:37:13.148147", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Log-loss scores: [0.32225114643241054, 0.3403899503127844, 0.3592379987382104, 0.34510276634634535, 0.30361286139047855]\n", "*********************************************\n", "Mean Log-loss: 0.33411894464404585\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.10/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", " y = column_or_1d(y, warn=True)\n", "/opt/conda/lib/python3.10/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", " y = column_or_1d(y, warn=True)\n", "/opt/conda/lib/python3.10/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", " y = column_or_1d(y, warn=True)\n", "/opt/conda/lib/python3.10/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", " y = column_or_1d(y, warn=True)\n", "/opt/conda/lib/python3.10/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", " y = column_or_1d(y, warn=True)\n" ] } ], "source": [ "for train_idx, val_idx in skf.split(X, y):\n", " X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]\n", " X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]\n", " model = LogisticRegression(solver='liblinear') \n", " model.fit(X_train, y_train)\n", " val_preds = model.predict_proba(X_valid)\n", " val_score = log_loss(y_valid, val_preds)\n", " scores.append(val_score)\n", "\n", "print(f'Log-loss scores: {scores}')\n", "print('*' * 45)\n", "print(f'Mean Log-loss: {np.mean(scores)}')" ] }, { "cell_type": "markdown", "id": "12d5f7be", "metadata": { "papermill": { "duration": 0.010444, "end_time": "2023-07-08T12:37:13.295213", "exception": false, "start_time": "2023-07-08T12:37:13.284769", "status": "completed" }, "tags": [] }, "source": [ "***Checking the weights and biases of the trained mode***" ] }, { "cell_type": "code", "execution_count": 22, "id": "2cbc9503", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:13.318905Z", "iopub.status.busy": "2023-07-08T12:37:13.318441Z", "iopub.status.idle": "2023-07-08T12:37:13.324221Z", "shell.execute_reply": "2023-07-08T12:37:13.323113Z" }, "papermill": { "duration": 0.020425, "end_time": "2023-07-08T12:37:13.326320", "exception": false, "start_time": "2023-07-08T12:37:13.305895", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[-0.048052289727862726, 0.8901303031883299, 1.3150279115943055, 0.13174306487470439, 0.2622248197824984, 0.0390137362811528, -0.26887734576414224, 0.3189733468464598, -0.20667907391503582, 0.5075642847440733, 0.4732638629119724, 1.588431634183748, 0.6643016265569007, 1.6813665369969006, 0.534784482343446, 0.1569225386396318, -0.18079557638749433, -0.71948639212963, 1.323381673155981, 0.5661967044583167, -0.6725976239897116, 0.44640002874247514, -2.104515356021582, -0.4106919581294929, -0.476265241357835, -0.33458320371675826, -0.9935583565069934, -0.6994955837121527, -0.2514334312777739, -1.5392087395159657, 1.0610628850468167, -1.094798666372673, -0.9686861797798433, 1.423025118877475, 0.4905439829928888, 0.8913732117099116, 0.4053340712554371, -1.05398978966617, -0.40837875387742967, 0.5336512972480261, -0.6399629894331432, 0.19323320859572463, -0.47744452211118044, -0.25097807252395554, 0.02380150507745937, 0.22365378392300575, 1.44986245473914, -0.966989469933129, 1.1012802725256605, 0.6576609075694497, -0.017191565566879646, 0.4430270764946399, -0.8875754351515566, -0.8787775777763472, 0.04030522478117337, 0.06678679668677007, -0.8578451912673534]]\n" ] } ], "source": [ "print(model.coef_.tolist())" ] }, { "cell_type": "code", "execution_count": 23, "id": "969f3b95", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:13.352103Z", "iopub.status.busy": "2023-07-08T12:37:13.351584Z", "iopub.status.idle": "2023-07-08T12:37:13.357637Z", "shell.execute_reply": "2023-07-08T12:37:13.356342Z" }, "papermill": { "duration": 0.021725, "end_time": "2023-07-08T12:37:13.359917", "exception": false, "start_time": "2023-07-08T12:37:13.338192", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[-0.57293004]\n" ] } ], "source": [ "print(model.intercept_)" ] }, { "cell_type": "markdown", "id": "f565b6f8", "metadata": { "papermill": { "duration": 0.01066, "end_time": "2023-07-08T12:37:13.381716", "exception": false, "start_time": "2023-07-08T12:37:13.371056", "status": "completed" }, "tags": [] }, "source": [ "# 10. Sample Submission File" ] }, { "cell_type": "code", "execution_count": 24, "id": "e62489de", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:13.406292Z", "iopub.status.busy": "2023-07-08T12:37:13.405815Z", "iopub.status.idle": "2023-07-08T12:37:13.421639Z", "shell.execute_reply": "2023-07-08T12:37:13.419698Z" }, "papermill": { "duration": 0.031693, "end_time": "2023-07-08T12:37:13.424809", "exception": false, "start_time": "2023-07-08T12:37:13.393116", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Idclass_0class_1
000eed32682bb0.50.5
1010ebe33f6680.50.5
202fa521e18380.50.5
3040e15f562a20.50.5
4046e85c7cc7f0.50.5
\n", "
" ], "text/plain": [ " Id class_0 class_1\n", "0 00eed32682bb 0.5 0.5\n", "1 010ebe33f668 0.5 0.5\n", "2 02fa521e1838 0.5 0.5\n", "3 040e15f562a2 0.5 0.5\n", "4 046e85c7cc7f 0.5 0.5" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample_submission_df.head()" ] }, { "cell_type": "markdown", "id": "2b7aaa18", "metadata": { "papermill": { "duration": 0.011352, "end_time": "2023-07-08T12:37:13.447797", "exception": false, "start_time": "2023-07-08T12:37:13.436445", "status": "completed" }, "tags": [] }, "source": [ "# 11. Prediction of Final Submission File" ] }, { "cell_type": "code", "execution_count": 25, "id": "8844deaf", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:13.475396Z", "iopub.status.busy": "2023-07-08T12:37:13.475006Z", "iopub.status.idle": "2023-07-08T12:37:13.484283Z", "shell.execute_reply": "2023-07-08T12:37:13.482881Z" }, "papermill": { "duration": 0.025612, "end_time": "2023-07-08T12:37:13.486824", "exception": false, "start_time": "2023-07-08T12:37:13.461212", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "prediction = model.predict_proba(test_df[indep_cols])" ] }, { "cell_type": "code", "execution_count": 26, "id": "32531474", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:13.510477Z", "iopub.status.busy": "2023-07-08T12:37:13.510120Z", "iopub.status.idle": "2023-07-08T12:37:13.516037Z", "shell.execute_reply": "2023-07-08T12:37:13.514794Z" }, "papermill": { "duration": 0.020275, "end_time": "2023-07-08T12:37:13.518043", "exception": false, "start_time": "2023-07-08T12:37:13.497768", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "sample_submission_df[['class_0', 'class_1']] = prediction" ] }, { "cell_type": "code", "execution_count": 27, "id": "44999d42", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:13.542605Z", "iopub.status.busy": "2023-07-08T12:37:13.542180Z", "iopub.status.idle": "2023-07-08T12:37:13.554019Z", "shell.execute_reply": "2023-07-08T12:37:13.552916Z" }, "papermill": { "duration": 0.027046, "end_time": "2023-07-08T12:37:13.556262", "exception": false, "start_time": "2023-07-08T12:37:13.529216", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Idclass_0class_1
000eed32682bb0.6394390.360561
1010ebe33f6680.6504420.349558
202fa521e18380.6612870.338713
3040e15f562a20.6719650.328035
4046e85c7cc7f0.6824680.317532
\n", "
" ], "text/plain": [ " Id class_0 class_1\n", "0 00eed32682bb 0.639439 0.360561\n", "1 010ebe33f668 0.650442 0.349558\n", "2 02fa521e1838 0.661287 0.338713\n", "3 040e15f562a2 0.671965 0.328035\n", "4 046e85c7cc7f 0.682468 0.317532" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample_submission_df.head()" ] }, { "cell_type": "code", "execution_count": 28, "id": "00bbb885", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T12:37:13.580094Z", "iopub.status.busy": "2023-07-08T12:37:13.579775Z", "iopub.status.idle": "2023-07-08T12:37:13.590826Z", "shell.execute_reply": "2023-07-08T12:37:13.589760Z" }, "papermill": { "duration": 0.025435, "end_time": "2023-07-08T12:37:13.593299", "exception": false, "start_time": "2023-07-08T12:37:13.567864", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "sample_submission_df.to_csv('submission.csv', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.10" }, "papermill": { "default_parameters": {}, "duration": 20.953622, "end_time": "2023-07-08T12:37:15.230170", "environment_variables": {}, "exception": null, "input_path": "__notebook__.ipynb", "output_path": "__notebook__.ipynb", "parameters": {}, "start_time": "2023-07-08T12:36:54.276548", "version": "2.4.0" } }, "nbformat": 4, "nbformat_minor": 5 }