diff --git "a/iris_classification.ipynb" "b/iris_classification.ipynb" new file mode 100644--- /dev/null +++ "b/iris_classification.ipynb" @@ -0,0 +1,4037 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# 1. Load the Dataset\n", + "\n", + "The dataset you loaded has three classes of Iris species (setosa, versicolor, virginica) and four features (sepal length, sepal width, petal length, petal width). These features can predict the species." + ], + "metadata": { + "id": "ZRclqpDjvjI5" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install datasets" + ], + "metadata": { + "collapsed": true, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jy7afuhZv2cm", + "outputId": "cf268477-3114-4976-851d-08236709d46f" + }, + "execution_count": 76, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (3.1.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.16.1)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.4)\n", + "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n", + "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.32.3)\n", + "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from 
datasets) (4.66.6)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.5.0)\n", + "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n", + "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.9.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.11.2)\n", + "Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.26.2)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.2)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (0.2.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.17.2)\n", + "Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", + "Requirement already 
satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.23.0->datasets) (4.12.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.4.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2.2.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2024.8.30)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": { + "id": "g6I-DZGcvb-h" + }, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset(\"scikit-learn/iris\")" + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "data = pd.DataFrame(ds['train'])\n", + "# The 'Id' column is still present here; it is dropped in the preprocessing section.\n", + "data.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 309 + }, + "id": "F_LhJ80b2lPl", + "outputId": "5edb0950-e70b-4a8b-b00a-f62ba09fda7b" + }, + "execution_count": 78, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n", 
+ "0 1 5.1 3.5 1.4 0.2 Iris-setosa\n", + "1 2 4.9 3.0 1.4 0.2 Iris-setosa\n", + "2 3 4.7 3.2 1.3 0.2 Iris-setosa\n", + "3 4 4.6 3.1 1.5 0.2 Iris-setosa\n", + "4 5 5.0 3.6 1.4 0.2 Iris-setosa" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
015.13.51.40.2Iris-setosa
124.93.01.40.2Iris-setosa
234.73.21.30.2Iris-setosa
344.63.11.50.2Iris-setosa
455.03.61.40.2Iris-setosa
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "data", + "summary": "{\n \"name\": \"data\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"Id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 43,\n \"min\": 1,\n \"max\": 150,\n \"num_unique_values\": 150,\n \"samples\": [\n 74,\n 19,\n 119\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.8280661279778629,\n \"min\": 4.3,\n \"max\": 7.9,\n \"num_unique_values\": 35,\n \"samples\": [\n 6.2,\n 4.5,\n 5.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.4335943113621737,\n \"min\": 2.0,\n \"max\": 4.4,\n \"num_unique_values\": 23,\n \"samples\": [\n 2.3,\n 4.0,\n 3.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.7644204199522617,\n \"min\": 1.0,\n \"max\": 6.9,\n \"num_unique_values\": 43,\n \"samples\": [\n 6.7,\n 3.8,\n 3.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.7631607417008414,\n \"min\": 0.1,\n \"max\": 2.5,\n \"num_unique_values\": 22,\n \"samples\": [\n 0.2,\n 1.2,\n 1.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Iris-setosa\",\n \"Iris-versicolor\",\n \"Iris-virginica\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 78 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# 2. 
Preprocess the Data\n", + "We need to split the data into training and testing sets for evaluation. We’ll also normalize the data to improve model performance.\n" + ], + "metadata": { + "id": "XOfQg2PEv8Y4" + } + }, + { + "cell_type": "code", + "source": [ + "data.info()" + ], + "metadata": { + "id": "MYfcugmwv_Ip", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "outputId": "92bbd9d4-ee64-4da6-84a8-74b385cabcb6" + }, + "execution_count": 79, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 150 entries, 0 to 149\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 150 non-null int64 \n", + " 1 SepalLengthCm 150 non-null float64\n", + " 2 SepalWidthCm 150 non-null float64\n", + " 3 PetalLengthCm 150 non-null float64\n", + " 4 PetalWidthCm 150 non-null float64\n", + " 5 Species 150 non-null object \n", + "dtypes: float64(4), int64(1), object(1)\n", + "memory usage: 7.2+ KB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "data = data.drop(columns='Id', errors='ignore')  # reassign instead of inplace=True so re-running this cell cannot raise KeyError" + ], + "metadata": { + "id": "0fBxtkxa3Rf3" + }, + "execution_count": 80, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "data.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 309 + }, + "collapsed": true, + "id": "RHb_Ysqq3VpO", + "outputId": "45a9f1a9-7d8a-439a-dcf1-3c2733bc7437" + }, + "execution_count": 81, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n", + "0 5.1 3.5 1.4 0.2 Iris-setosa\n", + "1 4.9 3.0 1.4 0.2 Iris-setosa\n", + "2 4.7 3.2 1.3 0.2 Iris-setosa\n", + "3 4.6 3.1 1.5 0.2 Iris-setosa\n", + "4 5.0 3.6 1.4 0.2 Iris-setosa" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
05.13.51.40.2Iris-setosa
14.93.01.40.2Iris-setosa
24.73.21.30.2Iris-setosa
34.63.11.50.2Iris-setosa
45.03.61.40.2Iris-setosa
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "data", + "summary": "{\n \"name\": \"data\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.8280661279778629,\n \"min\": 4.3,\n \"max\": 7.9,\n \"num_unique_values\": 35,\n \"samples\": [\n 6.2,\n 4.5,\n 5.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.4335943113621737,\n \"min\": 2.0,\n \"max\": 4.4,\n \"num_unique_values\": 23,\n \"samples\": [\n 2.3,\n 4.0,\n 3.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.7644204199522617,\n \"min\": 1.0,\n \"max\": 6.9,\n \"num_unique_values\": 43,\n \"samples\": [\n 6.7,\n 3.8,\n 3.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.7631607417008414,\n \"min\": 0.1,\n \"max\": 2.5,\n \"num_unique_values\": 22,\n \"samples\": [\n 0.2,\n 1.2,\n 1.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Iris-setosa\",\n \"Iris-versicolor\",\n \"Iris-virginica\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 81 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "refs: [Data science | Data Pre-processing using Scikit-learn| Iris dataset| Jay Patel@medium](https://jay190301.medium.com/data-science-data-pre-processing-using-scikit-learn-iris-dataset-1ba0a9ae04e6)" + ], + "metadata": { + "id": "LJPk_k_3wnV4" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Data Encoding\n", + "\n", + "1. 
label encoding" + ], + "metadata": { + "id": "pECEH0dJw8bm" + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "le = LabelEncoder()\n", + "data['Species'] = le.fit_transform(data['Species'])\n", + "data['Species'].value_counts()" + ], + "metadata": { + "id": "UhgqOteCvhCJ", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 244 + }, + "outputId": "6a8b0932-0d41-4a4b-e3f9-09ea5a7115fc" + }, + "execution_count": 82, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Species\n", + "0 50\n", + "1 50\n", + "2 50\n", + "Name: count, dtype: int64" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count
Species
050
150
250
\n", + "

" + ] + }, + "metadata": {}, + "execution_count": 82 + } + ] + }, + { + "cell_type": "code", + "source": [ + "le.classes_" + ], + "metadata": { + "id": "TUZBxAjRv--O", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "3ce3dd07-5bb7-4f32-e75a-de81881b7a8a" + }, + "execution_count": 83, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)" + ] + }, + "metadata": {}, + "execution_count": 83 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "2. Onehot encoder" + ], + "metadata": { + "id": "09l_XnFSxwfX" + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.preprocessing import OneHotEncoder\n", + "\n", + "ohe = OneHotEncoder()\n", + "transformed_data = ohe.fit_transform(data['Species'].values.reshape(-1,1)).toarray()" + ], + "metadata": { + "id": "MF2ssgzqxqYF" + }, + "execution_count": 84, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "ohe.categories_" + ], + "metadata": { + "id": "nQcrvyC6x8Ht", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "b1244732-061f-4c86-acf2-5cd24a7a111e" + }, + "execution_count": 85, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[array([0, 1, 2])]" + ] + }, + "metadata": {}, + "execution_count": 85 + } + ] + }, + { + "cell_type": "code", + "source": [ + "transformed_data = pd.DataFrame(\n", + " transformed_data,\n", + " columns=['setosa', 'versicolor', 'virginica'],\n", + " index=data.index\n", + ")\n", + "transformed_data.head()" + ], + "metadata": { + "id": "JHaxnvPEx_x9", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 224 + }, + "outputId": "06e1a530-04d3-44fa-e35c-0c2c4e07606a" + }, + "execution_count": 86, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " setosa versicolor virginica\n", + "0 1.0 0.0 0.0\n", + "1 1.0 0.0 0.0\n", + "2 1.0 0.0 0.0\n", 
+ "3 1.0 0.0 0.0\n", + "4 1.0 0.0 0.0" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
setosaversicolorvirginica
01.00.00.0
11.00.00.0
21.00.00.0
31.00.00.0
41.00.00.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "transformed_data", + "summary": "{\n \"name\": \"transformed_data\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"setosa\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.47298376984040197,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0,\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"versicolor\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.47298376984040197,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"virginica\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.4729837698404015,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 86 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Normalization\n", + "\n", + "$$\n", + "x' = \\frac{x - \\text{min}(x)}{\\text{max}(x) - \\text{min}(x)}\n", + "$$" + ], + "metadata": { + "id": "vEvDpT7ZycYx" + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n", + "\n", + "mms = MinMaxScaler(feature_range=(0,1))\n", + "normalized_data = mms.fit_transform(data)\n", + "pd.DataFrame(\n", + " normalized_data,\n", + " columns=data.columns,\n", + ")\n" + ], + "metadata": { + "id": "Jbrfax_ryUqB", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 439 + }, + "outputId": "a399d0ce-d4c3-4ad3-fa6f-6b49e1d444a5" + }, + "execution_count": 87, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n", + "0 0.222222 0.625000 0.067797 0.041667 0.0\n", 
+ "1 0.166667 0.416667 0.067797 0.041667 0.0\n", + "2 0.111111 0.500000 0.050847 0.041667 0.0\n", + "3 0.083333 0.458333 0.084746 0.041667 0.0\n", + "4 0.194444 0.666667 0.067797 0.041667 0.0\n", + ".. ... ... ... ... ...\n", + "145 0.666667 0.416667 0.711864 0.916667 1.0\n", + "146 0.555556 0.208333 0.677966 0.750000 1.0\n", + "147 0.611111 0.416667 0.711864 0.791667 1.0\n", + "148 0.527778 0.583333 0.745763 0.916667 1.0\n", + "149 0.444444 0.416667 0.694915 0.708333 1.0\n", + "\n", + "[150 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
00.2222220.6250000.0677970.0416670.0
10.1666670.4166670.0677970.0416670.0
20.1111110.5000000.0508470.0416670.0
30.0833330.4583330.0847460.0416670.0
40.1944440.6666670.0677970.0416670.0
..................
1450.6666670.4166670.7118640.9166671.0
1460.5555560.2083330.6779660.7500001.0
1470.6111110.4166670.7118640.7916671.0
1480.5277780.5833330.7457630.9166671.0
1490.4444440.4166670.6949150.7083331.0
\n", + "

150 rows × 5 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.23001836888273966,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 35,\n \"samples\": [\n 0.5277777777777779,\n 0.05555555555555558,\n 0.36111111111111094\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.18066429640090576,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 23,\n \"samples\": [\n 0.12499999999999989,\n 0.8333333333333333,\n 0.625\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.29905430846648523,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 43,\n \"samples\": [\n 0.9661016949152543,\n 0.47457627118644063,\n 0.4576271186440678\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3179836423753504,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 22,\n \"samples\": [\n 0.04166666666666667,\n 0.4583333333333333,\n 0.5000000000000001\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.40961596025952024,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.0,\n 0.5,\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 87 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Standardization\n", + "\n", + "$$\n", + "z = \\frac{x - \\mu}{\\sigma}\n", + "$$" + ], + "metadata": { + "id": "lSgmybXl196Y" + } + }, + { + "cell_type": "code", + 
"source": [ + "standard_scaler = StandardScaler()\n", + "standardized_data = standard_scaler.fit_transform(data)\n", + "pd.DataFrame(\n", + " standardized_data,\n", + " columns=data.columns,\n", + ")" + ], + "metadata": { + "id": "JeftVnOz0cAR", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 439 + }, + "outputId": "8a3d1c62-89ca-4d01-90e2-c7b0292c2855" + }, + "execution_count": 88, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n", + "0 -0.900681 1.032057 -1.341272 -1.312977 -1.224745\n", + "1 -1.143017 -0.124958 -1.341272 -1.312977 -1.224745\n", + "2 -1.385353 0.337848 -1.398138 -1.312977 -1.224745\n", + "3 -1.506521 0.106445 -1.284407 -1.312977 -1.224745\n", + "4 -1.021849 1.263460 -1.341272 -1.312977 -1.224745\n", + ".. ... ... ... ... ...\n", + "145 1.038005 -0.124958 0.819624 1.447956 1.224745\n", + "146 0.553333 -1.281972 0.705893 0.922064 1.224745\n", + "147 0.795669 -0.124958 0.819624 1.053537 1.224745\n", + "148 0.432165 0.800654 0.933356 1.447956 1.224745\n", + "149 0.068662 -0.124958 0.762759 0.790591 1.224745\n", + "\n", + "[150 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
0-0.9006811.032057-1.341272-1.312977-1.224745
1-1.143017-0.124958-1.341272-1.312977-1.224745
2-1.3853530.337848-1.398138-1.312977-1.224745
3-1.5065210.106445-1.284407-1.312977-1.224745
4-1.0218491.263460-1.341272-1.312977-1.224745
..................
1451.038005-0.1249580.8196241.4479561.224745
1460.553333-1.2819720.7058930.9220641.224745
1470.795669-0.1249580.8196241.0535371.224745
1480.4321650.8006540.9333561.4479561.224745
1490.068662-0.1249580.7627590.7905911.224745
\n", + "

150 rows × 5 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359767,\n \"min\": -1.870024133847019,\n \"max\": 2.4920192021244283,\n \"num_unique_values\": 35,\n \"samples\": [\n 0.432165404582356,\n -1.6276883929597161,\n -0.29484181807955234\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359767,\n \"min\": -2.438987252491841,\n \"max\": 3.1146839106774356,\n \"num_unique_values\": 23,\n \"samples\": [\n -1.7447783570956819,\n 2.1890720501492225,\n 1.0320572244889565\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359765,\n \"min\": -1.5687352207168408,\n \"max\": 1.7863413146490472,\n \"num_unique_values\": 43,\n \"samples\": [\n 1.6726099066705424,\n 0.02350449098222449,\n -0.03336121300702764\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359767,\n \"min\": -1.4444496972795189,\n \"max\": 1.7109015831854495,\n \"num_unique_values\": 22,\n \"samples\": [\n -1.3129767272601454,\n 0.001752972933591456,\n 0.13322594295296525\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359767,\n \"min\": -1.224744871391589,\n \"max\": 1.224744871391589,\n \"num_unique_values\": 3,\n \"samples\": [\n -1.224744871391589,\n 0.0,\n 1.224744871391589\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 88 + } + ] + }, + { + "cell_type": 
"markdown", + "source": [ + "## Imputation of missing values\n" + ], + "metadata": { + "id": "jhRacCGx4hHD" + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.impute import SimpleImputer\n", + "import numpy as np\n", + "\n", + "imputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n", + "imputed_data = imputer.fit_transform(data)\n", + "pd.DataFrame(\n", + " imputed_data,\n", + ").isnull().sum()" + ], + "metadata": { + "id": "3qqVM1ng2PsJ", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 275 + }, + "outputId": "af35c600-d995-4458-a5e6-a88e359526dd" + }, + "execution_count": 89, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 0\n", + "1 0\n", + "2 0\n", + "3 0\n", + "4 0\n", + "dtype: int64" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
00
10
20
30
40
\n", + "

" + ] + }, + "metadata": {}, + "execution_count": 89 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Discretization" + ], + "metadata": { + "id": "i76NX2Ev463h" + } + }, + { + "cell_type": "markdown", + "source": [ + "1. Quantile Discretization Transform" + ], + "metadata": { + "id": "VMYOkM_L48kl" + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.preprocessing import KBinsDiscretizer\n", + "\n", + "trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')\n", + "new_data = trans.fit_transform(data)\n", + "pd.DataFrame(\n", + " new_data,\n", + " columns=data.columns\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 541 + }, + "id": "gOVi2uYk4qcu", + "outputId": "7b51e4c9-9c89-42ac-bb78-465a7032d4bf" + }, + "execution_count": 90, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_discretization.py:307: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 1 are removed. Consider decreasing the number of bins.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_discretization.py:307: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 3 are removed. Consider decreasing the number of bins.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_discretization.py:307: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 4 are removed. Consider decreasing the number of bins.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n", + "0 2.0 7.0 1.0 1.0 0.0\n", + "1 1.0 4.0 1.0 1.0 0.0\n", + "2 0.0 6.0 0.0 1.0 0.0\n", + "3 0.0 5.0 2.0 1.0 0.0\n", + "4 2.0 7.0 1.0 1.0 0.0\n", + ".. ... ... ... ... 
...\n", + "145 8.0 4.0 7.0 8.0 1.0\n", + "146 7.0 1.0 7.0 7.0 1.0\n", + "147 7.0 4.0 7.0 7.0 1.0\n", + "148 6.0 7.0 8.0 8.0 1.0\n", + "149 5.0 4.0 7.0 6.0 1.0\n", + "\n", + "[150 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
02.07.01.01.00.0
11.04.01.01.00.0
20.06.00.01.00.0
30.05.02.01.00.0
42.07.01.01.00.0
..................
1458.04.07.08.01.0
1467.01.07.07.01.0
1477.04.07.07.01.0
1486.07.08.08.01.0
1495.04.07.06.01.0
\n", + "

150 rows × 5 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.830395898032167,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 8.0,\n 1.0,\n 4.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.3949611533748287,\n \"min\": 0.0,\n \"max\": 8.0,\n \"num_unique_values\": 9,\n \"samples\": [\n 2.0,\n 4.0,\n 3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.7971621924901315,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 0.0,\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.5674698763560073,\n \"min\": 0.0,\n \"max\": 8.0,\n \"num_unique_values\": 9,\n \"samples\": [\n 8.0,\n 2.0,\n 6.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.47298376984040197,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 90 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "2. 
Uniform Discretization Transform" + ], + "metadata": { + "id": "vtHUC3pT5OTS" + } + }, + { + "cell_type": "code", + "source": [ + "trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')\n", + "new_data = trans.fit_transform(data)\n", + "pd.DataFrame(\n", + " new_data,\n", + " columns=data.columns\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 439 + }, + "id": "XZaM1ewV5HjE", + "outputId": "874d7838-627a-4705-dc2f-084dff68bdd5" + }, + "execution_count": 91, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n", + "0 2.0 6.0 0.0 0.0 0.0\n", + "1 1.0 4.0 0.0 0.0 0.0\n", + "2 1.0 5.0 0.0 0.0 0.0\n", + "3 0.0 4.0 0.0 0.0 0.0\n", + "4 1.0 6.0 0.0 0.0 0.0\n", + ".. ... ... ... ... ...\n", + "145 6.0 4.0 7.0 9.0 9.0\n", + "146 5.0 2.0 6.0 7.0 9.0\n", + "147 6.0 4.0 7.0 7.0 9.0\n", + "148 5.0 5.0 7.0 9.0 9.0\n", + "149 4.0 4.0 6.0 7.0 9.0\n", + "\n", + "[150 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
02.06.00.00.00.0
11.04.00.00.00.0
21.05.00.00.00.0
30.04.00.00.00.0
41.06.00.00.00.0
..................
1456.04.07.09.09.0
1465.02.06.07.09.0
1476.04.07.07.09.0
1485.05.07.09.09.0
1494.04.06.07.09.0
\n", + "

150 rows × 5 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.3315749206787793,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 1.0,\n 7.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.8075778842435182,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 2.0,\n 4.0,\n 8.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.9708577087647687,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 9,\n \"samples\": [\n 7.0,\n 1.0,\n 4.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.1659613014009933,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 1.0,\n 3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.6941213351051103,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.0,\n 5.0,\n 9.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 91 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "3. 
KMeans Discretization Transform" + ], + "metadata": { + "id": "uMFedD8J5V0U" + } + }, + { + "cell_type": "code", + "source": [ + "trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')\n", + "new_data = trans.fit_transform(data)\n", + "pd.DataFrame(\n", + " new_data,\n", + " columns=data.columns\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 507 + }, + "id": "q4KFEczX5SYj", + "outputId": "1a85ca09-5dc4-4dfc-9ccb-4aca2776bc21" + }, + "execution_count": 92, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/base.py:1473: ConvergenceWarning: Number of distinct clusters (3) found smaller than n_clusters (10). Possibly due to duplicate points in X.\n", + " return fit_method(estimator, *args, **kwargs)\n", + "/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_discretization.py:307: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 4 are removed. Consider decreasing the number of bins.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n", + "0 2.0 6.0 0.0 0.0 0.0\n", + "1 1.0 4.0 0.0 0.0 0.0\n", + "2 1.0 4.0 0.0 0.0 0.0\n", + "3 0.0 4.0 0.0 0.0 0.0\n", + "4 1.0 6.0 0.0 0.0 0.0\n", + ".. ... ... ... ... ...\n", + "145 6.0 4.0 5.0 9.0 2.0\n", + "146 5.0 2.0 5.0 7.0 2.0\n", + "147 6.0 4.0 5.0 7.0 2.0\n", + "148 5.0 5.0 6.0 9.0 2.0\n", + "149 4.0 4.0 5.0 7.0 2.0\n", + "\n", + "[150 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
02.06.00.00.00.0
11.04.00.00.00.0
21.04.00.00.00.0
30.04.00.00.00.0
41.06.00.00.00.0
..................
1456.04.05.09.02.0
1465.02.05.07.02.0
1476.04.05.07.02.0
1485.05.06.09.02.0
1494.04.05.07.02.0
\n", + "

150 rows × 5 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.3191065162163307,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 1.0,\n 7.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.7795256135824453,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 2.0,\n 4.0,\n 8.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.551842778463346,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 8.0,\n 1.0,\n 2.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.1587455386713343,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 1.0,\n 6.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.8192319205190405,\n \"min\": 0.0,\n \"max\": 2.0,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.0,\n 1.0,\n 2.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 92 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## In this dataset we use Standardization\n" + ], + "metadata": { + "id": "q8Zl6Cp65pwQ" + } + }, + { + "cell_type": "code", + "source": [ + "# Extract features and labels\n", + "\n", + "X = data.drop('Species', axis=1)\n", + "y = data['Species']" + ], + "metadata": { + "id": "4Y46_FZP5uQ_" + }, + "execution_count": 93, + "outputs": [] + }, + { + 
"cell_type": "code", + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" + ], + "metadata": { + "id": "fSEBRAIx6lMx" + }, + "execution_count": 94, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sc = StandardScaler()\n", + "X_train = sc.fit_transform(X_train)\n", + "X_test = sc.transform(X_test)" + ], + "metadata": { + "id": "cV2TNfaT6m-G" + }, + "execution_count": 95, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# 3. Choose a Model\n", + "\n", + "https://en.wikipedia.org/wiki/Logistic_regression" + ], + "metadata": { + "id": "prBHDhku7R2I" + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "model = LogisticRegression()\n", + "model.fit(X_train, y_train)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "id": "-6h32lvi62pc", + "outputId": "c62a7a44-2f13-4f9f-e305-cb63918af714" + }, + "execution_count": 96, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "LogisticRegression()" + ], + "text/html": [ + "
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ] + }, + "metadata": {}, + "execution_count": 96 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# 4. Train the Model\n" + ], + "metadata": { + "id": "VTIpDLia9KYN" + } + }, + { + "cell_type": "code", + "source": [ + "y_pred = model.predict(X_test)" + ], + "metadata": { + "id": "SBhwo4cZ85tF" + }, + "execution_count": 97, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# 5. Evaluate the Model\n" + ], + "metadata": { + "id": "7PghPlKQ9OTQ" + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics import accuracy_score, classification_report\n", + "\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "print(f\"Accuracy: {accuracy:.2f}\")\n", + "print(\"Classification Report:\")\n", + "print(classification_report(y_test, y_pred))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9N89xiJG9FOx", + "outputId": "aea846a4-aad0-440f-f37d-50807fe3fd57" + }, + "execution_count": 98, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy: 1.00\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 10\n", + " 1 1.00 1.00 1.00 9\n", + " 2 1.00 1.00 1.00 11\n", + "\n", + " accuracy 1.00 30\n", + " macro avg 1.00 1.00 1.00 30\n", + "weighted avg 1.00 1.00 1.00 30\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import joblib\n", + "\n", + "joblib.dump(model, 'iris_logistic_regression_model.pkl')\n", + "print(\"Model saved to iris_logistic_regression_model.pkl\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LbRKB89f9PeA", + "outputId": "dd884167-32aa-444b-984b-6c2302e6c80f" + }, + "execution_count": 99, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Model saved to iris_logistic_regression_model.pkl\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "loaded_model = 
joblib.load('iris_logistic_regression_model.pkl')  # must be the same path that joblib.dump wrote in the previous cell\n", + "print(\"Model loaded successfully\")\n", + "\n", + "\n", + "new_predictions = loaded_model.predict(X_test)\n", + "print(new_predictions)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HxP0rhZa9g-X", + "outputId": "2df09e3e-98a3-4d90-fd48-6007dfe2838f" + }, + "execution_count": 100, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Model loaded successfully\n", + "[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "accuracy = accuracy_score(y_test, new_predictions)\n", + "print(f\"Accuracy: {accuracy:.2f}\")\n", + "print(\"Classification Report:\")\n", + "print(classification_report(y_test, new_predictions))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xWcxawsV9jA5", + "outputId": "f22a4df2-1014-4c94-d802-ce1f2d9ea836" + }, + "execution_count": 101, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy: 1.00\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 10\n", + " 1 1.00 1.00 1.00 9\n", + " 2 1.00 1.00 1.00 11\n", + "\n", + " accuracy 1.00 30\n", + " macro avg 1.00 1.00 1.00 30\n", + "weighted avg 1.00 1.00 1.00 30\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "etijgWI29oUs" + }, + "execution_count": 101, + "outputs": [] + } + ] +} \ No newline at end of file