diff --git "a/iris_classification.ipynb" "b/iris_classification.ipynb"
new file mode 100644--- /dev/null
+++ "b/iris_classification.ipynb"
@@ -0,0 +1,4037 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# 1. Load the Dataset\n",
+ "\n",
+ "The Iris dataset contains 150 samples from three species (setosa, versicolor, virginica), each described by four features measured in centimetres (sepal length, sepal width, petal length, petal width). We use these four features to predict the species."
+ ],
+ "metadata": {
+ "id": "ZRclqpDjvjI5"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Use the %pip magic so the package is installed into the running kernel's environment.\n",
+ "# The pin matches the version this notebook was last executed with; -q keeps the output short.\n",
+ "%pip install -q datasets==3.1.0"
+ ],
+ "metadata": {
+ "collapsed": true,
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "jy7afuhZv2cm",
+ "outputId": "cf268477-3114-4976-851d-08236709d46f"
+ },
+ "execution_count": 76,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (3.1.0)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.16.1)\n",
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.4)\n",
+ "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n",
+ "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n",
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n",
+ "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.32.3)\n",
+ "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.6)\n",
+ "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.5.0)\n",
+ "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n",
+ "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.9.0)\n",
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.11.2)\n",
+ "Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.26.2)\n",
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.2)\n",
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.2)\n",
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n",
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n",
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n",
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n",
+ "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (0.2.0)\n",
+ "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.17.2)\n",
+ "Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.23.0->datasets) (4.12.2)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.4.0)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.10)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2.2.3)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2024.8.30)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {
+ "id": "g6I-DZGcvb-h"
+ },
+ "outputs": [],
+ "source": [
+ "from datasets import load_dataset\n",
+ "\n",
+ "# Hugging Face Hub repository id of the Iris dataset used throughout this notebook.\n",
+ "IRIS_REPO_ID = \"scikit-learn/iris\"\n",
+ "ds = load_dataset(IRIS_REPO_ID)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Materialise the 'train' split of the loaded dataset as a pandas DataFrame.\n",
+ "data = pd.DataFrame(ds['train'])\n",
+ "data.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 309
+ },
+ "id": "F_LhJ80b2lPl",
+ "outputId": "5edb0950-e70b-4a8b-b00a-f62ba09fda7b"
+ },
+ "execution_count": 78,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
+ "0 1 5.1 3.5 1.4 0.2 Iris-setosa\n",
+ "1 2 4.9 3.0 1.4 0.2 Iris-setosa\n",
+ "2 3 4.7 3.2 1.3 0.2 Iris-setosa\n",
+ "3 4 4.6 3.1 1.5 0.2 Iris-setosa\n",
+ "4 5 5.0 3.6 1.4 0.2 Iris-setosa"
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Id \n",
+ " SepalLengthCm \n",
+ " SepalWidthCm \n",
+ " PetalLengthCm \n",
+ " PetalWidthCm \n",
+ " Species \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 5.1 \n",
+ " 3.5 \n",
+ " 1.4 \n",
+ " 0.2 \n",
+ " Iris-setosa \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 2 \n",
+ " 4.9 \n",
+ " 3.0 \n",
+ " 1.4 \n",
+ " 0.2 \n",
+ " Iris-setosa \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 3 \n",
+ " 4.7 \n",
+ " 3.2 \n",
+ " 1.3 \n",
+ " 0.2 \n",
+ " Iris-setosa \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 4 \n",
+ " 4.6 \n",
+ " 3.1 \n",
+ " 1.5 \n",
+ " 0.2 \n",
+ " Iris-setosa \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 5 \n",
+ " 5.0 \n",
+ " 3.6 \n",
+ " 1.4 \n",
+ " 0.2 \n",
+ " Iris-setosa \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "data",
+ "summary": "{\n \"name\": \"data\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"Id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 43,\n \"min\": 1,\n \"max\": 150,\n \"num_unique_values\": 150,\n \"samples\": [\n 74,\n 19,\n 119\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.8280661279778629,\n \"min\": 4.3,\n \"max\": 7.9,\n \"num_unique_values\": 35,\n \"samples\": [\n 6.2,\n 4.5,\n 5.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.4335943113621737,\n \"min\": 2.0,\n \"max\": 4.4,\n \"num_unique_values\": 23,\n \"samples\": [\n 2.3,\n 4.0,\n 3.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.7644204199522617,\n \"min\": 1.0,\n \"max\": 6.9,\n \"num_unique_values\": 43,\n \"samples\": [\n 6.7,\n 3.8,\n 3.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.7631607417008414,\n \"min\": 0.1,\n \"max\": 2.5,\n \"num_unique_values\": 22,\n \"samples\": [\n 0.2,\n 1.2,\n 1.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Iris-setosa\",\n \"Iris-versicolor\",\n \"Iris-virginica\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 78
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# 2. Preprocess the Data\n",
+ "We need to split the data into training and testing sets for evaluation. We’ll also normalize the data to improve model performance.\n"
+ ],
+ "metadata": {
+ "id": "XOfQg2PEv8Y4"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "data.info()"
+ ],
+ "metadata": {
+ "id": "MYfcugmwv_Ip",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "collapsed": true,
+ "outputId": "92bbd9d4-ee64-4da6-84a8-74b385cabcb6"
+ },
+ "execution_count": 79,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "RangeIndex: 150 entries, 0 to 149\n",
+ "Data columns (total 6 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Id 150 non-null int64 \n",
+ " 1 SepalLengthCm 150 non-null float64\n",
+ " 2 SepalWidthCm 150 non-null float64\n",
+ " 3 PetalLengthCm 150 non-null float64\n",
+ " 4 PetalWidthCm 150 non-null float64\n",
+ " 5 Species 150 non-null object \n",
+ "dtypes: float64(4), int64(1), object(1)\n",
+ "memory usage: 7.2+ KB\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# 'Id' is just a row counter (1..150), not a feature. Reassigning instead of dropping in place,\n",
+ "# together with errors='ignore', keeps this cell safe to re-run.\n",
+ "data = data.drop(columns='Id', errors='ignore')"
+ ],
+ "metadata": {
+ "id": "0fBxtkxa3Rf3"
+ },
+ "execution_count": 80,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "data.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 309
+ },
+ "collapsed": true,
+ "id": "RHb_Ysqq3VpO",
+ "outputId": "45a9f1a9-7d8a-439a-dcf1-3c2733bc7437"
+ },
+ "execution_count": 81,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
+ "0 5.1 3.5 1.4 0.2 Iris-setosa\n",
+ "1 4.9 3.0 1.4 0.2 Iris-setosa\n",
+ "2 4.7 3.2 1.3 0.2 Iris-setosa\n",
+ "3 4.6 3.1 1.5 0.2 Iris-setosa\n",
+ "4 5.0 3.6 1.4 0.2 Iris-setosa"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " SepalLengthCm \n",
+ " SepalWidthCm \n",
+ " PetalLengthCm \n",
+ " PetalWidthCm \n",
+ " Species \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 5.1 \n",
+ " 3.5 \n",
+ " 1.4 \n",
+ " 0.2 \n",
+ " Iris-setosa \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 4.9 \n",
+ " 3.0 \n",
+ " 1.4 \n",
+ " 0.2 \n",
+ " Iris-setosa \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 4.7 \n",
+ " 3.2 \n",
+ " 1.3 \n",
+ " 0.2 \n",
+ " Iris-setosa \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 4.6 \n",
+ " 3.1 \n",
+ " 1.5 \n",
+ " 0.2 \n",
+ " Iris-setosa \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 5.0 \n",
+ " 3.6 \n",
+ " 1.4 \n",
+ " 0.2 \n",
+ " Iris-setosa \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "data",
+ "summary": "{\n \"name\": \"data\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.8280661279778629,\n \"min\": 4.3,\n \"max\": 7.9,\n \"num_unique_values\": 35,\n \"samples\": [\n 6.2,\n 4.5,\n 5.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.4335943113621737,\n \"min\": 2.0,\n \"max\": 4.4,\n \"num_unique_values\": 23,\n \"samples\": [\n 2.3,\n 4.0,\n 3.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.7644204199522617,\n \"min\": 1.0,\n \"max\": 6.9,\n \"num_unique_values\": 43,\n \"samples\": [\n 6.7,\n 3.8,\n 3.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.7631607417008414,\n \"min\": 0.1,\n \"max\": 2.5,\n \"num_unique_values\": 22,\n \"samples\": [\n 0.2,\n 1.2,\n 1.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Iris-setosa\",\n \"Iris-versicolor\",\n \"Iris-virginica\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 81
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "refs: [Data science | Data Pre-processing using Scikit-learn| Iris dataset| Jay Patel@medium](https://jay190301.medium.com/data-science-data-pre-processing-using-scikit-learn-iris-dataset-1ba0a9ae04e6)"
+ ],
+ "metadata": {
+ "id": "LJPk_k_3wnV4"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Data Encoding\n",
+ "\n",
+ "1. Label encoding"
+ ],
+ "metadata": {
+ "id": "pECEH0dJw8bm"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "\n",
+ "# Fit only while 'Species' still holds the raw names: refitting on the already-encoded\n",
+ "# integers (e.g. when this cell is re-run) would replace le.classes_ with [0, 1, 2].\n",
+ "if not pd.api.types.is_integer_dtype(data['Species']):\n",
+ "    le = LabelEncoder()\n",
+ "    data['Species'] = le.fit_transform(data['Species'])\n",
+ "\n",
+ "# Class balance after encoding.\n",
+ "data['Species'].value_counts()"
+ ],
+ "metadata": {
+ "id": "UhgqOteCvhCJ",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 244
+ },
+ "outputId": "6a8b0932-0d41-4a4b-e3f9-09ea5a7115fc"
+ },
+ "execution_count": 82,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Species\n",
+ "0 50\n",
+ "1 50\n",
+ "2 50\n",
+ "Name: count, dtype: int64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count \n",
+ " \n",
+ " \n",
+ " Species \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 50 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 50 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 50 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 82
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "le.classes_"
+ ],
+ "metadata": {
+ "id": "TUZBxAjRv--O",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "3ce3dd07-5bb7-4f32-e75a-de81881b7a8a"
+ },
+ "execution_count": 83,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 83
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "2. One-hot encoding"
+ ],
+ "metadata": {
+ "id": "09l_XnFSxwfX"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "\n",
+ "# OneHotEncoder expects a 2-D input, so turn the label column into shape (n_samples, 1).\n",
+ "species_column = data['Species'].to_numpy().reshape(-1, 1)\n",
+ "\n",
+ "ohe = OneHotEncoder()\n",
+ "# fit_transform returns a sparse matrix; densify it for display.\n",
+ "transformed_data = ohe.fit_transform(species_column).toarray()"
+ ],
+ "metadata": {
+ "id": "MF2ssgzqxqYF"
+ },
+ "execution_count": 84,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "ohe.categories_"
+ ],
+ "metadata": {
+ "id": "nQcrvyC6x8Ht",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "b1244732-061f-4c86-acf2-5cd24a7a111e"
+ },
+ "execution_count": 85,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "[array([0, 1, 2])]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 85
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Column order follows the encoder categories (0, 1, 2).\n",
+ "one_hot_columns = ['setosa', 'versicolor', 'virginica']\n",
+ "\n",
+ "transformed_data = pd.DataFrame(transformed_data, index=data.index, columns=one_hot_columns)\n",
+ "transformed_data.head()"
+ ],
+ "metadata": {
+ "id": "JHaxnvPEx_x9",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 224
+ },
+ "outputId": "06e1a530-04d3-44fa-e35c-0c2c4e07606a"
+ },
+ "execution_count": 86,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " setosa versicolor virginica\n",
+ "0 1.0 0.0 0.0\n",
+ "1 1.0 0.0 0.0\n",
+ "2 1.0 0.0 0.0\n",
+ "3 1.0 0.0 0.0\n",
+ "4 1.0 0.0 0.0"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " setosa \n",
+ " versicolor \n",
+ " virginica \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 1.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 1.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 1.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 1.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "transformed_data",
+ "summary": "{\n \"name\": \"transformed_data\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"setosa\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.47298376984040197,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0,\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"versicolor\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.47298376984040197,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"virginica\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.4729837698404015,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 86
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Normalization\n",
+ "\n",
+ "$$\n",
+ "x' = \\frac{x - \\text{min}(x)}{\\text{max}(x) - \\text{min}(x)}\n",
+ "$$"
+ ],
+ "metadata": {
+ "id": "vEvDpT7ZycYx"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
+ "\n",
+ "# Rescale the measurements only: 'Species' is the encoded class label, not a quantity,\n",
+ "# so it must not be squeezed into [0, 1] together with the features.\n",
+ "feature_columns = data.columns.drop('Species')\n",
+ "\n",
+ "mms = MinMaxScaler(feature_range=(0, 1))\n",
+ "normalized_data = mms.fit_transform(data[feature_columns])\n",
+ "pd.DataFrame(\n",
+ "    normalized_data,\n",
+ "    columns=feature_columns,\n",
+ "    index=data.index,\n",
+ ")"
+ ],
+ "metadata": {
+ "id": "Jbrfax_ryUqB",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 439
+ },
+ "outputId": "a399d0ce-d4c3-4ad3-fa6f-6b49e1d444a5"
+ },
+ "execution_count": 87,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
+ "0 0.222222 0.625000 0.067797 0.041667 0.0\n",
+ "1 0.166667 0.416667 0.067797 0.041667 0.0\n",
+ "2 0.111111 0.500000 0.050847 0.041667 0.0\n",
+ "3 0.083333 0.458333 0.084746 0.041667 0.0\n",
+ "4 0.194444 0.666667 0.067797 0.041667 0.0\n",
+ ".. ... ... ... ... ...\n",
+ "145 0.666667 0.416667 0.711864 0.916667 1.0\n",
+ "146 0.555556 0.208333 0.677966 0.750000 1.0\n",
+ "147 0.611111 0.416667 0.711864 0.791667 1.0\n",
+ "148 0.527778 0.583333 0.745763 0.916667 1.0\n",
+ "149 0.444444 0.416667 0.694915 0.708333 1.0\n",
+ "\n",
+ "[150 rows x 5 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " SepalLengthCm \n",
+ " SepalWidthCm \n",
+ " PetalLengthCm \n",
+ " PetalWidthCm \n",
+ " Species \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 0.222222 \n",
+ " 0.625000 \n",
+ " 0.067797 \n",
+ " 0.041667 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 0.166667 \n",
+ " 0.416667 \n",
+ " 0.067797 \n",
+ " 0.041667 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 0.111111 \n",
+ " 0.500000 \n",
+ " 0.050847 \n",
+ " 0.041667 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 0.083333 \n",
+ " 0.458333 \n",
+ " 0.084746 \n",
+ " 0.041667 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 0.194444 \n",
+ " 0.666667 \n",
+ " 0.067797 \n",
+ " 0.041667 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 145 \n",
+ " 0.666667 \n",
+ " 0.416667 \n",
+ " 0.711864 \n",
+ " 0.916667 \n",
+ " 1.0 \n",
+ " \n",
+ " \n",
+ " 146 \n",
+ " 0.555556 \n",
+ " 0.208333 \n",
+ " 0.677966 \n",
+ " 0.750000 \n",
+ " 1.0 \n",
+ " \n",
+ " \n",
+ " 147 \n",
+ " 0.611111 \n",
+ " 0.416667 \n",
+ " 0.711864 \n",
+ " 0.791667 \n",
+ " 1.0 \n",
+ " \n",
+ " \n",
+ " 148 \n",
+ " 0.527778 \n",
+ " 0.583333 \n",
+ " 0.745763 \n",
+ " 0.916667 \n",
+ " 1.0 \n",
+ " \n",
+ " \n",
+ " 149 \n",
+ " 0.444444 \n",
+ " 0.416667 \n",
+ " 0.694915 \n",
+ " 0.708333 \n",
+ " 1.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
150 rows × 5 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.23001836888273966,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 35,\n \"samples\": [\n 0.5277777777777779,\n 0.05555555555555558,\n 0.36111111111111094\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.18066429640090576,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 23,\n \"samples\": [\n 0.12499999999999989,\n 0.8333333333333333,\n 0.625\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.29905430846648523,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 43,\n \"samples\": [\n 0.9661016949152543,\n 0.47457627118644063,\n 0.4576271186440678\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3179836423753504,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 22,\n \"samples\": [\n 0.04166666666666667,\n 0.4583333333333333,\n 0.5000000000000001\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.40961596025952024,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.0,\n 0.5,\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 87
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Standardization\n",
+ "\n",
+ "$$\n",
+ "z = \\frac{x - \\mu}{\\sigma}\n",
+ "$$"
+ ],
+ "metadata": {
+ "id": "lSgmybXl196Y"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+ "# Standardise the measurements only: the encoded 'Species' label is left out so it is\n",
+ "# not turned into z-scores.\n",
+ "feature_columns = data.columns.drop('Species')\n",
+ "\n",
+ "standard_scaler = StandardScaler()\n",
+ "standardized_data = standard_scaler.fit_transform(data[feature_columns])\n",
+ "pd.DataFrame(\n",
+ "    standardized_data,\n",
+ "    columns=feature_columns,\n",
+ "    index=data.index,\n",
+ ")"
+ ],
+ "metadata": {
+ "id": "JeftVnOz0cAR",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 439
+ },
+ "outputId": "8a3d1c62-89ca-4d01-90e2-c7b0292c2855"
+ },
+ "execution_count": 88,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
+ "0 -0.900681 1.032057 -1.341272 -1.312977 -1.224745\n",
+ "1 -1.143017 -0.124958 -1.341272 -1.312977 -1.224745\n",
+ "2 -1.385353 0.337848 -1.398138 -1.312977 -1.224745\n",
+ "3 -1.506521 0.106445 -1.284407 -1.312977 -1.224745\n",
+ "4 -1.021849 1.263460 -1.341272 -1.312977 -1.224745\n",
+ ".. ... ... ... ... ...\n",
+ "145 1.038005 -0.124958 0.819624 1.447956 1.224745\n",
+ "146 0.553333 -1.281972 0.705893 0.922064 1.224745\n",
+ "147 0.795669 -0.124958 0.819624 1.053537 1.224745\n",
+ "148 0.432165 0.800654 0.933356 1.447956 1.224745\n",
+ "149 0.068662 -0.124958 0.762759 0.790591 1.224745\n",
+ "\n",
+ "[150 rows x 5 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " SepalLengthCm \n",
+ " SepalWidthCm \n",
+ " PetalLengthCm \n",
+ " PetalWidthCm \n",
+ " Species \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " -0.900681 \n",
+ " 1.032057 \n",
+ " -1.341272 \n",
+ " -1.312977 \n",
+ " -1.224745 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " -1.143017 \n",
+ " -0.124958 \n",
+ " -1.341272 \n",
+ " -1.312977 \n",
+ " -1.224745 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " -1.385353 \n",
+ " 0.337848 \n",
+ " -1.398138 \n",
+ " -1.312977 \n",
+ " -1.224745 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " -1.506521 \n",
+ " 0.106445 \n",
+ " -1.284407 \n",
+ " -1.312977 \n",
+ " -1.224745 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " -1.021849 \n",
+ " 1.263460 \n",
+ " -1.341272 \n",
+ " -1.312977 \n",
+ " -1.224745 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 145 \n",
+ " 1.038005 \n",
+ " -0.124958 \n",
+ " 0.819624 \n",
+ " 1.447956 \n",
+ " 1.224745 \n",
+ " \n",
+ " \n",
+ " 146 \n",
+ " 0.553333 \n",
+ " -1.281972 \n",
+ " 0.705893 \n",
+ " 0.922064 \n",
+ " 1.224745 \n",
+ " \n",
+ " \n",
+ " 147 \n",
+ " 0.795669 \n",
+ " -0.124958 \n",
+ " 0.819624 \n",
+ " 1.053537 \n",
+ " 1.224745 \n",
+ " \n",
+ " \n",
+ " 148 \n",
+ " 0.432165 \n",
+ " 0.800654 \n",
+ " 0.933356 \n",
+ " 1.447956 \n",
+ " 1.224745 \n",
+ " \n",
+ " \n",
+ " 149 \n",
+ " 0.068662 \n",
+ " -0.124958 \n",
+ " 0.762759 \n",
+ " 0.790591 \n",
+ " 1.224745 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
150 rows × 5 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359767,\n \"min\": -1.870024133847019,\n \"max\": 2.4920192021244283,\n \"num_unique_values\": 35,\n \"samples\": [\n 0.432165404582356,\n -1.6276883929597161,\n -0.29484181807955234\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359767,\n \"min\": -2.438987252491841,\n \"max\": 3.1146839106774356,\n \"num_unique_values\": 23,\n \"samples\": [\n -1.7447783570956819,\n 2.1890720501492225,\n 1.0320572244889565\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359765,\n \"min\": -1.5687352207168408,\n \"max\": 1.7863413146490472,\n \"num_unique_values\": 43,\n \"samples\": [\n 1.6726099066705424,\n 0.02350449098222449,\n -0.03336121300702764\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359767,\n \"min\": -1.4444496972795189,\n \"max\": 1.7109015831854495,\n \"num_unique_values\": 22,\n \"samples\": [\n -1.3129767272601454,\n 0.001752972933591456,\n 0.13322594295296525\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0033500931359767,\n \"min\": -1.224744871391589,\n \"max\": 1.224744871391589,\n \"num_unique_values\": 3,\n \"samples\": [\n -1.224744871391589,\n 0.0,\n 1.224744871391589\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 88
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Imputation of missing values\n"
+ ],
+ "metadata": {
+ "id": "jhRacCGx4hHD"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.impute import SimpleImputer\n",
+ "import numpy as np\n",
+ "\n",
+ "# Replace NaN entries with the mean of their column.\n",
+ "imputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n",
+ "imputed_data = imputer.fit_transform(data)\n",
+ "\n",
+ "# Keep the column names so the per-column null counts below are readable.\n",
+ "pd.DataFrame(\n",
+ "    imputed_data,\n",
+ "    columns=data.columns,\n",
+ "    index=data.index,\n",
+ ").isnull().sum()"
+ ],
+ "metadata": {
+ "id": "3qqVM1ng2PsJ",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 275
+ },
+ "outputId": "af35c600-d995-4458-a5e6-a88e359526dd"
+ },
+ "execution_count": 89,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0 0\n",
+ "1 0\n",
+ "2 0\n",
+ "3 0\n",
+ "4 0\n",
+ "dtype: int64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 89
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Discretization"
+ ],
+ "metadata": {
+ "id": "i76NX2Ev463h"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "1. Quantile Discretization Transform"
+ ],
+ "metadata": {
+ "id": "VMYOkM_L48kl"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.preprocessing import KBinsDiscretizer\n",
+ "\n",
+ "# Bin the measurements only: quantile-binning the three-valued 'Species' label\n",
+ "# collapses it into two bins and merges two of the classes.\n",
+ "feature_columns = data.columns.drop('Species')\n",
+ "\n",
+ "trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')\n",
+ "new_data = trans.fit_transform(data[feature_columns])\n",
+ "pd.DataFrame(\n",
+ "    new_data,\n",
+ "    columns=feature_columns,\n",
+ "    index=data.index,\n",
+ ")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 541
+ },
+ "id": "gOVi2uYk4qcu",
+ "outputId": "7b51e4c9-9c89-42ac-bb78-465a7032d4bf"
+ },
+ "execution_count": 90,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_discretization.py:307: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 1 are removed. Consider decreasing the number of bins.\n",
+ " warnings.warn(\n",
+ "/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_discretization.py:307: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 3 are removed. Consider decreasing the number of bins.\n",
+ " warnings.warn(\n",
+ "/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_discretization.py:307: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 4 are removed. Consider decreasing the number of bins.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
+ "0 2.0 7.0 1.0 1.0 0.0\n",
+ "1 1.0 4.0 1.0 1.0 0.0\n",
+ "2 0.0 6.0 0.0 1.0 0.0\n",
+ "3 0.0 5.0 2.0 1.0 0.0\n",
+ "4 2.0 7.0 1.0 1.0 0.0\n",
+ ".. ... ... ... ... ...\n",
+ "145 8.0 4.0 7.0 8.0 1.0\n",
+ "146 7.0 1.0 7.0 7.0 1.0\n",
+ "147 7.0 4.0 7.0 7.0 1.0\n",
+ "148 6.0 7.0 8.0 8.0 1.0\n",
+ "149 5.0 4.0 7.0 6.0 1.0\n",
+ "\n",
+ "[150 rows x 5 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " SepalLengthCm \n",
+ " SepalWidthCm \n",
+ " PetalLengthCm \n",
+ " PetalWidthCm \n",
+ " Species \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 2.0 \n",
+ " 7.0 \n",
+ " 1.0 \n",
+ " 1.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 1.0 \n",
+ " 4.0 \n",
+ " 1.0 \n",
+ " 1.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 0.0 \n",
+ " 6.0 \n",
+ " 0.0 \n",
+ " 1.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 0.0 \n",
+ " 5.0 \n",
+ " 2.0 \n",
+ " 1.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 2.0 \n",
+ " 7.0 \n",
+ " 1.0 \n",
+ " 1.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 145 \n",
+ " 8.0 \n",
+ " 4.0 \n",
+ " 7.0 \n",
+ " 8.0 \n",
+ " 1.0 \n",
+ " \n",
+ " \n",
+ " 146 \n",
+ " 7.0 \n",
+ " 1.0 \n",
+ " 7.0 \n",
+ " 7.0 \n",
+ " 1.0 \n",
+ " \n",
+ " \n",
+ " 147 \n",
+ " 7.0 \n",
+ " 4.0 \n",
+ " 7.0 \n",
+ " 7.0 \n",
+ " 1.0 \n",
+ " \n",
+ " \n",
+ " 148 \n",
+ " 6.0 \n",
+ " 7.0 \n",
+ " 8.0 \n",
+ " 8.0 \n",
+ " 1.0 \n",
+ " \n",
+ " \n",
+ " 149 \n",
+ " 5.0 \n",
+ " 4.0 \n",
+ " 7.0 \n",
+ " 6.0 \n",
+ " 1.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
150 rows × 5 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.830395898032167,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 8.0,\n 1.0,\n 4.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.3949611533748287,\n \"min\": 0.0,\n \"max\": 8.0,\n \"num_unique_values\": 9,\n \"samples\": [\n 2.0,\n 4.0,\n 3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.7971621924901315,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 0.0,\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.5674698763560073,\n \"min\": 0.0,\n \"max\": 8.0,\n \"num_unique_values\": 9,\n \"samples\": [\n 8.0,\n 2.0,\n 6.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.47298376984040197,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 1.0,\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 90
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "2. Uniform Discretization Transform"
+ ],
+ "metadata": {
+ "id": "vtHUC3pT5OTS"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Bin only the feature columns: Species is the class label and must keep its original codes.\n",
+ "feature_cols = data.columns.drop('Species')\n",
+ "trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')\n",
+ "new_data = trans.fit_transform(data[feature_cols])\n",
+ "# Re-attach the untouched label so the displayed frame still has all five columns.\n",
+ "pd.DataFrame(\n",
+ "    new_data,\n",
+ "    columns=feature_cols\n",
+ ").assign(Species=data['Species'].to_numpy())"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 439
+ },
+ "id": "XZaM1ewV5HjE",
+ "outputId": "874d7838-627a-4705-dc2f-084dff68bdd5"
+ },
+ "execution_count": 91,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
+ "0 2.0 6.0 0.0 0.0 0.0\n",
+ "1 1.0 4.0 0.0 0.0 0.0\n",
+ "2 1.0 5.0 0.0 0.0 0.0\n",
+ "3 0.0 4.0 0.0 0.0 0.0\n",
+ "4 1.0 6.0 0.0 0.0 0.0\n",
+ ".. ... ... ... ... ...\n",
+ "145 6.0 4.0 7.0 9.0 9.0\n",
+ "146 5.0 2.0 6.0 7.0 9.0\n",
+ "147 6.0 4.0 7.0 7.0 9.0\n",
+ "148 5.0 5.0 7.0 9.0 9.0\n",
+ "149 4.0 4.0 6.0 7.0 9.0\n",
+ "\n",
+ "[150 rows x 5 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " SepalLengthCm \n",
+ " SepalWidthCm \n",
+ " PetalLengthCm \n",
+ " PetalWidthCm \n",
+ " Species \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 2.0 \n",
+ " 6.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 1.0 \n",
+ " 4.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 1.0 \n",
+ " 5.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 0.0 \n",
+ " 4.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 1.0 \n",
+ " 6.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 145 \n",
+ " 6.0 \n",
+ " 4.0 \n",
+ " 7.0 \n",
+ " 9.0 \n",
+ " 9.0 \n",
+ " \n",
+ " \n",
+ " 146 \n",
+ " 5.0 \n",
+ " 2.0 \n",
+ " 6.0 \n",
+ " 7.0 \n",
+ " 9.0 \n",
+ " \n",
+ " \n",
+ " 147 \n",
+ " 6.0 \n",
+ " 4.0 \n",
+ " 7.0 \n",
+ " 7.0 \n",
+ " 9.0 \n",
+ " \n",
+ " \n",
+ " 148 \n",
+ " 5.0 \n",
+ " 5.0 \n",
+ " 7.0 \n",
+ " 9.0 \n",
+ " 9.0 \n",
+ " \n",
+ " \n",
+ " 149 \n",
+ " 4.0 \n",
+ " 4.0 \n",
+ " 6.0 \n",
+ " 7.0 \n",
+ " 9.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
150 rows × 5 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.3315749206787793,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 1.0,\n 7.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.8075778842435182,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 2.0,\n 4.0,\n 8.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.9708577087647687,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 9,\n \"samples\": [\n 7.0,\n 1.0,\n 4.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.1659613014009933,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 1.0,\n 3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.6941213351051103,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.0,\n 5.0,\n 9.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 91
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "3. KMeans Discretization Transform"
+ ],
+ "metadata": {
+ "id": "uMFedD8J5V0U"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Bin only the feature columns: Species is the class label, and its three distinct\n",
+ "# values cannot support 10 k-means bins.\n",
+ "feature_cols = data.columns.drop('Species')\n",
+ "trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')\n",
+ "new_data = trans.fit_transform(data[feature_cols])\n",
+ "# Re-attach the untouched label so the displayed frame still has all five columns.\n",
+ "pd.DataFrame(\n",
+ "    new_data,\n",
+ "    columns=feature_cols\n",
+ ").assign(Species=data['Species'].to_numpy())"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 507
+ },
+ "id": "q4KFEczX5SYj",
+ "outputId": "1a85ca09-5dc4-4dfc-9ccb-4aca2776bc21"
+ },
+ "execution_count": 92,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/sklearn/base.py:1473: ConvergenceWarning: Number of distinct clusters (3) found smaller than n_clusters (10). Possibly due to duplicate points in X.\n",
+ " return fit_method(estimator, *args, **kwargs)\n",
+ "/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_discretization.py:307: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 4 are removed. Consider decreasing the number of bins.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species\n",
+ "0 2.0 6.0 0.0 0.0 0.0\n",
+ "1 1.0 4.0 0.0 0.0 0.0\n",
+ "2 1.0 4.0 0.0 0.0 0.0\n",
+ "3 0.0 4.0 0.0 0.0 0.0\n",
+ "4 1.0 6.0 0.0 0.0 0.0\n",
+ ".. ... ... ... ... ...\n",
+ "145 6.0 4.0 5.0 9.0 2.0\n",
+ "146 5.0 2.0 5.0 7.0 2.0\n",
+ "147 6.0 4.0 5.0 7.0 2.0\n",
+ "148 5.0 5.0 6.0 9.0 2.0\n",
+ "149 4.0 4.0 5.0 7.0 2.0\n",
+ "\n",
+ "[150 rows x 5 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " SepalLengthCm \n",
+ " SepalWidthCm \n",
+ " PetalLengthCm \n",
+ " PetalWidthCm \n",
+ " Species \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 2.0 \n",
+ " 6.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 1.0 \n",
+ " 4.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 1.0 \n",
+ " 4.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 0.0 \n",
+ " 4.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 1.0 \n",
+ " 6.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 145 \n",
+ " 6.0 \n",
+ " 4.0 \n",
+ " 5.0 \n",
+ " 9.0 \n",
+ " 2.0 \n",
+ " \n",
+ " \n",
+ " 146 \n",
+ " 5.0 \n",
+ " 2.0 \n",
+ " 5.0 \n",
+ " 7.0 \n",
+ " 2.0 \n",
+ " \n",
+ " \n",
+ " 147 \n",
+ " 6.0 \n",
+ " 4.0 \n",
+ " 5.0 \n",
+ " 7.0 \n",
+ " 2.0 \n",
+ " \n",
+ " \n",
+ " 148 \n",
+ " 5.0 \n",
+ " 5.0 \n",
+ " 6.0 \n",
+ " 9.0 \n",
+ " 2.0 \n",
+ " \n",
+ " \n",
+ " 149 \n",
+ " 4.0 \n",
+ " 4.0 \n",
+ " 5.0 \n",
+ " 7.0 \n",
+ " 2.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
150 rows × 5 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \")\",\n \"rows\": 150,\n \"fields\": [\n {\n \"column\": \"SepalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.3191065162163307,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 1.0,\n 7.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SepalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.7795256135824453,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 2.0,\n 4.0,\n 8.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalLengthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.551842778463346,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 8.0,\n 1.0,\n 2.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PetalWidthCm\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.1587455386713343,\n \"min\": 0.0,\n \"max\": 9.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 9.0,\n 1.0,\n 6.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.8192319205190405,\n \"min\": 0.0,\n \"max\": 2.0,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.0,\n 1.0,\n 2.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 92
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
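+ "## In this dataset we use Standardization\n",
+ "\n",
+ "The discretized frames above are shown for comparison only; the model below is trained on the features in `data` after standardization."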
+ ],
+ "metadata": {
+ "id": "q8Zl6Cp65pwQ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Separate the target column (y) from the predictor columns (X).\n",
+ "y = data['Species']\n",
+ "X = data.drop(columns='Species')"
+ ],
+ "metadata": {
+ "id": "4Y46_FZP5uQ_"
+ },
+ "execution_count": 93,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible.\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ "    X,\n",
+ "    y,\n",
+ "    test_size=0.2,\n",
+ "    random_state=42,\n",
+ ")"
+ ],
+ "metadata": {
+ "id": "fSEBRAIx6lMx"
+ },
+ "execution_count": 94,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Learn the scaling statistics from the training split only, then apply them to both splits.\n",
+ "sc = StandardScaler().fit(X_train)\n",
+ "X_train = sc.transform(X_train)\n",
+ "X_test = sc.transform(X_test)"
+ ],
+ "metadata": {
+ "id": "cV2TNfaT6m-G"
+ },
+ "execution_count": 95,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# 3. Choose a Model\n",
+ "\n",
+ "We use [logistic regression](https://en.wikipedia.org/wiki/Logistic_regression) as the classifier."
+ ],
+ "metadata": {
+ "id": "prBHDhku7R2I"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.linear_model import LogisticRegression\n",
+ "\n",
+ "# Default hyper-parameters. The fit call is the last expression, so the fitted estimator is displayed.\n",
+ "model = LogisticRegression()\n",
+ "model.fit(X=X_train, y=y_train)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 80
+ },
+ "id": "-6h32lvi62pc",
+ "outputId": "c62a7a44-2f13-4f9f-e305-cb63918af714"
+ },
+ "execution_count": 96,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "LogisticRegression()"
+ ],
+ "text/html": [
+ "LogisticRegression() In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 96
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
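+ "# 4. Train the Model\n",
+ "\n",
+ "The classifier was already fitted by `model.fit` in the cell above; the next cell uses it to predict labels for the held-out test set."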
+ ],
+ "metadata": {
+ "id": "VTIpDLia9KYN"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Predicted class labels for the held-out test rows.\n",
+ "y_pred = model.predict(X_test)"
+ ],
+ "metadata": {
+ "id": "SBhwo4cZ85tF"
+ },
+ "execution_count": 97,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# 5. Evaluate the Model\n"
+ ],
+ "metadata": {
+ "id": "7PghPlKQ9OTQ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.metrics import accuracy_score, classification_report\n",
+ "\n",
+ "# Overall accuracy first, then the per-class precision/recall/F1 table.\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "report = classification_report(y_test, y_pred)\n",
+ "print(f\"Accuracy: {accuracy:.2f}\")\n",
+ "print(\"Classification Report:\")\n",
+ "print(report)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "9N89xiJG9FOx",
+ "outputId": "aea846a4-aad0-440f-f37d-50807fe3fd57"
+ },
+ "execution_count": 98,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Accuracy: 1.00\n",
+ "Classification Report:\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 1.00 1.00 1.00 10\n",
+ " 1 1.00 1.00 1.00 9\n",
+ " 2 1.00 1.00 1.00 11\n",
+ "\n",
+ " accuracy 1.00 30\n",
+ " macro avg 1.00 1.00 1.00 30\n",
+ "weighted avg 1.00 1.00 1.00 30\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import joblib\n",
+ "\n",
+ "# Persist the fitted classifier so it can be reloaded later without retraining.\n",
+ "# Only the estimator is saved; the StandardScaler `sc` is not, so inputs must be\n",
+ "# scaled the same way before calling predict on the reloaded model.\n",
+ "joblib.dump(value=model, filename='iris_logistic_regression_model.pkl')\n",
+ "print(\"Model saved to iris_logistic_regression_model.pkl\")\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "LbRKB89f9PeA",
+ "outputId": "dd884167-32aa-444b-984b-6c2302e6c80f"
+ },
+ "execution_count": 99,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Model saved to iris_logistic_regression_model.pkl\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Reload the classifier from the same file the previous cell wrote.\n",
+ "loaded_model = joblib.load('iris_logistic_regression_model.pkl')\n",
+ "print(\"Model loaded successfully\")\n",
+ "\n",
+ "# Sanity check: the reloaded model is applied to the already-scaled test split.\n",
+ "new_predictions = loaded_model.predict(X_test)\n",
+ "print(new_predictions)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HxP0rhZa9g-X",
+ "outputId": "2df09e3e-98a3-4d90-fd48-6007dfe2838f"
+ },
+ "execution_count": 100,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Model loaded successfully\n",
+ "[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Score the reloaded model's predictions against the same test labels.\n",
+ "accuracy = accuracy_score(y_test, new_predictions)\n",
+ "report = classification_report(y_test, new_predictions)\n",
+ "print(f\"Accuracy: {accuracy:.2f}\")\n",
+ "print(\"Classification Report:\")\n",
+ "print(report)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "xWcxawsV9jA5",
+ "outputId": "f22a4df2-1014-4c94-d802-ce1f2d9ea836"
+ },
+ "execution_count": 101,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Accuracy: 1.00\n",
+ "Classification Report:\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 1.00 1.00 1.00 10\n",
+ " 1 1.00 1.00 1.00 9\n",
+ " 2 1.00 1.00 1.00 11\n",
+ "\n",
+ " accuracy 1.00 30\n",
+ " macro avg 1.00 1.00 1.00 30\n",
+ "weighted avg 1.00 1.00 1.00 30\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "etijgWI29oUs"
+ },
+ "execution_count": 101,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file