{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "from datetime import datetime \n",
    "from datetime import date\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from keras.models import Sequential\n",
    "from keras.layers import LSTM, Dense\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import MinMaxScaler,StandardScaler\n",
    "from keras.callbacks import ModelCheckpoint\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read ../data/long_merge.csv into `merged`; the relative path is resolved against the\n",
    "# kernel's working directory.\n",
    "merged = pd.read_csv(r'../data/long_merge.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "zones = [72, 71, 63, 62, 60, 59, 58, 57, 50, 49, 44, 43, 35, 34, 33, 32, 31, 30, 29, 28]\n",
    "rtus = [2]  # only referenced by the disabled RTU selection further down\n",
    "\n",
    "SETPOINT_TAGS = ('cooling_sp', 'heating_sp')\n",
    "EXCLUDED_TAGS = ('co2', 'hw_valve')\n",
    "\n",
    "# Single scan of the columns per zone. The resulting order is unchanged: the\n",
    "# non-setpoint columns of every zone first, then the setpoint columns of every zone.\n",
    "measurement_cols = []\n",
    "setpoint_cols = []\n",
    "for zone in zones:\n",
    "    # Zero-pad the id to three digits instead of hard-coding one leading zero, so the\n",
    "    # tag is also right for ids below 10 or above 99 (assumes the zone_NNN column\n",
    "    # naming used by the selected columns).\n",
    "    zone_tag = f\"zone_{zone:03d}\"\n",
    "    for column in merged.columns:\n",
    "        if zone_tag not in column:\n",
    "            continue\n",
    "        if any(tag in column for tag in SETPOINT_TAGS):\n",
    "            setpoint_cols.append(column)\n",
    "        elif not any(tag in column for tag in EXCLUDED_TAGS):\n",
    "            measurement_cols.append(column)\n",
    "\n",
    "# Disabled: RTU filtered supply-air columns.\n",
    "# for rtu in rtus:\n",
    "#     for column in merged.columns:\n",
    "#         if f\"rtu_00{rtu}_fltrd_sa\" in column:\n",
    "#                 cols.append(column)\n",
    "\n",
    "weather_cols = [\n",
    "    'air_temp_set_1',\n",
    "    'air_temp_set_2',\n",
    "    'dew_point_temperature_set_1d',\n",
    "    'relative_humidity_set_1',\n",
    "    'solar_radiation_set_1',\n",
    "]\n",
    "\n",
    "# Final layout: timestamp, zone measurements, zone setpoints, weather.\n",
    "cols = ['date'] + measurement_cols + setpoint_cols + weather_cols\n",
    "input_dataset = merged[cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\arbal\\AppData\\Local\\Temp\\ipykernel_34660\\1855433847.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  input_dataset['date'] = pd.to_datetime(input_dataset['date'], format = \"%Y-%m-%d %H:%M:%S\")\n"
     ]
    }
   ],
   "source": [
    "# Parse the timestamp strings. `.assign` returns a new frame instead of writing into\n",
    "# `input_dataset` (a column selection of `merged`), which is what triggered pandas'\n",
    "# SettingWithCopyWarning.\n",
    "input_dataset = input_dataset.assign(\n",
    "    date=pd.to_datetime(input_dataset['date'], format=\"%Y-%m-%d %H:%M:%S\")\n",
    ")\n",
    "\n",
    "# Calendar day of each row, computed once for both bounds.\n",
    "sample_day = input_dataset['date'].dt.date\n",
    "# Both bounds are exclusive: keeps 2019-03-02 through 2020-12-31.\n",
    "df_filtered = input_dataset[(sample_day > date(2019, 3, 1)) & (sample_day < date(2021, 1, 1))]\n",
    "\n",
    "# Only reports missing values; nothing is dropped or filled here.\n",
    "if df_filtered.isna().any().any():\n",
    "    print(\"There are NA values in the DataFrame columns.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>zone_072_temp</th>\n",
       "      <th>zone_072_fan_spd</th>\n",
       "      <th>zone_071_temp</th>\n",
       "      <th>zone_071_fan_spd</th>\n",
       "      <th>zone_063_temp</th>\n",
       "      <th>zone_063_fan_spd</th>\n",
       "      <th>zone_062_temp</th>\n",
       "      <th>zone_062_fan_spd</th>\n",
       "      <th>zone_059_temp</th>\n",
       "      <th>...</th>\n",
       "      <th>zone_035_heating_sp</th>\n",
       "      <th>zone_032_cooling_sp</th>\n",
       "      <th>zone_032_heating_sp</th>\n",
       "      <th>zone_030_cooling_sp</th>\n",
       "      <th>zone_030_heating_sp</th>\n",
       "      <th>air_temp_set_1</th>\n",
       "      <th>air_temp_set_2</th>\n",
       "      <th>dew_point_temperature_set_1d</th>\n",
       "      <th>relative_humidity_set_1</th>\n",
       "      <th>solar_radiation_set_1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>511205</th>\n",
       "      <td>2019-03-02 00:00:00</td>\n",
       "      <td>71.2</td>\n",
       "      <td>40.0</td>\n",
       "      <td>71.5</td>\n",
       "      <td>20.0</td>\n",
       "      <td>72.3</td>\n",
       "      <td>20.0</td>\n",
       "      <td>72.9</td>\n",
       "      <td>55.0</td>\n",
       "      <td>71.9</td>\n",
       "      <td>...</td>\n",
       "      <td>70.0</td>\n",
       "      <td>74.000000</td>\n",
       "      <td>68.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>67.0</td>\n",
       "      <td>11.590</td>\n",
       "      <td>11.130</td>\n",
       "      <td>3.00</td>\n",
       "      <td>55.87</td>\n",
       "      <td>120.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>511206</th>\n",
       "      <td>2019-03-02 00:01:00</td>\n",
       "      <td>71.2</td>\n",
       "      <td>40.0</td>\n",
       "      <td>71.5</td>\n",
       "      <td>20.0</td>\n",
       "      <td>72.3</td>\n",
       "      <td>20.0</td>\n",
       "      <td>72.9</td>\n",
       "      <td>55.0</td>\n",
       "      <td>71.9</td>\n",
       "      <td>...</td>\n",
       "      <td>70.0</td>\n",
       "      <td>74.000000</td>\n",
       "      <td>68.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>67.0</td>\n",
       "      <td>11.590</td>\n",
       "      <td>11.130</td>\n",
       "      <td>3.00</td>\n",
       "      <td>55.87</td>\n",
       "      <td>120.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>511207</th>\n",
       "      <td>2019-03-02 00:02:00</td>\n",
       "      <td>71.2</td>\n",
       "      <td>40.0</td>\n",
       "      <td>71.5</td>\n",
       "      <td>20.0</td>\n",
       "      <td>72.3</td>\n",
       "      <td>20.0</td>\n",
       "      <td>72.6</td>\n",
       "      <td>55.0</td>\n",
       "      <td>71.9</td>\n",
       "      <td>...</td>\n",
       "      <td>70.0</td>\n",
       "      <td>74.000000</td>\n",
       "      <td>68.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>67.0</td>\n",
       "      <td>11.590</td>\n",
       "      <td>11.130</td>\n",
       "      <td>3.00</td>\n",
       "      <td>55.87</td>\n",
       "      <td>120.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>511208</th>\n",
       "      <td>2019-03-02 00:03:00</td>\n",
       "      <td>71.2</td>\n",
       "      <td>40.0</td>\n",
       "      <td>71.5</td>\n",
       "      <td>20.0</td>\n",
       "      <td>72.3</td>\n",
       "      <td>20.0</td>\n",
       "      <td>72.9</td>\n",
       "      <td>55.0</td>\n",
       "      <td>71.9</td>\n",
       "      <td>...</td>\n",
       "      <td>70.0</td>\n",
       "      <td>74.000000</td>\n",
       "      <td>68.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>67.0</td>\n",
       "      <td>11.590</td>\n",
       "      <td>11.130</td>\n",
       "      <td>3.00</td>\n",
       "      <td>55.87</td>\n",
       "      <td>120.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>511209</th>\n",
       "      <td>2019-03-02 00:04:00</td>\n",
       "      <td>71.2</td>\n",
       "      <td>40.0</td>\n",
       "      <td>71.4</td>\n",
       "      <td>20.0</td>\n",
       "      <td>72.3</td>\n",
       "      <td>20.0</td>\n",
       "      <td>72.9</td>\n",
       "      <td>55.0</td>\n",
       "      <td>71.9</td>\n",
       "      <td>...</td>\n",
       "      <td>70.0</td>\n",
       "      <td>74.000000</td>\n",
       "      <td>68.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>67.0</td>\n",
       "      <td>11.590</td>\n",
       "      <td>11.130</td>\n",
       "      <td>3.00</td>\n",
       "      <td>55.87</td>\n",
       "      <td>120.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2072148</th>\n",
       "      <td>2020-12-31 23:57:00</td>\n",
       "      <td>69.5</td>\n",
       "      <td>40.0</td>\n",
       "      <td>71.2</td>\n",
       "      <td>20.0</td>\n",
       "      <td>68.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>67.6</td>\n",
       "      <td>40.0</td>\n",
       "      <td>67.5</td>\n",
       "      <td>...</td>\n",
       "      <td>68.0</td>\n",
       "      <td>72.714138</td>\n",
       "      <td>71.0</td>\n",
       "      <td>71.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>13.994</td>\n",
       "      <td>13.528</td>\n",
       "      <td>4.11</td>\n",
       "      <td>51.61</td>\n",
       "      <td>188.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2072149</th>\n",
       "      <td>2020-12-31 23:58:00</td>\n",
       "      <td>69.5</td>\n",
       "      <td>40.0</td>\n",
       "      <td>71.3</td>\n",
       "      <td>20.0</td>\n",
       "      <td>68.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>67.6</td>\n",
       "      <td>40.0</td>\n",
       "      <td>67.5</td>\n",
       "      <td>...</td>\n",
       "      <td>68.0</td>\n",
       "      <td>72.714138</td>\n",
       "      <td>71.0</td>\n",
       "      <td>71.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>13.994</td>\n",
       "      <td>13.528</td>\n",
       "      <td>4.11</td>\n",
       "      <td>51.61</td>\n",
       "      <td>188.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2072150</th>\n",
       "      <td>2020-12-31 23:58:00</td>\n",
       "      <td>69.5</td>\n",
       "      <td>40.0</td>\n",
       "      <td>71.3</td>\n",
       "      <td>20.0</td>\n",
       "      <td>68.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>67.6</td>\n",
       "      <td>40.0</td>\n",
       "      <td>67.5</td>\n",
       "      <td>...</td>\n",
       "      <td>68.0</td>\n",
       "      <td>72.714138</td>\n",
       "      <td>71.0</td>\n",
       "      <td>71.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>13.994</td>\n",
       "      <td>13.528</td>\n",
       "      <td>4.11</td>\n",
       "      <td>51.61</td>\n",
       "      <td>188.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2072151</th>\n",
       "      <td>2020-12-31 23:59:00</td>\n",
       "      <td>69.5</td>\n",
       "      <td>40.0</td>\n",
       "      <td>71.5</td>\n",
       "      <td>20.0</td>\n",
       "      <td>68.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>67.6</td>\n",
       "      <td>40.0</td>\n",
       "      <td>67.5</td>\n",
       "      <td>...</td>\n",
       "      <td>68.0</td>\n",
       "      <td>72.714138</td>\n",
       "      <td>71.0</td>\n",
       "      <td>71.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>13.994</td>\n",
       "      <td>13.528</td>\n",
       "      <td>4.11</td>\n",
       "      <td>51.61</td>\n",
       "      <td>188.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2072152</th>\n",
       "      <td>2020-12-31 23:59:00</td>\n",
       "      <td>69.5</td>\n",
       "      <td>40.0</td>\n",
       "      <td>71.5</td>\n",
       "      <td>20.0</td>\n",
       "      <td>68.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>67.6</td>\n",
       "      <td>40.0</td>\n",
       "      <td>67.5</td>\n",
       "      <td>...</td>\n",
       "      <td>68.0</td>\n",
       "      <td>72.714138</td>\n",
       "      <td>71.0</td>\n",
       "      <td>71.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>13.994</td>\n",
       "      <td>13.528</td>\n",
       "      <td>4.11</td>\n",
       "      <td>51.61</td>\n",
       "      <td>188.8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1560948 rows × 56 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       date  zone_072_temp  zone_072_fan_spd  zone_071_temp  \\\n",
       "511205  2019-03-02 00:00:00           71.2              40.0           71.5   \n",
       "511206  2019-03-02 00:01:00           71.2              40.0           71.5   \n",
       "511207  2019-03-02 00:02:00           71.2              40.0           71.5   \n",
       "511208  2019-03-02 00:03:00           71.2              40.0           71.5   \n",
       "511209  2019-03-02 00:04:00           71.2              40.0           71.4   \n",
       "...                     ...            ...               ...            ...   \n",
       "2072148 2020-12-31 23:57:00           69.5              40.0           71.2   \n",
       "2072149 2020-12-31 23:58:00           69.5              40.0           71.3   \n",
       "2072150 2020-12-31 23:58:00           69.5              40.0           71.3   \n",
       "2072151 2020-12-31 23:59:00           69.5              40.0           71.5   \n",
       "2072152 2020-12-31 23:59:00           69.5              40.0           71.5   \n",
       "\n",
       "         zone_071_fan_spd  zone_063_temp  zone_063_fan_spd  zone_062_temp  \\\n",
       "511205               20.0           72.3              20.0           72.9   \n",
       "511206               20.0           72.3              20.0           72.9   \n",
       "511207               20.0           72.3              20.0           72.6   \n",
       "511208               20.0           72.3              20.0           72.9   \n",
       "511209               20.0           72.3              20.0           72.9   \n",
       "...                   ...            ...               ...            ...   \n",
       "2072148              20.0           68.0              20.0           67.6   \n",
       "2072149              20.0           68.0              20.0           67.6   \n",
       "2072150              20.0           68.0              20.0           67.6   \n",
       "2072151              20.0           68.0              20.0           67.6   \n",
       "2072152              20.0           68.0              20.0           67.6   \n",
       "\n",
       "         zone_062_fan_spd  zone_059_temp  ...  zone_035_heating_sp  \\\n",
       "511205               55.0           71.9  ...                 70.0   \n",
       "511206               55.0           71.9  ...                 70.0   \n",
       "511207               55.0           71.9  ...                 70.0   \n",
       "511208               55.0           71.9  ...                 70.0   \n",
       "511209               55.0           71.9  ...                 70.0   \n",
       "...                   ...            ...  ...                  ...   \n",
       "2072148              40.0           67.5  ...                 68.0   \n",
       "2072149              40.0           67.5  ...                 68.0   \n",
       "2072150              40.0           67.5  ...                 68.0   \n",
       "2072151              40.0           67.5  ...                 68.0   \n",
       "2072152              40.0           67.5  ...                 68.0   \n",
       "\n",
       "         zone_032_cooling_sp  zone_032_heating_sp  zone_030_cooling_sp  \\\n",
       "511205             74.000000                 68.0                 73.0   \n",
       "511206             74.000000                 68.0                 73.0   \n",
       "511207             74.000000                 68.0                 73.0   \n",
       "511208             74.000000                 68.0                 73.0   \n",
       "511209             74.000000                 68.0                 73.0   \n",
       "...                      ...                  ...                  ...   \n",
       "2072148            72.714138                 71.0                 71.0   \n",
       "2072149            72.714138                 71.0                 71.0   \n",
       "2072150            72.714138                 71.0                 71.0   \n",
       "2072151            72.714138                 71.0                 71.0   \n",
       "2072152            72.714138                 71.0                 71.0   \n",
       "\n",
       "         zone_030_heating_sp  air_temp_set_1  air_temp_set_2  \\\n",
       "511205                  67.0          11.590          11.130   \n",
       "511206                  67.0          11.590          11.130   \n",
       "511207                  67.0          11.590          11.130   \n",
       "511208                  67.0          11.590          11.130   \n",
       "511209                  67.0          11.590          11.130   \n",
       "...                      ...             ...             ...   \n",
       "2072148                 70.0          13.994          13.528   \n",
       "2072149                 70.0          13.994          13.528   \n",
       "2072150                 70.0          13.994          13.528   \n",
       "2072151                 70.0          13.994          13.528   \n",
       "2072152                 70.0          13.994          13.528   \n",
       "\n",
       "         dew_point_temperature_set_1d  relative_humidity_set_1  \\\n",
       "511205                           3.00                    55.87   \n",
       "511206                           3.00                    55.87   \n",
       "511207                           3.00                    55.87   \n",
       "511208                           3.00                    55.87   \n",
       "511209                           3.00                    55.87   \n",
       "...                               ...                      ...   \n",
       "2072148                          4.11                    51.61   \n",
       "2072149                          4.11                    51.61   \n",
       "2072150                          4.11                    51.61   \n",
       "2072151                          4.11                    51.61   \n",
       "2072152                          4.11                    51.61   \n",
       "\n",
       "         solar_radiation_set_1  \n",
       "511205                   120.3  \n",
       "511206                   120.3  \n",
       "511207                   120.3  \n",
       "511208                   120.3  \n",
       "511209                   120.3  \n",
       "...                        ...  \n",
       "2072148                  188.8  \n",
       "2072149                  188.8  \n",
       "2072150                  188.8  \n",
       "2072151                  188.8  \n",
       "2072152                  188.8  \n",
       "\n",
       "[1560948 rows x 56 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the date-filtered frame (pandas abbreviates the rendering of large frames).\n",
    "df_filtered"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Calendar day of every row, shared by all masks below.\n",
    "sample_day = df_filtered['date'].dt.date\n",
    "\n",
    "# Test window: 2020-03-02 .. 2020-06-30 (both bounds exclusive).\n",
    "in_test = (sample_day > date(2020, 3, 1)) & (sample_day < date(2020, 7, 1))\n",
    "# Training windows: 2019-03-02 .. 2020-02-29 and 2020-07-02 .. 2020-11-30.\n",
    "before_test = (sample_day > date(2019, 3, 1)) & (sample_day < date(2020, 3, 1))\n",
    "after_test = (sample_day > date(2020, 7, 1)) & (sample_day < date(2020, 12, 1))\n",
    "\n",
    "testdataset_df = df_filtered[in_test]\n",
    "traindataset_df = df_filtered[before_test | after_test]\n",
    "\n",
    "# The arrays used downstream hold every column except the timestamp.\n",
    "testdataset = testdataset_df.drop(columns=[\"date\"]).values\n",
    "traindataset = traindataset_df.drop(columns=[\"date\"]).values\n",
    "\n",
    "# Cell output: training columns that contain at least one NaN.\n",
    "columns_with_na = traindataset_df.columns[traindataset_df.isna().any()].tolist()\n",
    "columns_with_na"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['date', 'zone_072_temp', 'zone_072_fan_spd', 'zone_071_temp',\n",
       "       'zone_071_fan_spd', 'zone_063_temp', 'zone_063_fan_spd',\n",
       "       'zone_062_temp', 'zone_062_fan_spd', 'zone_059_temp',\n",
       "       'zone_059_fan_spd', 'zone_058_temp', 'zone_058_fan_spd',\n",
       "       'zone_057_temp', 'zone_057_fan_spd', 'zone_049_temp',\n",
       "       'zone_049_fan_spd', 'zone_044_temp', 'zone_044_fan_spd',\n",
       "       'zone_043_temp', 'zone_043_fan_spd', 'zone_035_temp',\n",
       "       'zone_035_fan_spd', 'zone_033_temp', 'zone_033_fan_spd',\n",
       "       'zone_032_temp', 'zone_032_fan_spd', 'zone_030_temp',\n",
       "       'zone_030_fan_spd', 'zone_028_temp', 'zone_028_fan_spd'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First 31 column labels of the training frame. 'date' is still part of this frame,\n",
    "# whereas `traindataset` was built after dropping it, so array indices are shifted by one.\n",
    "traindataset_df.columns[0:31]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 0\n"
     ]
    }
   ],
   "source": [
    "# Total number of NaN cells in the training frame and in the test frame.\n",
    "print(traindataset_df.isna().sum().sum(), testdataset_df.isna().sum().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1073512, 391818)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Row counts of the training and test arrays.\n",
    "len(traindataset), len(testdataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standardise the features: the scaler is fitted on the training array only and then\n",
    "# applied unchanged to the test array. Both arrays are cast to float32 beforehand.\n",
    "scaler = StandardScaler()\n",
    "traindataset = scaler.fit_transform(traindataset.astype('float32'))\n",
    "testdataset = scaler.transform(testdataset.astype('float32'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1073512, 55)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape of the training array as (rows, features).\n",
    "traindataset.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shorter aliases for the train/test arrays (same objects, nothing is copied).\n",
    "train,test = traindataset,testdataset\n",
    "\n",
    "def create_dataset(dataset, time_step, n_targets=31):\n",
    "    \"\"\"Cut a 2-D array into sliding input windows and next-row targets.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    dataset : array-like of shape (n_rows, n_features)\n",
    "        Rows in time order.\n",
    "    time_step : int\n",
    "        Number of consecutive rows in every input window.\n",
    "    n_targets : int, default 31\n",
    "        The target of a window is made of the first `n_targets` columns of the row\n",
    "        that immediately follows it.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    x : ndarray of shape (n_samples, time_step, n_features)\n",
    "    Y : ndarray of shape (n_samples, min(n_targets, n_features))\n",
    "        with n_samples = max(n_rows - time_step, 0).\n",
    "    \"\"\"\n",
    "    # NOTE(review): this notebook passes arrays built after dropping 'date', so with the\n",
    "    # default of 31 the last target column is the one right after the 30 zone temp/fan\n",
    "    # columns -- confirm whether 30 was intended.\n",
    "    dataset = np.asarray(dataset)\n",
    "    # Window i covers rows i .. i+time_step-1 and is paired with row i+time_step, so the\n",
    "    # last valid start is n_rows - time_step - 1 (the former bound stopped one short).\n",
    "    n_samples = max(len(dataset) - time_step, 0)\n",
    "    if n_samples == 0:\n",
    "        # Too few rows for a single window: return empty arrays that keep their rank.\n",
    "        x = np.empty((0, time_step, dataset.shape[1]), dtype=dataset.dtype)\n",
    "        Y = dataset[:0, 0:n_targets].copy()\n",
    "        return x, Y\n",
    "    x = np.array([dataset[start:start + time_step, :] for start in range(n_samples)])\n",
    "    # Targets are the rows time_step .. n_rows-1; copy so Y does not alias `dataset`.\n",
    "    Y = dataset[time_step:, 0:n_targets].copy()\n",
    "    return x, Y\n",
    "# Window length passed to create_dataset: 30 consecutive rows per sample.\n",
    "time_step = 30\n",
    "# The two splits are windowed independently of each other.\n",
    "X_train, y_train = create_dataset(train, time_step)\n",
    "X_test, y_test = create_dataset(test, time_step)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1073481, 30, 55), (1073481, 31))"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shapes of the windowed training inputs and of their targets.\n",
    "X_train.shape, y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/5\n",
      "8386/8387 [============================>.] - ETA: 0s - loss: 0.0512\n",
      "Epoch 1: val_loss improved from inf to 0.29908, saving model to lstm_vav_02.tf\n",
      "INFO:tensorflow:Assets written to: lstm_vav_02.tf\\assets\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: lstm_vav_02.tf\\assets\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8387/8387 [==============================] - 337s 40ms/step - loss: 0.0512 - val_loss: 0.2991\n",
      "Epoch 2/5\n",
      "8387/8387 [==============================] - ETA: 0s - loss: 0.0213\n",
      "Epoch 2: val_loss improved from 0.29908 to 0.23285, saving model to lstm_vav_02.tf\n",
      "INFO:tensorflow:Assets written to: lstm_vav_02.tf\\assets\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: lstm_vav_02.tf\\assets\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8387/8387 [==============================] - 300s 36ms/step - loss: 0.0213 - val_loss: 0.2328\n",
      "Epoch 3/5\n",
      "8387/8387 [==============================] - ETA: 0s - loss: 0.0152\n",
      "Epoch 3: val_loss did not improve from 0.23285\n",
      "8387/8387 [==============================] - 359s 43ms/step - loss: 0.0152 - val_loss: 0.2426\n",
      "Epoch 4/5\n",
      "8387/8387 [==============================] - ETA: 0s - loss: 0.0106\n",
      "Epoch 4: val_loss did not improve from 0.23285\n",
      "8387/8387 [==============================] - 350s 42ms/step - loss: 0.0106 - val_loss: 0.2556\n",
      "Epoch 5/5\n",
      "8386/8387 [============================>.] - ETA: 0s - loss: 0.0096\n",
      "Epoch 5: val_loss did not improve from 0.23285\n",
      "8387/8387 [==============================] - 349s 42ms/step - loss: 0.0096 - val_loss: 0.2635\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.src.callbacks.History at 0x27dbd115490>"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "# Three stacked LSTM layers (50, 50 and 30 units) followed by a Dense layer with one\n",
    "# output per target column.\n",
    "n_steps, n_features = X_train.shape[1], X_train.shape[2]\n",
    "model = Sequential([\n",
    "    LSTM(units=50, return_sequences=True, input_shape=(n_steps, n_features)),\n",
    "    LSTM(units=50, return_sequences=True),\n",
    "    LSTM(units=30),\n",
    "    Dense(units=y_train.shape[1]),\n",
    "])\n",
    "model.compile(optimizer='adam', loss='mean_squared_error')\n",
    "\n",
    "# Save to `checkpoint_path` only on epochs where val_loss improves.\n",
    "# NOTE(review): the test windows double as validation data, so this selection is made\n",
    "# on the test set.\n",
    "checkpoint_path = \"lstm_vav_02.tf\"\n",
    "checkpoint_callback = ModelCheckpoint(\n",
    "    filepath=checkpoint_path,\n",
    "    monitor='val_loss',\n",
    "    mode='min',\n",
    "    save_best_only=True,\n",
    "    verbose=1,\n",
    ")\n",
    "model.fit(\n",
    "    X_train,\n",
    "    y_train,\n",
    "    validation_data=(X_test, y_test),\n",
    "    epochs=5,\n",
    "    batch_size=128,\n",
    "    verbose=1,\n",
    "    callbacks=[checkpoint_callback],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x2a4b2344610>"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reload into `model` what the checkpoint callback stored at `checkpoint_path`.\n",
    "model.load_weights(checkpoint_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12244/12244 [==============================] - 60s 5ms/step\n"
     ]
    }
   ],
   "source": [
    "# Model predictions for the test windows.\n",
    "test_predict1 = model.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib qt\n",
    "var = 1  # index of the target column to inspect\n",
    "\n",
    "# Samples whose absolute prediction error exceeds 0.38 (in scaled units) are marked.\n",
    "abs_error = np.abs(test_predict1[:, var] - y_test[:, var])\n",
    "anomalies = np.where(abs_error > 0.38)\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(y_test[:, var], label='Original Testing Data', color='blue')\n",
    "ax.plot(test_predict1[:, var], label='Predicted Testing Data', color='red', alpha=0.8)\n",
    "ax.scatter(anomalies, test_predict1[anomalies, var], color='black', marker=\"o\", s=100)\n",
    "\n",
    "ax.set_title('Testing Data - Predicted vs Actual')\n",
    "ax.set_xlabel('Time')\n",
    "ax.set_ylabel('Value')\n",
    "ax.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "33547/33547 [==============================] - 201s 6ms/step\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "operands could not be broadcast together with shapes (1073481,) (391787,) ",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[36], line 8\u001b[0m\n\u001b[0;32m      6\u001b[0m plt\u001b[38;5;241m.\u001b[39mplot(y_train[:,var], label\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mOriginal Testing Data\u001b[39m\u001b[38;5;124m'\u001b[39m, color\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mblue\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m      7\u001b[0m plt\u001b[38;5;241m.\u001b[39mplot(test_predict2[:,var], label\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mPredicted Testing Data\u001b[39m\u001b[38;5;124m'\u001b[39m, color\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mred\u001b[39m\u001b[38;5;124m'\u001b[39m,alpha\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.8\u001b[39m)\n\u001b[1;32m----> 8\u001b[0m anomalies \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mwhere(\u001b[38;5;28mabs\u001b[39m(\u001b[43mtest_predict2\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43m,\u001b[49m\u001b[43mvar\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43my_test\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43m,\u001b[49m\u001b[43mvar\u001b[49m\u001b[43m]\u001b[49m) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0.38\u001b[39m)\n\u001b[0;32m      9\u001b[0m plt\u001b[38;5;241m.\u001b[39mscatter(anomalies,test_predict2[anomalies,var], color\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mblack\u001b[39m\u001b[38;5;124m'\u001b[39m,marker \u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mo\u001b[39m\u001b[38;5;124m\"\u001b[39m,s\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m )\n\u001b[0;32m     12\u001b[0m plt\u001b[38;5;241m.\u001b[39mtitle(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTraining Data - Predicted vs Actual\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
      "\u001b[1;31mValueError\u001b[0m: operands could not be broadcast together with shapes (1073481,) (391787,) "
     ]
    }
   ],
   "source": [
    "%matplotlib qt\n",
    "# Predictions for the *training* windows, despite the `test_` prefix of the name.\n",
    "test_predict2 = model.predict(X_train)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()\n",
    "var = 3\n",
    "plt.plot(y_train[:,var], label='Original Training Data', color='blue')\n",
    "plt.plot(test_predict2[:,var], label='Predicted Training Data', color='red',alpha=0.8)\n",
    "anomalies = np.where(abs(test_predict2[:,var] - y_train[:,var]) > 0.38)\n",
    "plt.scatter(anomalies,test_predict2[anomalies,var], color='black',marker =\"o\",s=100 )\n",
    "\n",
    "\n",
    "plt.title('Training Data - Predicted vs Actual')\n",
    "plt.xlabel('Time')\n",
    "plt.ylabel('Value')\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.mixture import GaussianMixture\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "\n",
    "# Generating random data for demonstration\n",
    "np.random.seed(0)\n",
    "X =  test_predict1 - y_test\n",
    "\n",
    "\n",
    "pca = PCA(n_components=2)\n",
    "X = pca.fit_transform(X)\n",
    "\n",
    "\n",
    "# Creating the GMM instance with desired number of clusters\n",
    "gmm = GaussianMixture(n_components=2)\n",
    "\n",
    "# Fitting the model to the data\n",
    "gmm.fit(X)\n",
    "\n",
    "# Getting the cluster labels\n",
    "labels = gmm.predict(X)\n",
    "\n",
    "# Plotting the data points with colors representing different clusters\n",
    "plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', alpha=0.5)\n",
    "plt.title('GMM Clustering')\n",
    "plt.xlabel('Feature 1')\n",
    "plt.ylabel('Feature 2')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "# Generating random data for demonstration\n",
    "np.random.seed(0)\n",
    "X = (test_predict1 - y_test)\n",
    "\n",
    "k = 6\n",
    "\n",
    "kmeans = KMeans(n_clusters=k)\n",
    "\n",
    "kmeans.fit(X)\n",
    "\n",
    "\n",
    "pca = PCA(n_components=2)\n",
    "X = pca.fit_transform(X)\n",
    "\n",
    "\n",
    "\n",
    "# Getting the cluster centers and labels\n",
    "centroids = kmeans.cluster_centers_\n",
    "centroids = pca.transform(centroids)\n",
    "labels = kmeans.labels_\n",
    "\n",
    "# Plotting the data points and cluster centers\n",
    "plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', alpha=0.5)\n",
    "plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', c='red', s=200, linewidths=2)\n",
    "plt.title('KMeans Clustering')\n",
    "plt.xlabel('Feature 1')\n",
    "plt.ylabel('Feature 2')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "k = 60\n",
    "X= test_predict1 - y_test\n",
    "processed_data = []\n",
    "feat_df = pd.DataFrame(columns=[\"mean\",\"std\",])\n",
    "for i in range(0,len(X), 60):\n",
    "    mean = X[i:i+k].mean(axis = 0)\n",
    "    std = X[i:i+k].std(axis = 0)\n",
    "    max = X[i:i+k].max(axis = 0)\n",
    "    min = X[i:i+k].min(axis = 0)\n",
    "    iqr = np.percentile(X[i:i+k], 75, axis=0) - np.percentile(X[i:i+k], 25,axis=0)\n",
    "    data = np.concatenate([mean, std, max, min, iqr])\n",
    "    processed_data.append([data])\n",
    "processed_data = np.concatenate(processed_data,axis=0) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = processed_data\n",
    "\n",
    "kmeans = KMeans(n_clusters=2, algorithm='elkan', max_iter=1000, n_init = 5)\n",
    "\n",
    "kmeans.fit(X)\n",
    "\n",
    "pca = PCA(n_components=2)\n",
    "X = pca.fit_transform(X)\n",
    "\n",
    "\n",
    "# Getting the cluster centers and labels\n",
    "centroids = kmeans.cluster_centers_\n",
    "centroids = pca.transform(centroids)\n",
    "labels = kmeans.labels_\n",
    "\n",
    "# Plotting the data points and cluster centers\n",
    "plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', alpha=0.5)\n",
    "plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', c='red', s=200, linewidths=2)\n",
    "plt.title('KMeans Clustering')\n",
    "plt.xlabel('Feature 1')\n",
    "plt.ylabel('Feature 2')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.mixture import GaussianMixture\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "# Generating random data for demonstration\n",
    "np.random.seed(0)\n",
    "X = processed_data\n",
    "\n",
    "# Creating the GMM instance with desired number of clusters\n",
    "gmm = GaussianMixture(n_components=2, init_params='k-means++')\n",
    "\n",
    "# Fitting the model to the data\n",
    "gmm.fit(X)\n",
    "labels = gmm.predict(X)\n",
    "\n",
    "\n",
    "pca = PCA(n_components=2)\n",
    "X = pca.fit_transform(X)\n",
    "\n",
    "\n",
    "# Getting the cluster labels\n",
    "\n",
    "# Plotting the data points with colors representing different clusters\n",
    "plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', alpha=0.5)\n",
    "plt.title('GMM Clustering')\n",
    "plt.xlabel('Feature 1')\n",
    "plt.ylabel('Feature 2')\n",
    "plt.show()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "# Generating random data for demonstration\n",
    "np.random.seed(0)\n",
    "X = test_predict1 - y_test \n",
    "\n",
    "kmeans = KMeans(n_clusters=2)\n",
    "\n",
    "kmeans.fit(X)\n",
    "\n",
    "\n",
    "pca = PCA(n_components=2)\n",
    "X = pca.fit_transform(X)\n",
    "\n",
    "\n",
    "\n",
    "# Getting the cluster centers and labels\n",
    "centroids = kmeans.cluster_centers_\n",
    "centroids = pca.transform(centroids)\n",
    "labels = kmeans.labels_\n",
    "\n",
    "# Plotting the data points and cluster centers\n",
    "plt.figure()\n",
    "plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', alpha=0.5)\n",
    "plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', c='red', s=200, linewidths=2)\n",
    "plt.text(centroids[0,0], centroids[0,1], 'Normal', fontsize=12, color='red')\n",
    "plt.text(centroids[1,0], centroids[1,1], 'Anomaly', fontsize=12, color='red')\n",
    "plt.title('KMeans Clustering')\n",
    "plt.xlabel('Feature 1')\n",
    "plt.ylabel('Feature 2')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "329810"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum(labels==0)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tensorflow",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}