{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library\n",
    "from datetime import date\n",
    "from datetime import datetime\n",
    "\n",
    "# Third-party (the sklearn clustering/PCA names are used by the analysis cells further down)\n",
    "import joblib\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "from keras.callbacks import ModelCheckpoint\n",
    "from keras.layers import LSTM, Dense\n",
    "from keras.models import Sequential\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.mixture import GaussianMixture\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import MinMaxScaler,StandardScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the merged dataset; the path is relative to the notebook's working directory.\n",
    "merged = pd.read_csv(r'../data/long_merge.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Zone numbers whose columns are selected as inputs.\n",
    "zones = [72, 71, 63, 62, 60, 59, 58, 57, 50, 49, 44, 43, 35, 34, 33, 32, 31, 30, 29, 28]\n",
    "rtus = [2]  # not used by the selection below\n",
    "\n",
    "# Name fragments that split the zone columns into the two groups built below.\n",
    "setpoint_markers = ('cooling_sp', 'heating_sp')\n",
    "skipped_markers = ('co2', 'hw_valve') + setpoint_markers\n",
    "weather_cols = [\n",
    "    'air_temp_set_1',\n",
    "    'air_temp_set_2',\n",
    "    'dew_point_temperature_set_1d',\n",
    "    'relative_humidity_set_1',\n",
    "    'solar_radiation_set_1',\n",
    "]\n",
    "\n",
    "# Zero-pad to three digits so the tag matches names such as 'zone_072_temp';\n",
    "# this stays correct if a three-digit zone number is ever added to `zones`.\n",
    "zone_tags = [f'zone_{zone:03d}' for zone in zones]\n",
    "\n",
    "# First group: every zone column except the co2, hw_valve and set-point ones,\n",
    "# visited zone by zone in the order given by `zones`.\n",
    "sensor_cols = [\n",
    "    column\n",
    "    for tag in zone_tags\n",
    "    for column in merged.columns\n",
    "    if tag in column and not any(marker in column for marker in skipped_markers)\n",
    "]\n",
    "\n",
    "# Second group: the cooling/heating set-point columns, again zone by zone.\n",
    "setpoint_cols = [\n",
    "    column\n",
    "    for tag in zone_tags\n",
    "    for column in merged.columns\n",
    "    if tag in column and any(marker in column for marker in setpoint_markers)\n",
    "]\n",
    "\n",
    "cols = ['date'] + sensor_cols + setpoint_cols + weather_cols\n",
    "\n",
    "# .copy() makes input_dataset an independent frame, so assigning to its columns\n",
    "# later neither touches `merged` nor raises SettingWithCopyWarning.\n",
    "input_dataset = merged[cols].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\arbal\\AppData\\Local\\Temp\\ipykernel_18672\\1855433847.py:1: SettingWithCopyWarning: 
\n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      " input_dataset['date'] = pd.to_datetime(input_dataset['date'], format = \"%Y-%m-%d %H:%M:%S\")\n"
     ]
    }
   ],
   "source": [
    "# Parse the timestamp column into a new frame. DataFrame.assign returns a fresh\n",
    "# object instead of writing into the selection taken from `merged`, which is\n",
    "# what triggered SettingWithCopyWarning.\n",
    "input_dataset = input_dataset.assign(\n",
    "    date=pd.to_datetime(input_dataset['date'], format='%Y-%m-%d %H:%M:%S')\n",
    ")\n",
    "\n",
    "# Keep rows dated 2019-03-02 through 2020-12-31. Comparing the datetime64 column\n",
    "# with Timestamps selects the same rows as the calendar-date test\n",
    "# (date > 2019-03-01 and date < 2021-01-01) without building one Python\n",
    "# date object per row.\n",
    "in_window = (input_dataset['date'] >= pd.Timestamp(2019, 3, 2)) & (\n",
    "    input_dataset['date'] < pd.Timestamp(2021, 1, 1)\n",
    ")\n",
    "df_filtered = input_dataset[in_window]\n",
    "\n",
    "if df_filtered.isna().any().any():\n",
    "    print(\"There are NA values in the DataFrame columns.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datezone_072_tempzone_072_fan_spdzone_071_tempzone_071_fan_spdzone_063_tempzone_063_fan_spdzone_062_tempzone_062_fan_spdzone_059_temp...zone_035_heating_spzone_032_cooling_spzone_032_heating_spzone_030_cooling_spzone_030_heating_spair_temp_set_1air_temp_set_2dew_point_temperature_set_1drelative_humidity_set_1solar_radiation_set_1
5112052019-03-02 00:00:0071.240.071.520.072.320.072.955.071.9...70.074.00000068.073.067.011.59011.1303.0055.87120.3
5112062019-03-02 00:01:0071.240.071.520.072.320.072.955.071.9...70.074.00000068.073.067.011.59011.1303.0055.87120.3
5112072019-03-02 00:02:0071.240.071.520.072.320.072.655.071.9...70.074.00000068.073.067.011.59011.1303.0055.87120.3
5112082019-03-02 00:03:0071.240.071.520.072.320.072.955.071.9...70.074.00000068.073.067.011.59011.1303.0055.87120.3
5112092019-03-02 00:04:0071.240.071.420.072.320.072.955.071.9...70.074.00000068.073.067.011.59011.1303.0055.87120.3
..................................................................
20721482020-12-31 23:57:0069.540.071.220.068.020.067.640.067.5...68.072.71413871.071.070.013.99413.5284.1151.61188.8
20721492020-12-31 23:58:0069.540.071.320.068.020.067.640.067.5...68.072.71413871.071.070.013.99413.5284.1151.61188.8
20721502020-12-31 23:58:0069.540.071.320.068.020.067.640.067.5...68.072.71413871.071.070.013.99413.5284.1151.61188.8
20721512020-12-31 23:59:0069.540.071.520.068.020.067.640.067.5...68.072.71413871.071.070.013.99413.5284.1151.61188.8
20721522020-12-31 23:59:0069.540.071.520.068.020.067.640.067.5...68.072.71413871.071.070.013.99413.5284.1151.61188.8
\n", "

1560948 rows × 56 columns

\n", "
" ], "text/plain": [ " date zone_072_temp zone_072_fan_spd zone_071_temp \\\n", "511205 2019-03-02 00:00:00 71.2 40.0 71.5 \n", "511206 2019-03-02 00:01:00 71.2 40.0 71.5 \n", "511207 2019-03-02 00:02:00 71.2 40.0 71.5 \n", "511208 2019-03-02 00:03:00 71.2 40.0 71.5 \n", "511209 2019-03-02 00:04:00 71.2 40.0 71.4 \n", "... ... ... ... ... \n", "2072148 2020-12-31 23:57:00 69.5 40.0 71.2 \n", "2072149 2020-12-31 23:58:00 69.5 40.0 71.3 \n", "2072150 2020-12-31 23:58:00 69.5 40.0 71.3 \n", "2072151 2020-12-31 23:59:00 69.5 40.0 71.5 \n", "2072152 2020-12-31 23:59:00 69.5 40.0 71.5 \n", "\n", " zone_071_fan_spd zone_063_temp zone_063_fan_spd zone_062_temp \\\n", "511205 20.0 72.3 20.0 72.9 \n", "511206 20.0 72.3 20.0 72.9 \n", "511207 20.0 72.3 20.0 72.6 \n", "511208 20.0 72.3 20.0 72.9 \n", "511209 20.0 72.3 20.0 72.9 \n", "... ... ... ... ... \n", "2072148 20.0 68.0 20.0 67.6 \n", "2072149 20.0 68.0 20.0 67.6 \n", "2072150 20.0 68.0 20.0 67.6 \n", "2072151 20.0 68.0 20.0 67.6 \n", "2072152 20.0 68.0 20.0 67.6 \n", "\n", " zone_062_fan_spd zone_059_temp ... zone_035_heating_sp \\\n", "511205 55.0 71.9 ... 70.0 \n", "511206 55.0 71.9 ... 70.0 \n", "511207 55.0 71.9 ... 70.0 \n", "511208 55.0 71.9 ... 70.0 \n", "511209 55.0 71.9 ... 70.0 \n", "... ... ... ... ... \n", "2072148 40.0 67.5 ... 68.0 \n", "2072149 40.0 67.5 ... 68.0 \n", "2072150 40.0 67.5 ... 68.0 \n", "2072151 40.0 67.5 ... 68.0 \n", "2072152 40.0 67.5 ... 68.0 \n", "\n", " zone_032_cooling_sp zone_032_heating_sp zone_030_cooling_sp \\\n", "511205 74.000000 68.0 73.0 \n", "511206 74.000000 68.0 73.0 \n", "511207 74.000000 68.0 73.0 \n", "511208 74.000000 68.0 73.0 \n", "511209 74.000000 68.0 73.0 \n", "... ... ... ... 
\n", "2072148 72.714138 71.0 71.0 \n", "2072149 72.714138 71.0 71.0 \n", "2072150 72.714138 71.0 71.0 \n", "2072151 72.714138 71.0 71.0 \n", "2072152 72.714138 71.0 71.0 \n", "\n", " zone_030_heating_sp air_temp_set_1 air_temp_set_2 \\\n", "511205 67.0 11.590 11.130 \n", "511206 67.0 11.590 11.130 \n", "511207 67.0 11.590 11.130 \n", "511208 67.0 11.590 11.130 \n", "511209 67.0 11.590 11.130 \n", "... ... ... ... \n", "2072148 70.0 13.994 13.528 \n", "2072149 70.0 13.994 13.528 \n", "2072150 70.0 13.994 13.528 \n", "2072151 70.0 13.994 13.528 \n", "2072152 70.0 13.994 13.528 \n", "\n", " dew_point_temperature_set_1d relative_humidity_set_1 \\\n", "511205 3.00 55.87 \n", "511206 3.00 55.87 \n", "511207 3.00 55.87 \n", "511208 3.00 55.87 \n", "511209 3.00 55.87 \n", "... ... ... \n", "2072148 4.11 51.61 \n", "2072149 4.11 51.61 \n", "2072150 4.11 51.61 \n", "2072151 4.11 51.61 \n", "2072152 4.11 51.61 \n", "\n", " solar_radiation_set_1 \n", "511205 120.3 \n", "511206 120.3 \n", "511207 120.3 \n", "511208 120.3 \n", "511209 120.3 \n", "... ... 
\n", "2072148 188.8 \n", "2072149 188.8 \n", "2072150 188.8 \n", "2072151 188.8 \n", "2072152 188.8 \n", "\n", "[1560948 rows x 56 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_filtered" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "testdataset_df = df_filtered[(df_filtered.date.dt.date >date(2020, 3, 1)) & (df_filtered.date.dt.date date(2019, 11, 8))]\n", "\n", "traindataset_df = df_filtered[(df_filtered.date.dt.date >date(2019, 3, 1)) & (df_filtered.date.dt.date date(2020, 7, 1)) & (df_filtered.date.dt.date 0.38)[0]\n", "df['anomalies'] = anomalies\n", "\n", "df_new = df.dropna()\n", "\n", "df_new.plot.scatter(x='anomalies', y=1, c='r', ax = ax, label = 'Anomalies')\n", "\n", "# ax.scatter(anomalies,test_predict1[anomalies,var], color='black',marker =\"o\",s=100 )\n", "\n", "\n", "ax.set_title('Testing Data - Predicted vs Actual [Zone 72 Temperature]')\n", "ax.set_xlabel('Time')\n", "ax.set_ylabel('Value')\n", "ax.legend()\n", "fig.tight_layout()" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m33547/33547\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m308s\u001b[0m 9ms/step\n" ] } ], "source": [ "%matplotlib qt\n", "test_predict2 = model.predict(X_train)\n", "\n" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [], "source": [ "plt.figure()\n", "var = 0\n", "plt.plot(y_train[:,var], label='Original Training Data', color='blue')\n", "plt.plot(test_predict2[:,var], label='Predicted Training Data', color='red',alpha=0.8)\n", "anomalies = np.where(abs(test_predict2[:,var] - y_train[:,var]) > 0.38)\n", "plt.scatter(anomalies,test_predict2[anomalies,var], color='black',marker =\"o\",s=100 )\n", 
"\n",
    "\n",
    "plt.title('Training Data - Predicted vs Actual')\n",
    "plt.xlabel('Time')\n",
    "plt.ylabel('Value')\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.mixture import GaussianMixture\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "# Seed NumPy's global RNG: scikit-learn estimators built without random_state\n",
    "# draw from it, so the fit below is repeatable. No random data is generated here.\n",
    "np.random.seed(0)\n",
    "\n",
    "# Difference between test_predict1 and y_test (residuals, assuming test_predict1\n",
    "# holds the model's predictions for the test set), reduced to two principal components.\n",
    "X = test_predict1 - y_test\n",
    "pca = PCA(n_components=2)\n",
    "X = pca.fit_transform(X)\n",
    "\n",
    "# Two-component Gaussian mixture fitted on the projected residuals.\n",
    "gmm = GaussianMixture(n_components=2)\n",
    "gmm.fit(X)\n",
    "\n",
    "# Cluster label for every sample.\n",
    "labels = gmm.predict(X)\n",
    "\n",
    "# Open a dedicated figure so the scatter is not drawn onto whichever figure is\n",
    "# still current from an earlier cell (this happens with GUI backends such as qt).\n",
    "plt.figure()\n",
    "plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', alpha=0.5)\n",
    "plt.title('GMM Clustering')\n",
    "plt.xlabel('Feature 1')\n",
    "plt.ylabel('Feature 2')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "# Seed NumPy's global RNG so the KMeans initialisation is repeatable\n",
    "# (no random data is generated here).\n",
    "np.random.seed(0)\n",
    "X = (test_predict1 - y_test)\n",
    "\n",
    "k = 2\n",
    "\n",
    "# Project first, then cluster: KMeans runs in the 2-D PCA space, so the cluster\n",
    "# centres are already in plot coordinates and need no pca.transform.\n",
    "pca = PCA(n_components=2)\n",
    "X = pca.fit_transform(X)\n",
    "\n",
    "kmeans = KMeans(n_clusters=k)\n",
    "kmeans.fit(X)\n",
    "\n",
    "# Cluster centres and the label assigned to each sample.\n",
    "centroids = kmeans.cluster_centers_\n",
    "labels = kmeans.labels_\n",
    "\n",
    "# Dedicated figure, so nothing is drawn onto a figure left over from an earlier cell.\n",
    "plt.figure()\n",
    "plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', alpha=0.5)\n",
    "plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', c='red', s=200, linewidths=2)\n",
"# KMeans numbers its clusters arbitrarily, so choose the 'Normal' cluster from the\n",
    "# data: the larger cluster is labelled 'Normal' (assumes anomalies are the\n",
    "# minority -- TODO confirm for this dataset).\n",
    "normal_idx = int(np.argmax(np.bincount(labels, minlength=2)))\n",
    "anomaly_idx = 1 - normal_idx\n",
    "plt.text(centroids[normal_idx,0]+0.2, centroids[normal_idx,1]+0.3, 'Normal', fontsize=12, color='red')\n",
    "plt.text(centroids[anomaly_idx,0]+0.5, centroids[anomaly_idx,1], 'Anomaly', fontsize=12, color='red')\n",
    "plt.title('KMeans Clustering')\n",
    "plt.xlabel('Feature 1')\n",
    "plt.ylabel('Feature 2')\n",
    "plt.tight_layout()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summarise test_predict1 - y_test in consecutive, non-overlapping windows of k rows.\n",
    "k = 60\n",
    "X = test_predict1 - y_test\n",
    "\n",
    "processed_data = []\n",
    "# The stride equals the window length, so changing k keeps the windows contiguous.\n",
    "for start in range(0, len(X), k):\n",
    "    # The final window is shorter when len(X) is not a multiple of k.\n",
    "    window = X[start:start + k]\n",
    "    q75, q25 = np.percentile(window, [75, 25], axis=0)\n",
    "    # Per-column statistics concatenated in the order mean, std, max, min, IQR.\n",
    "    # They are not bound to names like max/min, which would shadow the builtins\n",
    "    # for every later cell of the notebook.\n",
    "    processed_data.append(\n",
    "        np.concatenate([\n",
    "            window.mean(axis=0),\n",
    "            window.std(axis=0),\n",
    "            window.max(axis=0),\n",
    "            window.min(axis=0),\n",
    "            q75 - q25,\n",
    "        ])\n",
    "    )\n",
    "\n",
    "# One row per window, five statistics per input column.\n",
    "processed_data = np.vstack(processed_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed NumPy's global RNG as the other clustering cells do, so re-running this\n",
    "# cell on its own gives the same clusters.\n",
    "np.random.seed(0)\n",
    "X = processed_data\n",
    "\n",
    "# Cluster in the full window-feature space.\n",
    "kmeans = KMeans(n_clusters=2, algorithm='elkan', max_iter=1000, n_init = 5)\n",
    "kmeans.fit(X)\n",
    "\n",
    "# PCA is used for display only: points and cluster centres are projected with the\n",
    "# same fitted transform after clustering.\n",
    "pca = PCA(n_components=2)\n",
    "X = pca.fit_transform(X)\n",
    "\n",
    "centroids = kmeans.cluster_centers_\n",
    "centroids = pca.transform(centroids)\n",
    "labels = kmeans.labels_\n",
    "\n",
    "# Dedicated figure, so nothing is drawn onto a figure left over from an earlier cell.\n",
    "plt.figure()\n",
    "plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', alpha=0.5)\n",
    "plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', c='red', s=200, linewidths=2)\n",
    "plt.title('KMeans Clustering')\n",
    "plt.xlabel('Feature 1')\n",
    "plt.ylabel('Feature 2')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.mixture import GaussianMixture\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "# Generating 
random data for demonstration\n",
    "# NOTE: nothing random is generated in this cell; the seed below only fixes NumPy's\n",
    "# global RNG so the mixture fit is repeatable.\n",
    "np.random.seed(0)\n",
    "X = processed_data\n",
    "\n",
    "# Two-component Gaussian mixture fitted in the full window-feature space.\n",
    "gmm = GaussianMixture(n_components=2, init_params='k-means++')\n",
    "gmm.fit(X)\n",
    "\n",
    "# Cluster label for every window.\n",
    "labels = gmm.predict(X)\n",
    "\n",
    "# PCA is used for display only, after the mixture has been fitted.\n",
    "pca = PCA(n_components=2)\n",
    "X = pca.fit_transform(X)\n",
    "\n",
    "# Dedicated figure, so nothing is drawn onto a figure left over from an earlier cell.\n",
    "plt.figure()\n",
    "plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', alpha=0.5)\n",
    "plt.title('GMM Clustering')\n",
    "plt.xlabel('Feature 1')\n",
    "plt.ylabel('Feature 2')\n",
    "plt.show()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "from sklearn.decomposition import PCA\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Seed NumPy's global RNG so the KMeans initialisation is repeatable\n",
    "# (no random data is generated here).\n",
    "np.random.seed(0)\n",
    "X = test_predict1 - y_test \n",
    "\n",
    "# Cluster in the original (un-projected) space.\n",
    "kmeans = KMeans(n_clusters=2)\n",
    "kmeans.fit(X)\n",
    "\n",
    "# PCA is used for display only: points and cluster centres are projected with the\n",
    "# same fitted transform after clustering.\n",
    "pca = PCA(n_components=2)\n",
    "X = pca.fit_transform(X)\n",
    "\n",
    "centroids = kmeans.cluster_centers_\n",
    "centroids = pca.transform(centroids)\n",
    "labels = kmeans.labels_\n",
    "\n",
    "# KMeans numbers its clusters arbitrarily, so choose the 'Normal' cluster from the\n",
    "# data: the larger cluster is labelled 'Normal' (assumes anomalies are the\n",
    "# minority -- TODO confirm for this dataset).\n",
    "normal_idx = int(np.argmax(np.bincount(labels, minlength=2)))\n",
    "anomaly_idx = 1 - normal_idx\n",
    "\n",
    "# Plot the projected points and the projected cluster centres.\n",
    "plt.figure()\n",
    "plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', alpha=0.5)\n",
    "plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', c='red', s=200, linewidths=2)\n",
    "plt.text(centroids[normal_idx,0], centroids[normal_idx,1], 'Normal', fontsize=12, color='red')\n",
    "plt.text(centroids[anomaly_idx,0], centroids[anomaly_idx,1], 'Anomaly', fontsize=12, color='red')\n",
    "plt.title('KMeans Clustering')\n",
    "plt.xlabel('Feature 1')\n",
    "plt.ylabel('Feature 2')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "329763"
      ]
     },
     "execution_count": 26,
     "metadata": {},
"output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of samples assigned to cluster 0. count_nonzero counts in one vectorised\n",
    "# pass; the builtin sum would iterate over the array element by element.\n",
    "int(np.count_nonzero(labels == 0))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tensorflow",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}