diff --git "a/notebooks/01a_Classification models on incident category_Clean Data.ipynb" "b/notebooks/01a_Classification models on incident category_Clean Data.ipynb"
new file mode 100644--- /dev/null
+++ "b/notebooks/01a_Classification models on incident category_Clean Data.ipynb"
@@ -0,0 +1,1673 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "feaf77ab",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents\n",
+      "loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import os\n",
+    "import sys\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# Move to the repository root so relative paths such as data/... resolve.\n",
+    "# Step up only while the kernel is still inside the notebooks folder;\n",
+    "# otherwise re-running this cell would climb one directory higher each time.\n",
+    "current_dir = Path.cwd()\n",
+    "if current_dir.name == \"notebooks\":\n",
+    "    current_dir = current_dir.parent\n",
+    "workding_dir = str(current_dir)\n",
+    "os.chdir(workding_dir)\n",
+    "# Do not stack duplicate sys.path entries when the cell is re-run.\n",
+    "if workding_dir not in sys.path:\n",
+    "    sys.path.append(workding_dir)\n",
+    "print(\"workding dir:\", workding_dir)\n",
+    "\n",
+    "from dotenv import find_dotenv, load_dotenv\n",
+    "\n",
+    "# Use .env when find_dotenv locates one, otherwise fall back to .env.example.\n",
+    "found_dotenv = find_dotenv(\".env\")\n",
+    "\n",
+    "if len(found_dotenv) == 0:\n",
+    "    found_dotenv = find_dotenv(\".env.example\")\n",
+    "print(f\"loading env vars from: {found_dotenv}\")\n",
+    "load_dotenv(found_dotenv, override=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3a7dd7d8",
+   "metadata": {},
+   "source": [
+    "## Import Statement"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "86fc25e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fac53e88",
+   "metadata": {},
+   "source": [
+    "### read the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "dc33b13b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(\"data/all_port_labelled.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "31f58fd1",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Unnamed: 0</th>\n",
+       "      <th>Index</th>\n",
+       "      <th>Unnamed: 0.1</th>\n",
+       "      <th>Headline</th>\n",
+       "      <th>Details</th>\n",
+       "      <th>Severity</th>\n",
+       "      <th>Category</th>\n",
+       "      <th>Region</th>\n",
+       "      <th>Datetime</th>\n",
+       "      <th>Year</th>\n",
+       "      <th>...</th>\n",
+       "      <th>IT</th>\n",
+       "      <th>EP</th>\n",
+       "      <th>NEW</th>\n",
+       "      <th>CSD</th>\n",
+       "      <th>RPE</th>\n",
+       "      <th>MN</th>\n",
+       "      <th>NM</th>\n",
+       "      <th>if_labeled</th>\n",
+       "      <th>Month</th>\n",
+       "      <th>Week</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>8.0</td>\n",
+       "      <td>34.0</td>\n",
+       "      <td>Grasberg Mine- Grasberg mine workers extend st...</td>\n",
+       "      <td>Media sources indicate that workers at the Gra...</td>\n",
+       "      <td>Moderate</td>\n",
+       "      <td>Mine Workers Strike</td>\n",
+       "      <td>Indonesia</td>\n",
+       "      <td>28/5/17 17:08</td>\n",
+       "      <td>2017.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>False</td>\n",
+       "      <td>5.0</td>\n",
+       "      <td>21.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1.0</td>\n",
+       "      <td>10.0</td>\n",
+       "      <td>63.0</td>\n",
+       "      <td>Indonesia: Undersea internet cables damaged by...</td>\n",
+       "      <td>News sources are stating that recent typhoons ...</td>\n",
+       "      <td>Minor</td>\n",
+       "      <td>Travel Warning</td>\n",
+       "      <td>Indonesia</td>\n",
+       "      <td>4/9/17 14:30</td>\n",
+       "      <td>2017.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>False</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>14.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2 rows × 46 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Unnamed: 0  Index  Unnamed: 0.1  \\\n",
+       "0         0.0    8.0          34.0   \n",
+       "1         1.0   10.0          63.0   \n",
+       "\n",
+       "                                            Headline  \\\n",
+       "0  Grasberg Mine- Grasberg mine workers extend st...   \n",
+       "1  Indonesia: Undersea internet cables damaged by...   \n",
+       "\n",
+       "                                             Details  Severity  \\\n",
+       "0  Media sources indicate that workers at the Gra...  Moderate   \n",
+       "1  News sources are stating that recent typhoons ...     Minor   \n",
+       "\n",
+       "              Category     Region       Datetime    Year  ...   IT   EP  NEW  \\\n",
+       "0  Mine Workers Strike  Indonesia  28/5/17 17:08  2017.0  ...  0.0  0.0  0.0   \n",
+       "1       Travel Warning  Indonesia   4/9/17 14:30  2017.0  ...  0.0  0.0  0.0   \n",
+       "\n",
+       "   CSD  RPE   MN   NM if_labeled  Month  Week  \n",
+       "0  0.0  0.0  0.0  1.0      False    5.0  21.0  \n",
+       "1  0.0  0.0  1.0  0.0      False    4.0  14.0  \n",
+       "\n",
+       "[2 rows x 46 columns]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head(2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9bff68c9",
+   "metadata": {},
+   "source": [
+    "### Clean empty data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "41aa751c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nltk\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "from nltk.stem import WordNetLemmatizer\n",
+    "import string\n",
+    "\n",
+    "# nltk.download('punkt')\n",
+    "# nltk.download('stopwords')\n",
+    "# nltk.download('wordnet')\n",
+    "\n",
+    "\n",
+    "def clean_text(text):\n",
+    "    \"\"\"Normalise one free-text entry for modelling.\n",
+    "\n",
+    "    The text is lower-cased, tokenised with word_tokenize, filtered for\n",
+    "    punctuation tokens and English stop words, lemmatised with\n",
+    "    WordNetLemmatizer and joined back together with single spaces.\n",
+    "\n",
+    "    Args:\n",
+    "        text: Raw text. Anything that is not a str (for example None or a\n",
+    "            float NaN from a missing cell) is treated as missing.\n",
+    "\n",
+    "    Returns:\n",
+    "        The cleaned, space-separated tokens, or None when text is not a str.\n",
+    "    \"\"\"\n",
+    "    # Missing cells have no .lower(); report them as missing instead of raising.\n",
+    "    if not isinstance(text, str):\n",
+    "        return None\n",
+    "    # Lowercase, then tokenization\n",
+    "    tokens = word_tokenize(text.lower())\n",
+    "    # Drop punctuation and stop words in one pass. The punctuation test is a\n",
+    "    # substring check against string.punctuation, so multi-character quote\n",
+    "    # tokens such as `` or '' are kept.\n",
+    "    stop_words = set(stopwords.words(\"english\"))\n",
+    "    tokens = [\n",
+    "        word\n",
+    "        for word in tokens\n",
+    "        if word not in string.punctuation and word not in stop_words\n",
+    "    ]\n",
+    "    # Lemmatization\n",
+    "    lemmatizer = WordNetLemmatizer()\n",
+    "    return \" \".join(lemmatizer.lemmatize(word) for word in tokens)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "6293f613",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package omw-1.4 to\n",
+      "[nltk_data]     /Users/inflaton/nltk_data...\n",
+      "[nltk_data]   Package omw-1.4 is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "\n",
+    "nltk.download(\"omw-1.4\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fad3210d",
+   "metadata": {},
+   "source": [
+    "### The Details column has an issue\n",
+    "\n",
+    "Some entries are of type float, and none of the text-processing functions can be applied to them, so these entries have to be handled before cleaning."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "b1799269",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 5782 entries, 0 to 5781\n",
+      "Data columns (total 2 columns):\n",
+      " #   Column    Non-Null Count  Dtype \n",
+      "---  ------    --------------  ----- \n",
+      " 0   Details   5781 non-null   object\n",
+      " 1   Category  5780 non-null   object\n",
+      "dtypes: object(2)\n",
+      "memory usage: 90.5+ KB\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 5782 entries, 0 to 5781\n",
+      "Data columns (total 4 columns):\n",
+      " #   Column            Non-Null Count  Dtype \n",
+      "---  ------            --------------  ----- \n",
+      " 0   Details           5781 non-null   object\n",
+      " 1   Category          5780 non-null   object\n",
+      " 2   Details_cleaned   5781 non-null   object\n",
+      " 3   Category_cleaned  5780 non-null   object\n",
+      "dtypes: object(4)\n",
+      "memory usage: 180.8+ KB\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/7x/56svhln929zdh2xhr3mwqg4r0000gn/T/ipykernel_76478/4121100139.py:3: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  text_df[\"Details_cleaned\"] = text_df[\"Details\"].apply(\n",
+      "/var/folders/7x/56svhln929zdh2xhr3mwqg4r0000gn/T/ipykernel_76478/4121100139.py:6: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  text_df[\"Category_cleaned\"] = text_df[\"Category\"].apply(\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Work on an explicit copy so the new columns are added to text_df itself\n",
+    "# rather than to a slice of df (which raised SettingWithCopyWarning).\n",
+    "text_df = df[[\"Details\", \"Category\"]].copy()\n",
+    "text_df.info()\n",
+    "\n",
+    "# Float entries cannot be text-processed: clean everything else and store\n",
+    "# None for them so that dropna() can remove those rows later.\n",
+    "text_df[\"Details_cleaned\"] = text_df[\"Details\"].apply(\n",
+    "    lambda x: clean_text(x) if not isinstance(x, float) else None\n",
+    ")\n",
+    "text_df[\"Category_cleaned\"] = text_df[\"Category\"].apply(\n",
+    "    lambda x: None if isinstance(x, float) else x\n",
+    ")\n",
+    "\n",
+    "text_df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "5fcc3b33",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Details</th>\n",
+       "      <th>Category</th>\n",
+       "      <th>Details_cleaned</th>\n",
+       "      <th>Category_cleaned</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Media sources indicate that workers at the Gra...</td>\n",
+       "      <td>Mine Workers Strike</td>\n",
+       "      <td>medium source indicate worker grasberg mine ex...</td>\n",
+       "      <td>Mine Workers Strike</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>News sources are stating that recent typhoons ...</td>\n",
+       "      <td>Travel Warning</td>\n",
+       "      <td>news source stating recent typhoon impact hong...</td>\n",
+       "      <td>Travel Warning</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>The persisting port congestion at Shanghai’s Y...</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "      <td>persisting port congestion shanghai ’ yangshan...</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Updated local media sources from Jakarta indic...</td>\n",
+       "      <td>Bombing, Police Operations</td>\n",
+       "      <td>updated local medium source jakarta indicate e...</td>\n",
+       "      <td>Bombing, Police Operations</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>According to local police in Jakarta, two expl...</td>\n",
+       "      <td>Bombing, Police Operations</td>\n",
+       "      <td>according local police jakarta two explosion c...</td>\n",
+       "      <td>Bombing, Police Operations</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Severe winds have downed billboards and trees ...</td>\n",
+       "      <td>Roadway Closure / Disruption, Flooding, Severe...</td>\n",
+       "      <td>severe wind downed billboard tree bandung wedn...</td>\n",
+       "      <td>Roadway Closure / Disruption, Flooding, Severe...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Local media sources indicated on October 29 th...</td>\n",
+       "      <td>Cargo/Warehouse Theft</td>\n",
+       "      <td>local medium source indicated october 29 wareh...</td>\n",
+       "      <td>Cargo/Warehouse Theft</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Tropical Storm Rumbia had dissipated after tra...</td>\n",
+       "      <td>Tropical Cyclone / Storm</td>\n",
+       "      <td>tropical storm rumbia dissipated travelling ar...</td>\n",
+       "      <td>Tropical Cyclone / Storm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Tropical Depression Yutu, also referred to as ...</td>\n",
+       "      <td>Storm</td>\n",
+       "      <td>tropical depression yutu also referred `` '' r...</td>\n",
+       "      <td>Storm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>A magnitude 4.5 earthquake was detected 14 mil...</td>\n",
+       "      <td>Earthquake</td>\n",
+       "      <td>magnitude 4.5 earthquake detected 14 mile nort...</td>\n",
+       "      <td>Earthquake</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                             Details  \\\n",
+       "0  Media sources indicate that workers at the Gra...   \n",
+       "1  News sources are stating that recent typhoons ...   \n",
+       "2  The persisting port congestion at Shanghai’s Y...   \n",
+       "3  Updated local media sources from Jakarta indic...   \n",
+       "4  According to local police in Jakarta, two expl...   \n",
+       "5  Severe winds have downed billboards and trees ...   \n",
+       "6  Local media sources indicated on October 29 th...   \n",
+       "7  Tropical Storm Rumbia had dissipated after tra...   \n",
+       "8  Tropical Depression Yutu, also referred to as ...   \n",
+       "9  A magnitude 4.5 earthquake was detected 14 mil...   \n",
+       "\n",
+       "                                            Category  \\\n",
+       "0                                Mine Workers Strike   \n",
+       "1                                     Travel Warning   \n",
+       "2                                    Port Congestion   \n",
+       "3                         Bombing, Police Operations   \n",
+       "4                         Bombing, Police Operations   \n",
+       "5  Roadway Closure / Disruption, Flooding, Severe...   \n",
+       "6                              Cargo/Warehouse Theft   \n",
+       "7                           Tropical Cyclone / Storm   \n",
+       "8                                              Storm   \n",
+       "9                                         Earthquake   \n",
+       "\n",
+       "                                     Details_cleaned  \\\n",
+       "0  medium source indicate worker grasberg mine ex...   \n",
+       "1  news source stating recent typhoon impact hong...   \n",
+       "2  persisting port congestion shanghai ’ yangshan...   \n",
+       "3  updated local medium source jakarta indicate e...   \n",
+       "4  according local police jakarta two explosion c...   \n",
+       "5  severe wind downed billboard tree bandung wedn...   \n",
+       "6  local medium source indicated october 29 wareh...   \n",
+       "7  tropical storm rumbia dissipated travelling ar...   \n",
+       "8  tropical depression yutu also referred `` '' r...   \n",
+       "9  magnitude 4.5 earthquake detected 14 mile nort...   \n",
+       "\n",
+       "                                    Category_cleaned  \n",
+       "0                                Mine Workers Strike  \n",
+       "1                                     Travel Warning  \n",
+       "2                                    Port Congestion  \n",
+       "3                         Bombing, Police Operations  \n",
+       "4                         Bombing, Police Operations  \n",
+       "5  Roadway Closure / Disruption, Flooding, Severe...  \n",
+       "6                              Cargo/Warehouse Theft  \n",
+       "7                           Tropical Cyclone / Storm  \n",
+       "8                                              Storm  \n",
+       "9                                         Earthquake  "
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "processed_data = text_df.dropna()\n",
+    "processed_data.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "d02b4b00",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "857"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "processed_data[\"Category\"].nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "9ee856a1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Index: 5780 entries, 0 to 5781\n",
+      "Data columns (total 4 columns):\n",
+      " #   Column            Non-Null Count  Dtype \n",
+      "---  ------            --------------  ----- \n",
+      " 0   Details           5780 non-null   object\n",
+      " 1   Category          5780 non-null   object\n",
+      " 2   Details_cleaned   5780 non-null   object\n",
+      " 3   Category_cleaned  5780 non-null   object\n",
+      "dtypes: object(4)\n",
+      "memory usage: 225.8+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "processed_data.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3f6d478f",
+   "metadata": {},
+   "source": [
+    "## Process the Category column\n",
+    "This is seldom done, as we don't usually process the target (y) of the data.\n",
+    "However, the Category column is too complex and requires processing; otherwise there are simply too many labels."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "285013d3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "111\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create a function that splits a combined category string into individual labels\n",
+    "import re\n",
+    "\n",
+    "\n",
+    "def split_string(text):\n",
+    "    \"\"\"Break a combined category string into its individual labels.\n",
+    "\n",
+    "    Both '/' and ',' act as separators; surrounding whitespace is trimmed\n",
+    "    and empty pieces are discarded.\n",
+    "    \"\"\"\n",
+    "    pieces = (piece.strip() for piece in re.split(r\"[\\/,]\", text))\n",
+    "    return [piece for piece in pieces if piece]\n",
+    "\n",
+    "\n",
+    "# Example usage:\n",
+    "# input_str = \"Roadway Closure / Disruption, Flooding, Severe Winds, Weather Advisory\"\n",
+    "# result = split_string(input_str)\n",
+    "# print(result)\n",
+    "\n",
+    "# create a list to find the number of unique individual labels\n",
+    "# dict.fromkeys de-duplicates in a single pass while keeping first-seen order,\n",
+    "# so the list has the same contents and order as a manual membership check.\n",
+    "label_list = list(\n",
+    "    dict.fromkeys(\n",
+    "        label\n",
+    "        for category in processed_data[\"Category_cleaned\"]\n",
+    "        for label in split_string(category)\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "print(len(label_list))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e7b48e8",
+   "metadata": {},
+   "source": [
+    "#### After filtering out the unique labels in the Category column we are still left with 111 labels, which is still too many"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "33234f8c",
+   "metadata": {},
+   "source": [
+    "#### The next step is to reduce each record's category to a single label\n",
+    "Previously a category looked like Roadway Closure / Disruption, Flooding, Severe...; we need to reduce it to a single label.\n",
+    "To do this, we assume the first label listed is the most prominent category and discard the others."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "12f9b9b4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Index: 5780 entries, 0 to 5781\n",
+      "Data columns (total 5 columns):\n",
+      " #   Column            Non-Null Count  Dtype \n",
+      "---  ------            --------------  ----- \n",
+      " 0   Details           5780 non-null   object\n",
+      " 1   Category          5780 non-null   object\n",
+      " 2   Details_cleaned   5780 non-null   object\n",
+      " 3   Category_cleaned  5780 non-null   object\n",
+      " 4   Category_single   5780 non-null   object\n",
+      "dtypes: object(5)\n",
+      "memory usage: 270.9+ KB\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/7x/56svhln929zdh2xhr3mwqg4r0000gn/T/ipykernel_76478/2791632185.py:29: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  text_df[\"Category_single\"] = text_df[\"Category_cleaned\"].apply(\n"
+     ]
+    }
+   ],
+   "source": [
+    "def split_and_get_first(text):\n",
+    "    \"\"\"Return the first individual label of a combined category string.\n",
+    "\n",
+    "    The string is split on '/' or ',', each piece is stripped of surrounding\n",
+    "    whitespace, and the first non-empty piece is returned.\n",
+    "\n",
+    "    Args:\n",
+    "        text: Category string, or a missing value (None or float NaN).\n",
+    "\n",
+    "    Returns:\n",
+    "        The first label, or None when text is not a str or has no non-empty\n",
+    "        piece.\n",
+    "    \"\"\"\n",
+    "    # Type check instead of `== None`: it also covers float NaN cells, which\n",
+    "    # would otherwise make re.split raise TypeError.\n",
+    "    if not isinstance(text, str):\n",
+    "        return None\n",
+    "    # Stop at the first usable piece instead of building the full list.\n",
+    "    for piece in re.split(r\"[\\/,]\", text):\n",
+    "        piece = piece.strip()\n",
+    "        if piece:\n",
+    "            return piece\n",
+    "    return None\n",
+    "\n",
+    "\n",
+    "def remove_none_rows(df, column_name):\n",
+    "    \"\"\"Return a copy of df without the rows whose column_name value is missing.\n",
+    "\n",
+    "    Rows are selected with a boolean mask, so the result does not depend on\n",
+    "    the index being a default 0..n-1 range. The input frame is not modified.\n",
+    "\n",
+    "    Args:\n",
+    "        df: Frame to filter.\n",
+    "        column_name: Column checked for missing values. None and NaN are\n",
+    "            both treated as missing.\n",
+    "\n",
+    "    Returns:\n",
+    "        A new DataFrame holding only the rows with a value in column_name,\n",
+    "        with their original index labels.\n",
+    "    \"\"\"\n",
+    "    keep_mask = df[column_name].notna()\n",
+    "    # .copy() lets callers add columns to the result without pandas warnings.\n",
+    "    return df.loc[keep_mask].copy()\n",
+    "\n",
+    "\n",
+    "# Example usage:\n",
+    "# input_str = \"Roadway Closure / Disruption, Flooding, Severe Winds, Weather Advisory\"\n",
+    "# result = split_and_get_first(input_str)\n",
+    "# print(result)\n",
+    "# Keep only the first label of every category, then discard rows without a category.\n",
+    "text_df[\"Category_single\"] = text_df[\"Category_cleaned\"].apply(split_and_get_first)\n",
+    "result_df = remove_none_rows(text_df, \"Category_cleaned\")\n",
+    "result_df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "b5931fe1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Details</th>\n",
+       "      <th>Category</th>\n",
+       "      <th>Details_cleaned</th>\n",
+       "      <th>Category_cleaned</th>\n",
+       "      <th>Category_single</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Media sources indicate that workers at the Gra...</td>\n",
+       "      <td>Mine Workers Strike</td>\n",
+       "      <td>medium source indicate worker grasberg mine ex...</td>\n",
+       "      <td>Mine Workers Strike</td>\n",
+       "      <td>Mine Workers Strike</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>News sources are stating that recent typhoons ...</td>\n",
+       "      <td>Travel Warning</td>\n",
+       "      <td>news source stating recent typhoon impact hong...</td>\n",
+       "      <td>Travel Warning</td>\n",
+       "      <td>Travel Warning</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>The persisting port congestion at Shanghai’s Y...</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "      <td>persisting port congestion shanghai ’ yangshan...</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Updated local media sources from Jakarta indic...</td>\n",
+       "      <td>Bombing, Police Operations</td>\n",
+       "      <td>updated local medium source jakarta indicate e...</td>\n",
+       "      <td>Bombing, Police Operations</td>\n",
+       "      <td>Bombing</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>According to local police in Jakarta, two expl...</td>\n",
+       "      <td>Bombing, Police Operations</td>\n",
+       "      <td>according local police jakarta two explosion c...</td>\n",
+       "      <td>Bombing, Police Operations</td>\n",
+       "      <td>Bombing</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Severe winds have downed billboards and trees ...</td>\n",
+       "      <td>Roadway Closure / Disruption, Flooding, Severe...</td>\n",
+       "      <td>severe wind downed billboard tree bandung wedn...</td>\n",
+       "      <td>Roadway Closure / Disruption, Flooding, Severe...</td>\n",
+       "      <td>Roadway Closure</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Local media sources indicated on October 29 th...</td>\n",
+       "      <td>Cargo/Warehouse Theft</td>\n",
+       "      <td>local medium source indicated october 29 wareh...</td>\n",
+       "      <td>Cargo/Warehouse Theft</td>\n",
+       "      <td>Cargo</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Tropical Storm Rumbia had dissipated after tra...</td>\n",
+       "      <td>Tropical Cyclone / Storm</td>\n",
+       "      <td>tropical storm rumbia dissipated travelling ar...</td>\n",
+       "      <td>Tropical Cyclone / Storm</td>\n",
+       "      <td>Tropical Cyclone</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Tropical Depression Yutu, also referred to as ...</td>\n",
+       "      <td>Storm</td>\n",
+       "      <td>tropical depression yutu also referred `` '' r...</td>\n",
+       "      <td>Storm</td>\n",
+       "      <td>Storm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>A magnitude 4.5 earthquake was detected 14 mil...</td>\n",
+       "      <td>Earthquake</td>\n",
+       "      <td>magnitude 4.5 earthquake detected 14 mile nort...</td>\n",
+       "      <td>Earthquake</td>\n",
+       "      <td>Earthquake</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>Multiple sources report that a magnitude 5.5 e...</td>\n",
+       "      <td>Earthquake</td>\n",
+       "      <td>multiple source report magnitude 5.5 earthquak...</td>\n",
+       "      <td>Earthquake</td>\n",
+       "      <td>Earthquake</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>Post-Tropical Cyclone Michael is approximately...</td>\n",
+       "      <td>Tropical Cyclone / Storm</td>\n",
+       "      <td>post-tropical cyclone michael approximately 18...</td>\n",
+       "      <td>Tropical Cyclone / Storm</td>\n",
+       "      <td>Tropical Cyclone</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>Industry sources indicate on September 11 that...</td>\n",
+       "      <td>Workplace Accident</td>\n",
+       "      <td>industry source indicate september 11 2 worker...</td>\n",
+       "      <td>Workplace Accident</td>\n",
+       "      <td>Workplace Accident</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>Government sources are reporting a tornado has...</td>\n",
+       "      <td>Tornado</td>\n",
+       "      <td>government source reporting tornado touched tw...</td>\n",
+       "      <td>Tornado</td>\n",
+       "      <td>Tornado</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>Media sources are informing on September 24 th...</td>\n",
+       "      <td>Industrial Action</td>\n",
+       "      <td>medium source informing september 24 oil worke...</td>\n",
+       "      <td>Industrial Action</td>\n",
+       "      <td>Industrial Action</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>A magnitude 4.5 earthquake was detected in cen...</td>\n",
+       "      <td>Earthquake</td>\n",
+       "      <td>magnitude 4.5 earthquake detected central taiw...</td>\n",
+       "      <td>Earthquake</td>\n",
+       "      <td>Earthquake</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>Industry sources indicate on August 31 that th...</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "      <td>industry source indicate august 31 port durban...</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>Tropical Depression Gordon continues to weaken...</td>\n",
+       "      <td>Storm</td>\n",
+       "      <td>tropical depression gordon continues weaken mo...</td>\n",
+       "      <td>Storm</td>\n",
+       "      <td>Storm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>Local media sources indicated on November 8 th...</td>\n",
+       "      <td>Public Safety / Security</td>\n",
+       "      <td>local medium source indicated november 8 270 k...</td>\n",
+       "      <td>Public Safety / Security</td>\n",
+       "      <td>Public Safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>The European-Mediterranean Seismological Centr...</td>\n",
+       "      <td>Earthquake</td>\n",
+       "      <td>european-mediterranean seismological centre re...</td>\n",
+       "      <td>Earthquake</td>\n",
+       "      <td>Earthquake</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                              Details  \\\n",
+       "0   Media sources indicate that workers at the Gra...   \n",
+       "1   News sources are stating that recent typhoons ...   \n",
+       "2   The persisting port congestion at Shanghai’s Y...   \n",
+       "3   Updated local media sources from Jakarta indic...   \n",
+       "4   According to local police in Jakarta, two expl...   \n",
+       "5   Severe winds have downed billboards and trees ...   \n",
+       "6   Local media sources indicated on October 29 th...   \n",
+       "7   Tropical Storm Rumbia had dissipated after tra...   \n",
+       "8   Tropical Depression Yutu, also referred to as ...   \n",
+       "9   A magnitude 4.5 earthquake was detected 14 mil...   \n",
+       "10  Multiple sources report that a magnitude 5.5 e...   \n",
+       "11  Post-Tropical Cyclone Michael is approximately...   \n",
+       "12  Industry sources indicate on September 11 that...   \n",
+       "13  Government sources are reporting a tornado has...   \n",
+       "14  Media sources are informing on September 24 th...   \n",
+       "15  A magnitude 4.5 earthquake was detected in cen...   \n",
+       "16  Industry sources indicate on August 31 that th...   \n",
+       "17  Tropical Depression Gordon continues to weaken...   \n",
+       "18  Local media sources indicated on November 8 th...   \n",
+       "19  The European-Mediterranean Seismological Centr...   \n",
+       "\n",
+       "                                             Category  \\\n",
+       "0                                 Mine Workers Strike   \n",
+       "1                                      Travel Warning   \n",
+       "2                                     Port Congestion   \n",
+       "3                          Bombing, Police Operations   \n",
+       "4                          Bombing, Police Operations   \n",
+       "5   Roadway Closure / Disruption, Flooding, Severe...   \n",
+       "6                               Cargo/Warehouse Theft   \n",
+       "7                            Tropical Cyclone / Storm   \n",
+       "8                                               Storm   \n",
+       "9                                          Earthquake   \n",
+       "10                                         Earthquake   \n",
+       "11                           Tropical Cyclone / Storm   \n",
+       "12                                 Workplace Accident   \n",
+       "13                                            Tornado   \n",
+       "14                                  Industrial Action   \n",
+       "15                                         Earthquake   \n",
+       "16                                    Port Congestion   \n",
+       "17                                              Storm   \n",
+       "18                           Public Safety / Security   \n",
+       "19                                         Earthquake   \n",
+       "\n",
+       "                                      Details_cleaned  \\\n",
+       "0   medium source indicate worker grasberg mine ex...   \n",
+       "1   news source stating recent typhoon impact hong...   \n",
+       "2   persisting port congestion shanghai ’ yangshan...   \n",
+       "3   updated local medium source jakarta indicate e...   \n",
+       "4   according local police jakarta two explosion c...   \n",
+       "5   severe wind downed billboard tree bandung wedn...   \n",
+       "6   local medium source indicated october 29 wareh...   \n",
+       "7   tropical storm rumbia dissipated travelling ar...   \n",
+       "8   tropical depression yutu also referred `` '' r...   \n",
+       "9   magnitude 4.5 earthquake detected 14 mile nort...   \n",
+       "10  multiple source report magnitude 5.5 earthquak...   \n",
+       "11  post-tropical cyclone michael approximately 18...   \n",
+       "12  industry source indicate september 11 2 worker...   \n",
+       "13  government source reporting tornado touched tw...   \n",
+       "14  medium source informing september 24 oil worke...   \n",
+       "15  magnitude 4.5 earthquake detected central taiw...   \n",
+       "16  industry source indicate august 31 port durban...   \n",
+       "17  tropical depression gordon continues weaken mo...   \n",
+       "18  local medium source indicated november 8 270 k...   \n",
+       "19  european-mediterranean seismological centre re...   \n",
+       "\n",
+       "                                     Category_cleaned      Category_single  \n",
+       "0                                 Mine Workers Strike  Mine Workers Strike  \n",
+       "1                                      Travel Warning       Travel Warning  \n",
+       "2                                     Port Congestion      Port Congestion  \n",
+       "3                          Bombing, Police Operations              Bombing  \n",
+       "4                          Bombing, Police Operations              Bombing  \n",
+       "5   Roadway Closure / Disruption, Flooding, Severe...      Roadway Closure  \n",
+       "6                               Cargo/Warehouse Theft                Cargo  \n",
+       "7                            Tropical Cyclone / Storm     Tropical Cyclone  \n",
+       "8                                               Storm                Storm  \n",
+       "9                                          Earthquake           Earthquake  \n",
+       "10                                         Earthquake           Earthquake  \n",
+       "11                           Tropical Cyclone / Storm     Tropical Cyclone  \n",
+       "12                                 Workplace Accident   Workplace Accident  \n",
+       "13                                            Tornado              Tornado  \n",
+       "14                                  Industrial Action    Industrial Action  \n",
+       "15                                         Earthquake           Earthquake  \n",
+       "16                                    Port Congestion      Port Congestion  \n",
+       "17                                              Storm                Storm  \n",
+       "18                           Public Safety / Security        Public Safety  \n",
+       "19                                         Earthquake           Earthquake  "
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result_df.head(20)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "9c19b11a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "94"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result_df[\"Category_single\"].nunique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "29d4037f",
+   "metadata": {},
+   "source": [
+    "### After keeping only the first label in the Category column we are still left with 94 unique labels\n",
+    "This is still an unacceptable number of labels. The next step is to manually group the labels into more general labels using a rule-based system."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "10f07d05",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Export the unique labels to Excel so they can be reviewed and grouped by hand.\n",
+    "file_path = \"data/label_list.xlsx\"\n",
+    "unique_labels_df = pd.DataFrame({\"String\": label_list})\n",
+    "unique_labels_df.to_excel(file_path, index=False)"
+   ]
+  },
+  {
+   "attachments": {
+    "converstion.png": {
+     "image/png": ""
+    }
+   },
+   "cell_type": "markdown",
+   "id": "398e6da8",
+   "metadata": {},
+   "source": [
+    "![converstion.png](attachment:converstion.png)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "d4357af0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Weather</th>\n",
+       "      <th>Worker Strike</th>\n",
+       "      <th>Administrative Issue</th>\n",
+       "      <th>Human Error</th>\n",
+       "      <th>Cyber Attack</th>\n",
+       "      <th>Terrorism</th>\n",
+       "      <th>Accident</th>\n",
+       "      <th>Others</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Flooding</td>\n",
+       "      <td>Mine Workers Strike</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "      <td>Workplace Accident</td>\n",
+       "      <td>Network Disruption</td>\n",
+       "      <td>Bombing</td>\n",
+       "      <td>Maritime Accident</td>\n",
+       "      <td>Miscellaneous Events</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Severe Winds</td>\n",
+       "      <td>Production Halt</td>\n",
+       "      <td>Police Operations</td>\n",
+       "      <td>Individuals in Focus</td>\n",
+       "      <td>Ransomware</td>\n",
+       "      <td>Warehouse Theft</td>\n",
+       "      <td>Vehicle Accident</td>\n",
+       "      <td>Miscellaneous Strikes</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Weather Advisory</td>\n",
+       "      <td>Protest</td>\n",
+       "      <td>Roadway Closure</td>\n",
+       "      <td>Military Operations</td>\n",
+       "      <td>Data breach</td>\n",
+       "      <td>Public Safety</td>\n",
+       "      <td>Death</td>\n",
+       "      <td>Outbreak of disease</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Tropical Cyclone</td>\n",
+       "      <td>Riot</td>\n",
+       "      <td>Disruption</td>\n",
+       "      <td>Flight Delays</td>\n",
+       "      <td>Phishing</td>\n",
+       "      <td>Security</td>\n",
+       "      <td>Injury</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Storm</td>\n",
+       "      <td>Port Strike</td>\n",
+       "      <td>Cargo</td>\n",
+       "      <td>Cancellations</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Organized Crime</td>\n",
+       "      <td>Non-industrial Fire</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            Weather        Worker Strike Administrative Issue  \\\n",
+       "0          Flooding  Mine Workers Strike      Port Congestion   \n",
+       "1      Severe Winds      Production Halt    Police Operations   \n",
+       "2  Weather Advisory              Protest      Roadway Closure   \n",
+       "3  Tropical Cyclone                 Riot           Disruption   \n",
+       "4             Storm          Port Strike                Cargo   \n",
+       "\n",
+       "            Human Error        Cyber Attack        Terrorism  \\\n",
+       "0    Workplace Accident  Network Disruption          Bombing   \n",
+       "1  Individuals in Focus          Ransomware  Warehouse Theft   \n",
+       "2   Military Operations         Data breach    Public Safety   \n",
+       "3         Flight Delays            Phishing         Security   \n",
+       "4         Cancellations                 NaN  Organized Crime   \n",
+       "\n",
+       "              Accident                 Others  \n",
+       "0    Maritime Accident   Miscellaneous Events  \n",
+       "1     Vehicle Accident  Miscellaneous Strikes  \n",
+       "2                Death    Outbreak of disease  \n",
+       "3               Injury                    NaN  \n",
+       "4  Non-industrial Fire                    NaN  "
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "new_labels_df = pd.read_excel(\"data/new_labels.xlsx\")\n",
+    "new_labels_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "407189c9",
+   "metadata": {},
+   "source": [
+    "#### convert them into lists"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "73939327",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Weather\n",
+      "\n",
+      "['Flooding', 'Severe Winds', 'Weather Advisory', 'Tropical Cyclone', 'Storm', 'Earthquake', 'Tornado', 'Typhoon', 'Landslide', 'Water', 'Hurricane', 'Wildfire', 'Blizzard', 'Hail']\n",
+      "\n",
+      "\n",
+      "Worker Strike\n",
+      "\n",
+      "['Mine Workers Strike', 'Production Halt', 'Protest', 'Riot', 'Port Strike', 'General Strike', 'Civil Service Strike', 'Civil Unrest Advisory', 'Cargo Transportation Strike', 'Energy Sector Strike']\n",
+      "\n",
+      "\n",
+      "Administrative Issue\n",
+      "\n",
+      "['Port Congestion', 'Police Operations', 'Roadway Closure', 'Disruption', 'Cargo', 'Industrial Action', 'Port Disruption', 'Cargo Disruption', 'Power Outage', 'Port Closure', 'Maritime Advisory', 'Train Delays', 'Ground Transportation Advisory', 'Public Transportation Disruption', 'Trade Regulation', 'Customs Regulation', 'Regulatory Advisory', 'Industry Directives', 'Security Advisory', 'Public Holidays', 'Customs Delay', 'Public Health Advisory', 'Detention', 'Aviation Advisory', 'Waterway closure', 'Waterway Closure', 'Plant Closure', 'Border Closure', 'Delay', 'Industrial zone shutdown', 'Trade Restrictions', 'Closure', 'Truck Driving Ban', 'Insolvency', 'Environmental Regulations', 'Postal Disruption', 'Ice Storm', 'Travel Warning']\n",
+      "\n",
+      "\n",
+      "Human Error\n",
+      "\n",
+      "['Workplace Accident', 'Individuals in Focus', 'Military Operations', 'Flight Delays', 'Cancellations', 'Political Info', 'Event']\n",
+      "\n",
+      "\n",
+      "Cyber Attack\n",
+      "\n",
+      "['Network Disruption', 'Ransomware', 'Data breach', 'Phishing']\n",
+      "\n",
+      "\n",
+      "Terrorism\n",
+      "\n",
+      "['Bombing', 'Warehouse Theft', 'Public Safety', 'Security', 'Organized Crime', 'Hazmat Response', 'Piracy', 'Kidnap', 'Shooting', 'Robbery', 'Cargo theft', 'Bomb Detonation', 'Terror Attack', 'Outbreak Of War', 'Militant Action']\n",
+      "\n",
+      "\n",
+      "Accident\n",
+      "\n",
+      "['Maritime Accident', 'Vehicle Accident', 'Death', 'Injury', 'Non-industrial Fire', 'Chemical Spill', 'Industrial Fire', 'Fuel Disruption', 'Airline Incident', 'Crash', 'Explosion', 'Train Accident', 'Derailment', 'Sewage Disruption', 'Barge Accident', 'Bridge Collapse', 'Structure Collapse', 'Airport Accident', 'Force Majeure', 'Telecom Outage']\n",
+      "\n",
+      "\n",
+      "Others\n",
+      "\n",
+      "['Miscellaneous Events', 'Miscellaneous Strikes', 'Outbreak of disease']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Each spreadsheet column becomes a list of labels; shorter columns are padded\n",
+    "# with NaN, so the missing cells are filtered out.\n",
+    "new_labels_dict = {\n",
+    "    group: [label for label in labels if not pd.isnull(label)]\n",
+    "    for group, labels in new_labels_df.to_dict(orient=\"list\").items()\n",
+    "}\n",
+    "\n",
+    "for group, labels in new_labels_dict.items():\n",
+    "    print(\"\\n\")\n",
+    "    print(group + \"\\n\")\n",
+    "    print(labels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8516af0e",
+   "metadata": {},
+   "source": [
+    "### create a new column with the summarized label"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "0d316bb4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Details</th>\n",
+       "      <th>Category</th>\n",
+       "      <th>Details_cleaned</th>\n",
+       "      <th>Category_cleaned</th>\n",
+       "      <th>Category_single</th>\n",
+       "      <th>Summarized_label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Media sources indicate that workers at the Gra...</td>\n",
+       "      <td>Mine Workers Strike</td>\n",
+       "      <td>medium source indicate worker grasberg mine ex...</td>\n",
+       "      <td>Mine Workers Strike</td>\n",
+       "      <td>Mine Workers Strike</td>\n",
+       "      <td>Worker Strike</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>News sources are stating that recent typhoons ...</td>\n",
+       "      <td>Travel Warning</td>\n",
+       "      <td>news source stating recent typhoon impact hong...</td>\n",
+       "      <td>Travel Warning</td>\n",
+       "      <td>Travel Warning</td>\n",
+       "      <td>Administrative Issue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>The persisting port congestion at Shanghai’s Y...</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "      <td>persisting port congestion shanghai ’ yangshan...</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "      <td>Administrative Issue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Updated local media sources from Jakarta indic...</td>\n",
+       "      <td>Bombing, Police Operations</td>\n",
+       "      <td>updated local medium source jakarta indicate e...</td>\n",
+       "      <td>Bombing, Police Operations</td>\n",
+       "      <td>Bombing</td>\n",
+       "      <td>Terrorism</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>According to local police in Jakarta, two expl...</td>\n",
+       "      <td>Bombing, Police Operations</td>\n",
+       "      <td>according local police jakarta two explosion c...</td>\n",
+       "      <td>Bombing, Police Operations</td>\n",
+       "      <td>Bombing</td>\n",
+       "      <td>Terrorism</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5777</th>\n",
+       "      <td>Intelligence received by Everstream Analytics ...</td>\n",
+       "      <td>Ice Storm</td>\n",
+       "      <td>intelligence received everstream analytics ind...</td>\n",
+       "      <td>Ice Storm</td>\n",
+       "      <td>Ice Storm</td>\n",
+       "      <td>Administrative Issue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5778</th>\n",
+       "      <td>Meteorological sources indicate that a series ...</td>\n",
+       "      <td>Roadway Closure / Disruption, Ground Transport...</td>\n",
+       "      <td>meteorological source indicate series winter s...</td>\n",
+       "      <td>Roadway Closure / Disruption, Ground Transport...</td>\n",
+       "      <td>Roadway Closure</td>\n",
+       "      <td>Administrative Issue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5779</th>\n",
+       "      <td>Industry sources report on December 7 that Svi...</td>\n",
+       "      <td>Industrial Action</td>\n",
+       "      <td>industry source report december 7 svitzer aust...</td>\n",
+       "      <td>Industrial Action</td>\n",
+       "      <td>Industrial Action</td>\n",
+       "      <td>Administrative Issue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5780</th>\n",
+       "      <td>Industry sources indicate on December 14 that ...</td>\n",
+       "      <td>Port Strike</td>\n",
+       "      <td>industry source indicate december 14 worker dp...</td>\n",
+       "      <td>Port Strike</td>\n",
+       "      <td>Port Strike</td>\n",
+       "      <td>Worker Strike</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5781</th>\n",
+       "      <td>On November 17, Dutch media sources reported t...</td>\n",
+       "      <td>Port Strike</td>\n",
+       "      <td>november 17 dutch medium source reported worke...</td>\n",
+       "      <td>Port Strike</td>\n",
+       "      <td>Port Strike</td>\n",
+       "      <td>Worker Strike</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5780 rows × 6 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                Details  \\\n",
+       "0     Media sources indicate that workers at the Gra...   \n",
+       "1     News sources are stating that recent typhoons ...   \n",
+       "2     The persisting port congestion at Shanghai’s Y...   \n",
+       "3     Updated local media sources from Jakarta indic...   \n",
+       "4     According to local police in Jakarta, two expl...   \n",
+       "...                                                 ...   \n",
+       "5777  Intelligence received by Everstream Analytics ...   \n",
+       "5778  Meteorological sources indicate that a series ...   \n",
+       "5779  Industry sources report on December 7 that Svi...   \n",
+       "5780  Industry sources indicate on December 14 that ...   \n",
+       "5781  On November 17, Dutch media sources reported t...   \n",
+       "\n",
+       "                                               Category  \\\n",
+       "0                                   Mine Workers Strike   \n",
+       "1                                        Travel Warning   \n",
+       "2                                       Port Congestion   \n",
+       "3                            Bombing, Police Operations   \n",
+       "4                            Bombing, Police Operations   \n",
+       "...                                                 ...   \n",
+       "5777                                          Ice Storm   \n",
+       "5778  Roadway Closure / Disruption, Ground Transport...   \n",
+       "5779                                  Industrial Action   \n",
+       "5780                                        Port Strike   \n",
+       "5781                                        Port Strike   \n",
+       "\n",
+       "                                        Details_cleaned  \\\n",
+       "0     medium source indicate worker grasberg mine ex...   \n",
+       "1     news source stating recent typhoon impact hong...   \n",
+       "2     persisting port congestion shanghai ’ yangshan...   \n",
+       "3     updated local medium source jakarta indicate e...   \n",
+       "4     according local police jakarta two explosion c...   \n",
+       "...                                                 ...   \n",
+       "5777  intelligence received everstream analytics ind...   \n",
+       "5778  meteorological source indicate series winter s...   \n",
+       "5779  industry source report december 7 svitzer aust...   \n",
+       "5780  industry source indicate december 14 worker dp...   \n",
+       "5781  november 17 dutch medium source reported worke...   \n",
+       "\n",
+       "                                       Category_cleaned      Category_single  \\\n",
+       "0                                   Mine Workers Strike  Mine Workers Strike   \n",
+       "1                                        Travel Warning       Travel Warning   \n",
+       "2                                       Port Congestion      Port Congestion   \n",
+       "3                            Bombing, Police Operations              Bombing   \n",
+       "4                            Bombing, Police Operations              Bombing   \n",
+       "...                                                 ...                  ...   \n",
+       "5777                                          Ice Storm            Ice Storm   \n",
+       "5778  Roadway Closure / Disruption, Ground Transport...      Roadway Closure   \n",
+       "5779                                  Industrial Action    Industrial Action   \n",
+       "5780                                        Port Strike          Port Strike   \n",
+       "5781                                        Port Strike          Port Strike   \n",
+       "\n",
+       "          Summarized_label  \n",
+       "0            Worker Strike  \n",
+       "1     Administrative Issue  \n",
+       "2     Administrative Issue  \n",
+       "3                Terrorism  \n",
+       "4                Terrorism  \n",
+       "...                    ...  \n",
+       "5777  Administrative Issue  \n",
+       "5778  Administrative Issue  \n",
+       "5779  Administrative Issue  \n",
+       "5780         Worker Strike  \n",
+       "5781         Worker Strike  \n",
+       "\n",
+       "[5780 rows x 6 columns]"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Invert the grouping once: fine-grained label -> summarized group.\n",
+    "# If a label is listed under several groups, the first group wins (setdefault).\n",
+    "label_to_group = {}\n",
+    "for group, labels in new_labels_dict.items():\n",
+    "    for label in labels:\n",
+    "        label_to_group.setdefault(label, group)\n",
+    "\n",
+    "# One dict lookup per row instead of iterrows() plus a scan of every group list;\n",
+    "# labels that are not in the grouping stay None.\n",
+    "result_df[\"Summarized_label\"] = [\n",
+    "    label_to_group.get(label) for label in result_df[\"Category_single\"]\n",
+    "]\n",
+    "result_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "27d12104",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Details</th>\n",
+       "      <th>Category</th>\n",
+       "      <th>Details_cleaned</th>\n",
+       "      <th>Category_cleaned</th>\n",
+       "      <th>Category_single</th>\n",
+       "      <th>Summarized_label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>5780</td>\n",
+       "      <td>5780</td>\n",
+       "      <td>5780</td>\n",
+       "      <td>5780</td>\n",
+       "      <td>5780</td>\n",
+       "      <td>5780</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>unique</th>\n",
+       "      <td>5750</td>\n",
+       "      <td>857</td>\n",
+       "      <td>5744</td>\n",
+       "      <td>857</td>\n",
+       "      <td>94</td>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>top</th>\n",
+       "      <td>Updated media sources indicated on December 4 ...</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "      <td>source indicated july 23 captain port united s...</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "      <td>Port Congestion</td>\n",
+       "      <td>Administrative Issue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>freq</th>\n",
+       "      <td>3</td>\n",
+       "      <td>710</td>\n",
+       "      <td>3</td>\n",
+       "      <td>710</td>\n",
+       "      <td>791</td>\n",
+       "      <td>3210</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                  Details         Category  \\\n",
+       "count                                                5780             5780   \n",
+       "unique                                               5750              857   \n",
+       "top     Updated media sources indicated on December 4 ...  Port Congestion   \n",
+       "freq                                                    3              710   \n",
+       "\n",
+       "                                          Details_cleaned Category_cleaned  \\\n",
+       "count                                                5780             5780   \n",
+       "unique                                               5744              857   \n",
+       "top     source indicated july 23 captain port united s...  Port Congestion   \n",
+       "freq                                                    3              710   \n",
+       "\n",
+       "        Category_single      Summarized_label  \n",
+       "count              5780                  5780  \n",
+       "unique               94                     8  \n",
+       "top     Port Congestion  Administrative Issue  \n",
+       "freq                791                  3210  "
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result_df.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "b708ae97",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result_df.to_csv(\"data/processed_data.csv\", index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}