diff --git "a/notebooks/01a_Classification models on incident category_Clean Data.ipynb" "b/notebooks/01a_Classification models on incident category_Clean Data.ipynb" new file mode 100644--- /dev/null +++ "b/notebooks/01a_Classification models on incident category_Clean Data.ipynb" @@ -0,0 +1,1673 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "feaf77ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents\n", + "loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "import sys\n", + "from pathlib import Path\n", + "\n", + "workding_dir = str(Path.cwd().parent)\n", + "os.chdir(workding_dir)\n", + "sys.path.append(workding_dir)\n", + "print(\"workding dir:\", workding_dir)\n", + "\n", + "from dotenv import find_dotenv, load_dotenv\n", + "\n", + "found_dotenv = find_dotenv(\".env\")\n", + "\n", + "if len(found_dotenv) == 0:\n", + " found_dotenv = find_dotenv(\".env.example\")\n", + "print(f\"loading env vars from: {found_dotenv}\")\n", + "load_dotenv(found_dotenv, override=True)" + ] + }, + { + "cell_type": "markdown", + "id": "3a7dd7d8", + "metadata": {}, + "source": [ + "## Import Statement" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "86fc25e6", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "fac53e88", + "metadata": {}, + "source": [ + "### read the data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "dc33b13b", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"data/all_port_labelled.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "31f58fd1", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Unnamed: 0</th>\n", + " <th>Index</th>\n", + " <th>Unnamed: 0.1</th>\n", + " <th>Headline</th>\n", + " <th>Details</th>\n", + " <th>Severity</th>\n", + " <th>Category</th>\n", + " <th>Region</th>\n", + " <th>Datetime</th>\n", + " <th>Year</th>\n", + " <th>...</th>\n", + " <th>IT</th>\n", + " <th>EP</th>\n", + " <th>NEW</th>\n", + " <th>CSD</th>\n", + " <th>RPE</th>\n", + " <th>MN</th>\n", + " <th>NM</th>\n", + " <th>if_labeled</th>\n", + " <th>Month</th>\n", + " <th>Week</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0.0</td>\n", + " <td>8.0</td>\n", + " <td>34.0</td>\n", + " <td>Grasberg Mine- Grasberg mine workers extend st...</td>\n", + " <td>Media sources indicate that workers at the Gra...</td>\n", + " <td>Moderate</td>\n", + " <td>Mine Workers Strike</td>\n", + " <td>Indonesia</td>\n", + " <td>28/5/17 17:08</td>\n", + " <td>2017.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1.0</td>\n", + " <td>False</td>\n", + " <td>5.0</td>\n", + " <td>21.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1.0</td>\n", + " <td>10.0</td>\n", + " <td>63.0</td>\n", + " <td>Indonesia: Undersea internet cables damaged by...</td>\n", + " <td>News sources are stating that recent typhoons ...</td>\n", + " <td>Minor</td>\n", + " <td>Travel Warning</td>\n", + " <td>Indonesia</td>\n", + " <td>4/9/17 14:30</td>\n", + " <td>2017.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>False</td>\n", + " <td>4.0</td>\n", + " <td>14.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>2 rows × 46 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Unnamed: 0 Index Unnamed: 0.1 \\\n", + "0 0.0 8.0 34.0 \n", + "1 1.0 10.0 63.0 \n", + "\n", + " Headline \\\n", + "0 Grasberg Mine- Grasberg mine workers extend st... \n", + "1 Indonesia: Undersea internet cables damaged by... \n", + "\n", + " Details Severity \\\n", + "0 Media sources indicate that workers at the Gra... Moderate \n", + "1 News sources are stating that recent typhoons ... Minor \n", + "\n", + " Category Region Datetime Year ... IT EP NEW \\\n", + "0 Mine Workers Strike Indonesia 28/5/17 17:08 2017.0 ... 0.0 0.0 0.0 \n", + "1 Travel Warning Indonesia 4/9/17 14:30 2017.0 ... 0.0 0.0 0.0 \n", + "\n", + " CSD RPE MN NM if_labeled Month Week \n", + "0 0.0 0.0 0.0 1.0 False 5.0 21.0 \n", + "1 0.0 0.0 1.0 0.0 False 4.0 14.0 \n", + "\n", + "[2 rows x 46 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "9bff68c9", + "metadata": {}, + "source": [ + "### Clean empty data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "41aa751c", + "metadata": {}, + "outputs": [], + "source": [ + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.stem import WordNetLemmatizer\n", + "import string\n", + "\n", + "# nltk.download('punkt')\n", + "# nltk.download('stopwords')\n", + "# nltk.download('wordnet')\n", + "\n", + "\n", + "def clean_text(text):\n", + " # Lowercase\n", + " text = text.lower()\n", + " # Tokenization\n", + " tokens = word_tokenize(text)\n", + " # Removing punctuation\n", + " tokens = [word for word in tokens if word not in string.punctuation]\n", + " # Removing stop words\n", + " stop_words = set(stopwords.words(\"english\"))\n", + " tokens = [word for word in tokens if word not in stop_words]\n", + " # Lemmatization\n", + " lemmatizer = WordNetLemmatizer()\n", + " tokens = [lemmatizer.lemmatize(word) for word in tokens]\n", + "\n", + " return \" \".join(tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6293f613", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package omw-1.4 to\n", + "[nltk_data] /Users/inflaton/nltk_data...\n", + "[nltk_data] Package omw-1.4 is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import nltk\n", + "\n", + "nltk.download(\"omw-1.4\")" + ] + }, + { + "cell_type": "markdown", + "id": "fad3210d", + "metadata": {}, + "source": [ + "### The Details column has an issue\n", + "\n", + "some of the data are of the type float and none of the text processing functions can be applied to it therefore we have to process it" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b1799269", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 5782 entries, 0 to 5781\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Details 5781 non-null object\n", + " 1 Category 5780 non-null object\n", + "dtypes: object(2)\n", + "memory usage: 90.5+ KB\n", + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 5782 entries, 0 to 5781\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Details 5781 non-null object\n", + " 1 Category 5780 non-null object\n", + " 2 Details_cleaned 5781 non-null object\n", + " 3 Category_cleaned 5780 non-null object\n", + "dtypes: object(4)\n", + "memory usage: 180.8+ KB\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/7x/56svhln929zdh2xhr3mwqg4r0000gn/T/ipykernel_76478/4121100139.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " text_df[\"Details_cleaned\"] = text_df[\"Details\"].apply(\n", + "/var/folders/7x/56svhln929zdh2xhr3mwqg4r0000gn/T/ipykernel_76478/4121100139.py:6: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " text_df[\"Category_cleaned\"] = text_df[\"Category\"].apply(\n" + ] + } + ], + "source": [ + "text_df = df[[\"Details\", \"Category\"]]\n", + "text_df.info()\n", + "text_df[\"Details_cleaned\"] = text_df[\"Details\"].apply(\n", + " lambda x: clean_text(x) if not isinstance(x, float) else None\n", + ")\n", + "text_df[\"Category_cleaned\"] = text_df[\"Category\"].apply(\n", + " lambda x: None if isinstance(x, float) else x\n", + ")\n", + "\n", + "# no_nan_df[no_nan_df[\"Details\"].apply(lambda x: print(type(x)))]\n", + "# cleaned_df = text_df[text_df[\"Details\"].apply(lambda x: clean_text(x))]\n", + "# cleaned_df = df['Details'][1:2]\n", + "# type(no_nan_df[\"Details\"][0])\n", + "# print(clean_text(no_nan_df[\"Details\"][0]))\n", + "text_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5fcc3b33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Details</th>\n", + " <th>Category</th>\n", + " <th>Details_cleaned</th>\n", + " <th>Category_cleaned</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Media sources indicate that workers at the Gra...</td>\n", + " <td>Mine Workers Strike</td>\n", + " <td>medium source indicate worker grasberg mine ex...</td>\n", + " <td>Mine Workers Strike</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>News sources are stating that recent typhoons ...</td>\n", + " <td>Travel Warning</td>\n", + " <td>news source stating recent typhoon impact hong...</td>\n", + " <td>Travel Warning</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>The persisting port congestion at Shanghai’s Y...</td>\n", + " <td>Port Congestion</td>\n", + " <td>persisting port congestion shanghai ’ yangshan...</td>\n", + " <td>Port Congestion</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Updated local media sources from Jakarta indic...</td>\n", + " <td>Bombing, Police Operations</td>\n", + " <td>updated local medium source jakarta indicate e...</td>\n", + " <td>Bombing, Police Operations</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>According to local police in Jakarta, two expl...</td>\n", + " <td>Bombing, Police Operations</td>\n", + " <td>according local police jakarta two explosion c...</td>\n", + " <td>Bombing, Police Operations</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Severe winds have downed billboards and trees ...</td>\n", + " <td>Roadway Closure / Disruption, Flooding, Severe...</td>\n", + " <td>severe wind downed billboard tree bandung wedn...</td>\n", + " <td>Roadway Closure / Disruption, Flooding, Severe...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>Local media sources indicated on October 29 th...</td>\n", + " <td>Cargo/Warehouse Theft</td>\n", + " <td>local medium source indicated october 29 wareh...</td>\n", + " <td>Cargo/Warehouse Theft</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>Tropical Storm Rumbia had dissipated after tra...</td>\n", + " <td>Tropical Cyclone / Storm</td>\n", + " <td>tropical storm rumbia dissipated travelling ar...</td>\n", + " <td>Tropical Cyclone / Storm</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>Tropical Depression Yutu, also referred to as ...</td>\n", + " <td>Storm</td>\n", + " <td>tropical depression yutu also referred `` '' r...</td>\n", + " <td>Storm</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>A magnitude 4.5 earthquake was detected 14 mil...</td>\n", + " <td>Earthquake</td>\n", + " <td>magnitude 4.5 earthquake detected 14 mile nort...</td>\n", + " <td>Earthquake</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Details \\\n", + "0 Media sources indicate that workers at the Gra... \n", + "1 News sources are stating that recent typhoons ... \n", + "2 The persisting port congestion at Shanghai’s Y... \n", + "3 Updated local media sources from Jakarta indic... \n", + "4 According to local police in Jakarta, two expl... \n", + "5 Severe winds have downed billboards and trees ... \n", + "6 Local media sources indicated on October 29 th... \n", + "7 Tropical Storm Rumbia had dissipated after tra... \n", + "8 Tropical Depression Yutu, also referred to as ... \n", + "9 A magnitude 4.5 earthquake was detected 14 mil... \n", + "\n", + " Category \\\n", + "0 Mine Workers Strike \n", + "1 Travel Warning \n", + "2 Port Congestion \n", + "3 Bombing, Police Operations \n", + "4 Bombing, Police Operations \n", + "5 Roadway Closure / Disruption, Flooding, Severe... \n", + "6 Cargo/Warehouse Theft \n", + "7 Tropical Cyclone / Storm \n", + "8 Storm \n", + "9 Earthquake \n", + "\n", + " Details_cleaned \\\n", + "0 medium source indicate worker grasberg mine ex... \n", + "1 news source stating recent typhoon impact hong... \n", + "2 persisting port congestion shanghai ’ yangshan... \n", + "3 updated local medium source jakarta indicate e... \n", + "4 according local police jakarta two explosion c... \n", + "5 severe wind downed billboard tree bandung wedn... \n", + "6 local medium source indicated october 29 wareh... \n", + "7 tropical storm rumbia dissipated travelling ar... \n", + "8 tropical depression yutu also referred `` '' r... \n", + "9 magnitude 4.5 earthquake detected 14 mile nort... \n", + "\n", + " Category_cleaned \n", + "0 Mine Workers Strike \n", + "1 Travel Warning \n", + "2 Port Congestion \n", + "3 Bombing, Police Operations \n", + "4 Bombing, Police Operations \n", + "5 Roadway Closure / Disruption, Flooding, Severe... \n", + "6 Cargo/Warehouse Theft \n", + "7 Tropical Cyclone / Storm \n", + "8 Storm \n", + "9 Earthquake " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "processed_data = text_df.dropna()\n", + "processed_data.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d02b4b00", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "857" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "processed_data[\"Category\"].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9ee856a1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "Index: 5780 entries, 0 to 5781\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Details 5780 non-null object\n", + " 1 Category 5780 non-null object\n", + " 2 Details_cleaned 5780 non-null object\n", + " 3 Category_cleaned 5780 non-null object\n", + "dtypes: object(4)\n", + "memory usage: 225.8+ KB\n" + ] + } + ], + "source": [ + "processed_data.info()" + ] + }, + { + "cell_type": "markdown", + "id": "3f6d478f", + "metadata": {}, + "source": [ + "## Process the Category column\n", + "this is not seldom done as we don't usually process the y of the data\n", + "However, the category is too complex and requires processing if not the labels are just too much" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "285013d3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "111\n" + ] + } + ], + "source": [ + "# Create a function that will split the labels into individual\n", + "import re\n", + "\n", + "\n", + "def split_string(text):\n", + " # Split the string using either \"/\" or \",\" as separator\n", + " words = re.split(r\"[\\/,]\", text)\n", + " # Remove any leading or trailing whitespace from each word\n", + " words = [word.strip() for word in words if word.strip()]\n", + " return words\n", + "\n", + "\n", + "# Example usage:\n", + "# input_str = \"Roadway Closure / Disruption, Flooding, Severe Winds, Weather Advisory\"\n", + "# result = split_string(input_str)\n", + "# print(result)\n", + "\n", + "# create a list to find the number of unique individual labels\n", + "label_list = []\n", + "\n", + "for i in processed_data[\"Category_cleaned\"]:\n", + " for j in split_string(i):\n", + " if j not in label_list:\n", + " label_list.append(j)\n", + "\n", + "# print(label)\n", + "print(len(label_list))" + ] + }, + { + "cell_type": "markdown", + "id": "8e7b48e8", + "metadata": {}, + "source": [ + "#### After filtering out the unique labels in the Category column we are still left with 111 labels which is still considered too much" + ] + }, + { + "cell_type": "markdown", + "id": "33234f8c", + "metadata": {}, + "source": [ + "#### The next step would be to to reduce a data's category label into 1 single label \n", + "Previously the data looks like Roadway Closure / Disruption, Flooding, Severe... we need to reduce it to 1 single label \n", + "The next process we are going to use in is that we assume the first label in is the most prominent category then we will remove the other categories" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "12f9b9b4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "Index: 5780 entries, 0 to 5781\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Details 5780 non-null object\n", + " 1 Category 5780 non-null object\n", + " 2 Details_cleaned 5780 non-null object\n", + " 3 Category_cleaned 5780 non-null object\n", + " 4 Category_single 5780 non-null object\n", + "dtypes: object(5)\n", + "memory usage: 270.9+ KB\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/7x/56svhln929zdh2xhr3mwqg4r0000gn/T/ipykernel_76478/2791632185.py:29: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " text_df[\"Category_single\"] = text_df[\"Category_cleaned\"].apply(\n" + ] + } + ], + "source": [ + "def split_and_get_first(text):\n", + " # Split the string using either \"/\" or \",\" as separator\n", + " if text == None:\n", + " return None\n", + " words = re.split(r\"[\\/,]\", text)\n", + " # Remove any leading or trailing whitespace from each word\n", + " words = [word.strip() for word in words if word.strip()]\n", + " # Return the first word after split\n", + " if words:\n", + " return words[0]\n", + " else:\n", + " return None\n", + "\n", + "\n", + "def remove_none_rows(df, column_name):\n", + " # Iterate through the DataFrame\n", + " for index, value in enumerate(df[column_name]):\n", + " # Check if the value is None\n", + " if value is None:\n", + " # Remove the row where the data belongs to\n", + " df = df.drop(index, axis=0)\n", + " return df\n", + "\n", + "\n", + "# Example usage:\n", + "# input_str = \"Roadway Closure / Disruption, Flooding, Severe Winds, Weather Advisory\"\n", + "# result = split_and_get_first(input_str)\n", + "# print(result)\n", + "text_df[\"Category_single\"] = text_df[\"Category_cleaned\"].apply(\n", + " lambda x: split_and_get_first(x)\n", + ")\n", + "result_df = remove_none_rows(text_df, \"Category_cleaned\")\n", + "result_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b5931fe1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Details</th>\n", + " <th>Category</th>\n", + " <th>Details_cleaned</th>\n", + " <th>Category_cleaned</th>\n", + " <th>Category_single</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Media sources indicate that workers at the Gra...</td>\n", + " <td>Mine Workers Strike</td>\n", + " <td>medium source indicate worker grasberg mine ex...</td>\n", + " <td>Mine Workers Strike</td>\n", + " <td>Mine Workers Strike</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>News sources are stating that recent typhoons ...</td>\n", + " <td>Travel Warning</td>\n", + " <td>news source stating recent typhoon impact hong...</td>\n", + " <td>Travel Warning</td>\n", + " <td>Travel Warning</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>The persisting port congestion at Shanghai’s Y...</td>\n", + " <td>Port Congestion</td>\n", + " <td>persisting port congestion shanghai ’ yangshan...</td>\n", + " <td>Port Congestion</td>\n", + " <td>Port Congestion</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Updated local media sources from Jakarta indic...</td>\n", + " <td>Bombing, Police Operations</td>\n", + " <td>updated local medium source jakarta indicate e...</td>\n", + " <td>Bombing, Police Operations</td>\n", + " <td>Bombing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>According to local police in Jakarta, two expl...</td>\n", + " <td>Bombing, Police Operations</td>\n", + " <td>according local police jakarta two explosion c...</td>\n", + " <td>Bombing, Police Operations</td>\n", + " <td>Bombing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Severe winds have downed billboards and trees ...</td>\n", + " <td>Roadway Closure / Disruption, Flooding, Severe...</td>\n", + " <td>severe wind downed billboard tree bandung wedn...</td>\n", + " <td>Roadway Closure / Disruption, Flooding, Severe...</td>\n", + " <td>Roadway Closure</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>Local media sources indicated on October 29 th...</td>\n", + " <td>Cargo/Warehouse Theft</td>\n", + " <td>local medium source indicated october 29 wareh...</td>\n", + " <td>Cargo/Warehouse Theft</td>\n", + " <td>Cargo</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>Tropical Storm Rumbia had dissipated after tra...</td>\n", + " <td>Tropical Cyclone / Storm</td>\n", + " <td>tropical storm rumbia dissipated travelling ar...</td>\n", + " <td>Tropical Cyclone / Storm</td>\n", + " <td>Tropical Cyclone</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>Tropical Depression Yutu, also referred to as ...</td>\n", + " <td>Storm</td>\n", + " <td>tropical depression yutu also referred `` '' r...</td>\n", + " <td>Storm</td>\n", + " <td>Storm</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>A magnitude 4.5 earthquake was detected 14 mil...</td>\n", + " <td>Earthquake</td>\n", + " <td>magnitude 4.5 earthquake detected 14 mile nort...</td>\n", + " <td>Earthquake</td>\n", + " <td>Earthquake</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>Multiple sources report that a magnitude 5.5 e...</td>\n", + " <td>Earthquake</td>\n", + " <td>multiple source report magnitude 5.5 earthquak...</td>\n", + " <td>Earthquake</td>\n", + " <td>Earthquake</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>Post-Tropical Cyclone Michael is approximately...</td>\n", + " <td>Tropical Cyclone / Storm</td>\n", + " <td>post-tropical cyclone michael approximately 18...</td>\n", + " <td>Tropical Cyclone / Storm</td>\n", + " <td>Tropical Cyclone</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>Industry sources indicate on September 11 that...</td>\n", + " <td>Workplace Accident</td>\n", + " <td>industry source indicate september 11 2 worker...</td>\n", + " <td>Workplace Accident</td>\n", + " <td>Workplace Accident</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>Government sources are reporting a tornado has...</td>\n", + " <td>Tornado</td>\n", + " <td>government source reporting tornado touched tw...</td>\n", + " <td>Tornado</td>\n", + " <td>Tornado</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>Media sources are informing on September 24 th...</td>\n", + " <td>Industrial Action</td>\n", + " <td>medium source informing september 24 oil worke...</td>\n", + " <td>Industrial Action</td>\n", + " <td>Industrial Action</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>A magnitude 4.5 earthquake was detected in cen...</td>\n", + " <td>Earthquake</td>\n", + " <td>magnitude 4.5 earthquake detected central taiw...</td>\n", + " <td>Earthquake</td>\n", + " <td>Earthquake</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>Industry sources indicate on August 31 that th...</td>\n", + " <td>Port Congestion</td>\n", + " <td>industry source indicate august 31 port durban...</td>\n", + " <td>Port Congestion</td>\n", + " <td>Port Congestion</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>Tropical Depression Gordon continues to weaken...</td>\n", + " <td>Storm</td>\n", + " <td>tropical depression gordon continues weaken mo...</td>\n", + " <td>Storm</td>\n", + " <td>Storm</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>Local media sources indicated on November 8 th...</td>\n", + " <td>Public Safety / Security</td>\n", + " <td>local medium source indicated november 8 270 k...</td>\n", + " <td>Public Safety / Security</td>\n", + " <td>Public Safety</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>The European-Mediterranean Seismological Centr...</td>\n", + " <td>Earthquake</td>\n", + " <td>european-mediterranean seismological centre re...</td>\n", + " <td>Earthquake</td>\n", + " <td>Earthquake</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Details \\\n", + "0 Media sources indicate that workers at the Gra... \n", + "1 News sources are stating that recent typhoons ... \n", + "2 The persisting port congestion at Shanghai’s Y... \n", + "3 Updated local media sources from Jakarta indic... \n", + "4 According to local police in Jakarta, two expl... \n", + "5 Severe winds have downed billboards and trees ... \n", + "6 Local media sources indicated on October 29 th... \n", + "7 Tropical Storm Rumbia had dissipated after tra... \n", + "8 Tropical Depression Yutu, also referred to as ... \n", + "9 A magnitude 4.5 earthquake was detected 14 mil... \n", + "10 Multiple sources report that a magnitude 5.5 e... \n", + "11 Post-Tropical Cyclone Michael is approximately... \n", + "12 Industry sources indicate on September 11 that... \n", + "13 Government sources are reporting a tornado has... \n", + "14 Media sources are informing on September 24 th... \n", + "15 A magnitude 4.5 earthquake was detected in cen... \n", + "16 Industry sources indicate on August 31 that th... \n", + "17 Tropical Depression Gordon continues to weaken... \n", + "18 Local media sources indicated on November 8 th... \n", + "19 The European-Mediterranean Seismological Centr... \n", + "\n", + " Category \\\n", + "0 Mine Workers Strike \n", + "1 Travel Warning \n", + "2 Port Congestion \n", + "3 Bombing, Police Operations \n", + "4 Bombing, Police Operations \n", + "5 Roadway Closure / Disruption, Flooding, Severe... \n", + "6 Cargo/Warehouse Theft \n", + "7 Tropical Cyclone / Storm \n", + "8 Storm \n", + "9 Earthquake \n", + "10 Earthquake \n", + "11 Tropical Cyclone / Storm \n", + "12 Workplace Accident \n", + "13 Tornado \n", + "14 Industrial Action \n", + "15 Earthquake \n", + "16 Port Congestion \n", + "17 Storm \n", + "18 Public Safety / Security \n", + "19 Earthquake \n", + "\n", + " Details_cleaned \\\n", + "0 medium source indicate worker grasberg mine ex... \n", + "1 news source stating recent typhoon impact hong... \n", + "2 persisting port congestion shanghai ’ yangshan... \n", + "3 updated local medium source jakarta indicate e... \n", + "4 according local police jakarta two explosion c... \n", + "5 severe wind downed billboard tree bandung wedn... \n", + "6 local medium source indicated october 29 wareh... \n", + "7 tropical storm rumbia dissipated travelling ar... \n", + "8 tropical depression yutu also referred `` '' r... \n", + "9 magnitude 4.5 earthquake detected 14 mile nort... \n", + "10 multiple source report magnitude 5.5 earthquak... \n", + "11 post-tropical cyclone michael approximately 18... \n", + "12 industry source indicate september 11 2 worker... \n", + "13 government source reporting tornado touched tw... \n", + "14 medium source informing september 24 oil worke... \n", + "15 magnitude 4.5 earthquake detected central taiw... \n", + "16 industry source indicate august 31 port durban... \n", + "17 tropical depression gordon continues weaken mo... \n", + "18 local medium source indicated november 8 270 k... \n", + "19 european-mediterranean seismological centre re... \n", + "\n", + " Category_cleaned Category_single \n", + "0 Mine Workers Strike Mine Workers Strike \n", + "1 Travel Warning Travel Warning \n", + "2 Port Congestion Port Congestion \n", + "3 Bombing, Police Operations Bombing \n", + "4 Bombing, Police Operations Bombing \n", + "5 Roadway Closure / Disruption, Flooding, Severe... Roadway Closure \n", + "6 Cargo/Warehouse Theft Cargo \n", + "7 Tropical Cyclone / Storm Tropical Cyclone \n", + "8 Storm Storm \n", + "9 Earthquake Earthquake \n", + "10 Earthquake Earthquake \n", + "11 Tropical Cyclone / Storm Tropical Cyclone \n", + "12 Workplace Accident Workplace Accident \n", + "13 Tornado Tornado \n", + "14 Industrial Action Industrial Action \n", + "15 Earthquake Earthquake \n", + "16 Port Congestion Port Congestion \n", + "17 Storm Storm \n", + "18 Public Safety / Security Public Safety \n", + "19 Earthquake Earthquake " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_df.head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9c19b11a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "94" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_df[\"Category_single\"].nunique()" + ] + }, + { + "cell_type": "markdown", + "id": "29d4037f", + "metadata": {}, + "source": [ + "### After taking out the first label in the Category column we are still left with 94 unique labels\n", + "This is still unacceptable amount of labels the next step we are planning to use is to manually group the labels in more generalize label by using a rule based system" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "10f07d05", + "metadata": {}, + "outputs": [], + "source": [ + "### first export the unique labels into excel for better visualization\n", + "unique_labels_df = pd.DataFrame({\"String\": label_list})\n", + "file_path = \"data/label_list.xlsx\"\n", + "\n", + "# Save DataFrame to Excel\n", + "unique_labels_df.to_excel(file_path, index=False)" + ] + }, + { + "attachments": { + "converstion.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "398e6da8", + "metadata": {}, + "source": [ + "![converstion.png](attachment:converstion.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "d4357af0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Weather</th>\n", + " <th>Worker Strike</th>\n", + " <th>Administrative Issue</th>\n", + " <th>Human Error</th>\n", + " <th>Cyber Attack</th>\n", + " <th>Terrorism</th>\n", + " <th>Accident</th>\n", + " <th>Others</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Flooding</td>\n", + " <td>Mine Workers Strike</td>\n", + " <td>Port Congestion</td>\n", + " <td>Workplace Accident</td>\n", + " <td>Network Disruption</td>\n", + " <td>Bombing</td>\n", + " <td>Maritime Accident</td>\n", + " <td>Miscellaneous Events</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Severe Winds</td>\n", + " <td>Production Halt</td>\n", + " <td>Police Operations</td>\n", + " <td>Individuals in Focus</td>\n", + " <td>Ransomware</td>\n", + " <td>Warehouse Theft</td>\n", + " <td>Vehicle Accident</td>\n", + " <td>Miscellaneous Strikes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Weather Advisory</td>\n", + " <td>Protest</td>\n", + " <td>Roadway Closure</td>\n", + " <td>Military Operations</td>\n", + " <td>Data breach</td>\n", + " <td>Public Safety</td>\n", + " <td>Death</td>\n", + " <td>Outbreak of disease</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Tropical Cyclone</td>\n", + " <td>Riot</td>\n", + " <td>Disruption</td>\n", + " <td>Flight Delays</td>\n", + " <td>Phishing</td>\n", + " <td>Security</td>\n", + " <td>Injury</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Storm</td>\n", + " <td>Port Strike</td>\n", + " <td>Cargo</td>\n", + " <td>Cancellations</td>\n", + " <td>NaN</td>\n", + " <td>Organized Crime</td>\n", + " <td>Non-industrial Fire</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Weather Worker Strike Administrative Issue \\\n", + "0 Flooding Mine Workers Strike Port Congestion \n", + "1 Severe Winds Production Halt Police Operations \n", + "2 Weather Advisory Protest Roadway Closure \n", + "3 Tropical Cyclone Riot Disruption \n", + "4 Storm Port Strike Cargo \n", + "\n", + " Human Error Cyber Attack Terrorism \\\n", + "0 Workplace Accident Network Disruption Bombing \n", + "1 Individuals in Focus Ransomware Warehouse Theft \n", + "2 Military Operations Data breach Public Safety \n", + "3 Flight Delays Phishing Security \n", + "4 Cancellations NaN Organized Crime \n", + "\n", + " Accident Others \n", + "0 Maritime Accident Miscellaneous Events \n", + "1 Vehicle Accident Miscellaneous Strikes \n", + "2 Death Outbreak of disease \n", + "3 Injury NaN \n", + "4 Non-industrial Fire NaN " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_labels_df = pd.read_excel(\"data/new_labels.xlsx\")\n", + "new_labels_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "407189c9", + "metadata": {}, + "source": [ + "#### convert them into lists" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "73939327", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "Weather\n", + "\n", + "['Flooding', 'Severe Winds', 'Weather Advisory', 'Tropical Cyclone', 'Storm', 'Earthquake', 'Tornado', 'Typhoon', 'Landslide', 'Water', 'Hurricane', 'Wildfire', 'Blizzard', 'Hail']\n", + "\n", + "\n", + "Worker Strike\n", + "\n", + "['Mine Workers Strike', 'Production Halt', 'Protest', 'Riot', 'Port Strike', 'General Strike', 'Civil Service Strike', 'Civil Unrest Advisory', 'Cargo Transportation Strike', 'Energy Sector Strike']\n", + "\n", + "\n", + "Administrative Issue\n", + "\n", + "['Port Congestion', 'Police Operations', 'Roadway Closure', 'Disruption', 'Cargo', 'Industrial Action', 'Port Disruption', 'Cargo Disruption', 'Power Outage', 'Port Closure', 'Maritime Advisory', 'Train Delays', 'Ground Transportation Advisory', 'Public Transportation Disruption', 'Trade Regulation', 'Customs Regulation', 'Regulatory Advisory', 'Industry Directives', 'Security Advisory', 'Public Holidays', 'Customs Delay', 'Public Health Advisory', 'Detention', 'Aviation Advisory', 'Waterway closure', 'Waterway Closure', 'Plant Closure', 'Border Closure', 'Delay', 'Industrial zone shutdown', 'Trade Restrictions', 'Closure', 'Truck Driving Ban', 'Insolvency', 'Environmental Regulations', 'Postal Disruption', 'Ice Storm', 'Travel Warning']\n", + "\n", + "\n", + "Human Error\n", + "\n", + "['Workplace Accident', 'Individuals in Focus', 'Military Operations', 'Flight Delays', 'Cancellations', 'Political Info', 'Event']\n", + "\n", + "\n", + "Cyber Attack\n", + "\n", + "['Network Disruption', 'Ransomware', 'Data breach', 'Phishing']\n", + "\n", + "\n", + "Terrorism\n", + "\n", + "['Bombing', 'Warehouse Theft', 'Public Safety', 'Security', 'Organized Crime', 'Hazmat Response', 'Piracy', 'Kidnap', 'Shooting', 'Robbery', 'Cargo theft', 'Bomb Detonation', 'Terror Attack', 'Outbreak Of War', 'Militant Action']\n", + "\n", + "\n", + "Accident\n", + "\n", + "['Maritime Accident', 'Vehicle Accident', 'Death', 'Injury', 'Non-industrial Fire', 'Chemical Spill', 'Industrial Fire', 'Fuel Disruption', 'Airline Incident', 'Crash', 'Explosion', 'Train Accident', 'Derailment', 'Sewage Disruption', 'Barge Accident', 'Bridge Collapse', 'Structure Collapse', 'Airport Accident', 'Force Majeure', 'Telecom Outage']\n", + "\n", + "\n", + "Others\n", + "\n", + "['Miscellaneous Events', 'Miscellaneous Strikes', 'Outbreak of disease']\n" + ] + } + ], + "source": [ + "new_labels_dict = new_labels_df.to_dict(orient=\"list\")\n", + "\n", + "\n", + "for key, value in new_labels_dict.items():\n", + " new_labels_dict[key] = [item for item in value if not pd.isnull(item)]\n", + "\n", + "for category in new_labels_dict:\n", + " print(\"\\n\")\n", + " print(category + \"\\n\")\n", + " print(new_labels_dict[category])" + ] + }, + { + "cell_type": "markdown", + "id": "8516af0e", + "metadata": {}, + "source": [ + "### create a new column with the summarized label" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "0d316bb4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Details</th>\n", + " <th>Category</th>\n", + " <th>Details_cleaned</th>\n", + " <th>Category_cleaned</th>\n", + " <th>Category_single</th>\n", + " <th>Summarized_label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Media sources indicate that workers at the Gra...</td>\n", + " <td>Mine Workers Strike</td>\n", + " <td>medium source indicate worker grasberg mine ex...</td>\n", + " <td>Mine Workers Strike</td>\n", + " <td>Mine Workers Strike</td>\n", + " <td>Worker Strike</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>News sources are stating that recent typhoons ...</td>\n", + " <td>Travel Warning</td>\n", + " <td>news source stating recent typhoon impact hong...</td>\n", + " <td>Travel Warning</td>\n", + " <td>Travel Warning</td>\n", + " <td>Administrative Issue</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>The persisting port congestion at Shanghai’s Y...</td>\n", + " <td>Port Congestion</td>\n", + " <td>persisting port congestion shanghai ’ yangshan...</td>\n", + " <td>Port Congestion</td>\n", + " <td>Port Congestion</td>\n", + " <td>Administrative Issue</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Updated local media sources from Jakarta indic...</td>\n", + " <td>Bombing, Police Operations</td>\n", + " <td>updated local medium source jakarta indicate e...</td>\n", + " <td>Bombing, Police Operations</td>\n", + " <td>Bombing</td>\n", + " <td>Terrorism</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>According to local police in Jakarta, two expl...</td>\n", + " <td>Bombing, Police Operations</td>\n", + " <td>according local police jakarta two explosion c...</td>\n", + " <td>Bombing, Police Operations</td>\n", + " <td>Bombing</td>\n", + " <td>Terrorism</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5777</th>\n", + " <td>Intelligence received by Everstream Analytics ...</td>\n", + " <td>Ice Storm</td>\n", + " <td>intelligence received everstream analytics ind...</td>\n", + " <td>Ice Storm</td>\n", + " <td>Ice Storm</td>\n", + " <td>Administrative Issue</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5778</th>\n", + " <td>Meteorological sources indicate that a series ...</td>\n", + " <td>Roadway Closure / Disruption, Ground Transport...</td>\n", + " <td>meteorological source indicate series winter s...</td>\n", + " <td>Roadway Closure / Disruption, Ground Transport...</td>\n", + " <td>Roadway Closure</td>\n", + " <td>Administrative Issue</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5779</th>\n", + " <td>Industry sources report on December 7 that Svi...</td>\n", + " <td>Industrial Action</td>\n", + " <td>industry source report december 7 svitzer aust...</td>\n", + " <td>Industrial Action</td>\n", + " <td>Industrial Action</td>\n", + " <td>Administrative Issue</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5780</th>\n", + " <td>Industry sources indicate on December 14 that ...</td>\n", + " <td>Port Strike</td>\n", + " <td>industry source indicate december 14 worker dp...</td>\n", + " <td>Port Strike</td>\n", + " <td>Port Strike</td>\n", + " <td>Worker Strike</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5781</th>\n", + " <td>On November 17, Dutch media sources reported t...</td>\n", + " <td>Port Strike</td>\n", + " <td>november 17 dutch medium source reported worke...</td>\n", + " <td>Port Strike</td>\n", + " <td>Port Strike</td>\n", + " <td>Worker Strike</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5780 rows × 6 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Details \\\n", + "0 Media sources indicate that workers at the Gra... \n", + "1 News sources are stating that recent typhoons ... \n", + "2 The persisting port congestion at Shanghai’s Y... \n", + "3 Updated local media sources from Jakarta indic... \n", + "4 According to local police in Jakarta, two expl... \n", + "... ... \n", + "5777 Intelligence received by Everstream Analytics ... \n", + "5778 Meteorological sources indicate that a series ... \n", + "5779 Industry sources report on December 7 that Svi... \n", + "5780 Industry sources indicate on December 14 that ... \n", + "5781 On November 17, Dutch media sources reported t... \n", + "\n", + " Category \\\n", + "0 Mine Workers Strike \n", + "1 Travel Warning \n", + "2 Port Congestion \n", + "3 Bombing, Police Operations \n", + "4 Bombing, Police Operations \n", + "... ... \n", + "5777 Ice Storm \n", + "5778 Roadway Closure / Disruption, Ground Transport... \n", + "5779 Industrial Action \n", + "5780 Port Strike \n", + "5781 Port Strike \n", + "\n", + " Details_cleaned \\\n", + "0 medium source indicate worker grasberg mine ex... \n", + "1 news source stating recent typhoon impact hong... \n", + "2 persisting port congestion shanghai ’ yangshan... \n", + "3 updated local medium source jakarta indicate e... \n", + "4 according local police jakarta two explosion c... \n", + "... ... \n", + "5777 intelligence received everstream analytics ind... \n", + "5778 meteorological source indicate series winter s... \n", + "5779 industry source report december 7 svitzer aust... \n", + "5780 industry source indicate december 14 worker dp... \n", + "5781 november 17 dutch medium source reported worke... \n", + "\n", + " Category_cleaned Category_single \\\n", + "0 Mine Workers Strike Mine Workers Strike \n", + "1 Travel Warning Travel Warning \n", + "2 Port Congestion Port Congestion \n", + "3 Bombing, Police Operations Bombing \n", + "4 Bombing, Police Operations Bombing \n", + "... ... ... \n", + "5777 Ice Storm Ice Storm \n", + "5778 Roadway Closure / Disruption, Ground Transport... Roadway Closure \n", + "5779 Industrial Action Industrial Action \n", + "5780 Port Strike Port Strike \n", + "5781 Port Strike Port Strike \n", + "\n", + " Summarized_label \n", + "0 Worker Strike \n", + "1 Administrative Issue \n", + "2 Administrative Issue \n", + "3 Terrorism \n", + "4 Terrorism \n", + "... ... \n", + "5777 Administrative Issue \n", + "5778 Administrative Issue \n", + "5779 Administrative Issue \n", + "5780 Worker Strike \n", + "5781 Worker Strike \n", + "\n", + "[5780 rows x 6 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_df[\"Summarized_label\"] = None\n", + "\n", + "for index, row in result_df.iterrows():\n", + " value = row[\"Category_single\"]\n", + " for key, values in new_labels_dict.items():\n", + " if value in values:\n", + " result_df.at[index, \"Summarized_label\"] = key\n", + " break # No need to check other keys if match found\n", + "result_df" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "27d12104", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Details</th>\n", + " <th>Category</th>\n", + " <th>Details_cleaned</th>\n", + " <th>Category_cleaned</th>\n", + " <th>Category_single</th>\n", + " <th>Summarized_label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>5780</td>\n", + " <td>5780</td>\n", + " <td>5780</td>\n", + " <td>5780</td>\n", + " <td>5780</td>\n", + " <td>5780</td>\n", + " </tr>\n", + " <tr>\n", + " <th>unique</th>\n", + " <td>5750</td>\n", + " <td>857</td>\n", + " <td>5744</td>\n", + " <td>857</td>\n", + " <td>94</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>top</th>\n", + " <td>Updated media sources indicated on December 4 ...</td>\n", + " <td>Port Congestion</td>\n", + " <td>source indicated july 23 captain port united s...</td>\n", + " <td>Port Congestion</td>\n", + " <td>Port Congestion</td>\n", + " <td>Administrative Issue</td>\n", + " </tr>\n", + " <tr>\n", + " <th>freq</th>\n", + " <td>3</td>\n", + " <td>710</td>\n", + " <td>3</td>\n", + " <td>710</td>\n", + " <td>791</td>\n", + " <td>3210</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Details Category \\\n", + "count 5780 5780 \n", + "unique 5750 857 \n", + "top Updated media sources indicated on December 4 ... Port Congestion \n", + "freq 3 710 \n", + "\n", + " Details_cleaned Category_cleaned \\\n", + "count 5780 5780 \n", + "unique 5744 857 \n", + "top source indicated july 23 captain port united s... Port Congestion \n", + "freq 3 710 \n", + "\n", + " Category_single Summarized_label \n", + "count 5780 5780 \n", + "unique 94 8 \n", + "top Port Congestion Administrative Issue \n", + "freq 791 3210 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b708ae97", + "metadata": {}, + "outputs": [], + "source": [ + "result_df.to_csv(\"data/processed_data.csv\", index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}