{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "e550a89c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "working dir: /Users/inflaton/code/engd/papers/maritime/global-incidents\n", "loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "import sys\n", "from pathlib import Path\n", "\n", "working_dir = str(Path.cwd().parent)\n", "os.chdir(working_dir)\n", "sys.path.append(working_dir)\n", "print(\"working dir:\", working_dir)\n", "\n", "from dotenv import find_dotenv, load_dotenv\n", "\n", "found_dotenv = find_dotenv(\".env\")\n", "\n", "if len(found_dotenv) == 0:\n", " found_dotenv = find_dotenv(\".env.example\")\n", "print(f\"loading env vars from: {found_dotenv}\")\n", "load_dotenv(found_dotenv, override=True)" ] }, { "cell_type": "markdown", "id": "1fecbc87", "metadata": {}, "source": [ "## Import Statement" ] }, { "cell_type": "code", "execution_count": 2, "id": "5169e3ee", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "id": "76905f72", "metadata": {}, "source": [ "### Read the data" ] }, { "cell_type": "code", "execution_count": 3, "id": "b1043895", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"data/all_port_labelled.csv\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "2e40d90a", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
<p>2 rows × 46 columns</p>
" ], "text/plain": [ " Unnamed: 0 Index Unnamed: 0.1 \\\n", "0 0.0 8.0 34.0 \n", "1 1.0 10.0 63.0 \n", "\n", " Headline \\\n", "0 Grasberg Mine- Grasberg mine workers extend st... \n", "1 Indonesia: Undersea internet cables damaged by... \n", "\n", " Details Severity \\\n", "0 Media sources indicate that workers at the Gra... Moderate \n", "1 News sources are stating that recent typhoons ... Minor \n", "\n", " Category Region Datetime Year ... IT EP NEW \\\n", "0 Mine Workers Strike Indonesia 28/5/17 17:08 2017.0 ... 0.0 0.0 0.0 \n", "1 Travel Warning Indonesia 4/9/17 14:30 2017.0 ... 0.0 0.0 0.0 \n", "\n", " CSD RPE MN NM if_labeled Month Week \n", "0 0.0 0.0 0.0 1.0 False 5.0 21.0 \n", "1 0.0 0.0 1.0 0.0 False 4.0 14.0 \n", "\n", "[2 rows x 46 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(2)" ] }, { "cell_type": "markdown", "id": "643a7e40", "metadata": {}, "source": [ "### Clean empty data" ] }, { "cell_type": "code", "execution_count": 5, "id": "d6ee1fd7", "metadata": {}, "outputs": [], "source": [ "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "from nltk.stem import WordNetLemmatizer\n", "import string\n", "\n", "# nltk.download('punkt')\n", "# nltk.download('stopwords')\n", "# nltk.download('wordnet')\n", "\n", "\n", "def clean_text(text):\n", " # Lowercase\n", " text = text.lower()\n", " # Tokenization\n", " tokens = word_tokenize(text)\n", " # Removing punctuation\n", " tokens = [word for word in tokens if word not in string.punctuation]\n", " # Removing stop words\n", " stop_words = set(stopwords.words(\"english\"))\n", " tokens = [word for word in tokens if word not in stop_words]\n", " # Lemmatization\n", " lemmatizer = WordNetLemmatizer()\n", " tokens = [lemmatizer.lemmatize(word) for word in tokens]\n", "\n", " return \" \".join(tokens)" ] }, { "cell_type": "code", "execution_count": 6, "id": "9e35b49a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package omw-1.4 to\n", "[nltk_data] /Users/inflaton/nltk_data...\n", "[nltk_data] Package omw-1.4 is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import nltk\n", "\n", "nltk.download(\"omw-1.4\")" ] }, { "cell_type": "markdown", "id": "ca331c4b", "metadata": {}, "source": [ "### The Details column has an issue\n", "\n", "some of the data are of the type float and none of the text processing functions can be applied to it therefore we have to process it" ] }, { "cell_type": "code", "execution_count": 7, "id": "2438c58f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 5782 entries, 0 to 5781\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Details 5781 non-null object\n", " 1 maritime_label 5781 non-null object\n", "dtypes: object(2)\n", "memory usage: 90.5+ KB\n", "\n", "RangeIndex: 5782 entries, 0 to 5781\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Details 5781 non-null object\n", " 1 maritime_label 5781 non-null object\n", " 2 Details_cleaned 5781 non-null object\n", "dtypes: object(3)\n", "memory usage: 135.6+ KB\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ 
"/var/folders/7x/56svhln929zdh2xhr3mwqg4r0000gn/T/ipykernel_16237/2443564520.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " text_df['Details_cleaned'] = text_df['Details'].apply(lambda x: clean_text(x) if not isinstance(x, float) else None)\n" ] } ], "source": [ "text_df = df[[\"Details\", \"maritime_label\"]]\n", "text_df.info()\n", "text_df[\"Details_cleaned\"] = text_df[\"Details\"].apply(\n", " lambda x: clean_text(x) if not isinstance(x, float) else None\n", ")\n", "# no_nan_df[no_nan_df[\"Details\"].apply(lambda x: print(type(x)))]\n", "# cleaned_df = text_df[text_df[\"Details\"].apply(lambda x: clean_text(x))]\n", "# cleaned_df = df['Details'][1:2]\n", "# type(no_nan_df[\"Details\"][0])\n", "# print(clean_text(no_nan_df[\"Details\"][0]))\n", "text_df.info()" ] }, { "cell_type": "code", "execution_count": 8, "id": "4d3b0011", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<p>5 rows × 3 columns</p>
" ], "text/plain": [ " Details maritime_label \\\n", "0 Media sources indicate that workers at the Gra... FALSE \n", "1 News sources are stating that recent typhoons ... FALSE \n", "2 The persisting port congestion at Shanghai’s Y... TRUE \n", "3 Updated local media sources from Jakarta indic... TRUE \n", "4 According to local police in Jakarta, two expl... TRUE \n", "\n", " Details_cleaned \n", "0 medium source indicate worker grasberg mine ex... \n", "1 news source stating recent typhoon impact hong... \n", "2 persisting port congestion shanghai ’ yangshan... \n", "3 updated local medium source jakarta indicate e... \n", "4 according local police jakarta two explosion c... " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "processed_data = text_df.dropna()\n", "processed_data.head()" ] }, { "cell_type": "markdown", "id": "3c4be609", "metadata": {}, "source": [ "## Naive Bayes Model" ] }, { "cell_type": "code", "execution_count": 9, "id": "5c660011", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "# from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import accuracy_score, classification_report" ] }, { "cell_type": "code", "execution_count": 10, "id": "8f009a65", "metadata": {}, "outputs": [], "source": [ "X = processed_data[\"Details_cleaned\"]\n", "y = processed_data[\"maritime_label\"]" ] }, { "cell_type": "code", "execution_count": 11, "id": "0185a967", "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=0.2, random_state=42\n", ")" ] }, { "cell_type": "code", "execution_count": 12, "id": "d3c2de6b", "metadata": {}, "outputs": [], "source": [ "# vectorizer = CountVectorizer()\n", "# X_train_vec = vectorizer.fit_transform(X_train)\n", "# X_test_vec = vectorizer.transform(X_test)\n", "\n", "tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n", "X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n", "X_test_tfidf = tfidf_vectorizer.transform(X_test)" ] }, { "cell_type": "code", "execution_count": 13, "id": "ead2fc7a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<pre>MultinomialNB()</pre>
" ], "text/plain": [ "MultinomialNB()" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "naive_bayes = MultinomialNB()\n", "naive_bayes.fit(X_train_tfidf, y_train)" ] }, { "cell_type": "code", "execution_count": 14, "id": "74c5df68", "metadata": {}, "outputs": [], "source": [ "predictions = naive_bayes.predict(X_test_tfidf)" ] }, { "cell_type": "code", "execution_count": 15, "id": "109e9456", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of Naive Bayes model: 0.8582541054451167\n", " precision recall f1-score support\n", "\n", " FALSE 0.88 0.94 0.91 847\n", " TRUE 0.79 0.65 0.71 310\n", "\n", " accuracy 0.86 1157\n", " macro avg 0.83 0.79 0.81 1157\n", "weighted avg 0.85 0.86 0.85 1157\n", "\n" ] } ], "source": [ "accuracy = accuracy_score(y_test, predictions)\n", "print(\"Accuracy of Naive Bayes model:\", accuracy)\n", "print(classification_report(y_test, predictions))" ] }, { "cell_type": "markdown", "id": "9518614a", "metadata": {}, "source": [ "## Logistic Regression model" ] }, { "cell_type": "code", "execution_count": 16, "id": "912ad7a6", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score" ] }, { "cell_type": "code", "execution_count": 17, "id": "03eac734", "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=0.2, random_state=42\n", ")" ] }, { "cell_type": "code", "execution_count": 18, "id": "e84ff87c", "metadata": {}, "outputs": [], "source": [ "tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n", "X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n", "X_test_tfidf = tfidf_vectorizer.transform(X_test)" ] }, { "cell_type": "code", "execution_count": 19, "id": "cedb263c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<pre>LogisticRegression()</pre>
" ], "text/plain": [ "LogisticRegression()" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = LogisticRegression()\n", "model.fit(X_train_tfidf, y_train)" ] }, { "cell_type": "code", "execution_count": 20, "id": "6f49fddb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of Logistic Regression Model: 0.9308556611927399\n", " precision recall f1-score support\n", "\n", " FALSE 0.92 0.99 0.95 847\n", " TRUE 0.98 0.76 0.86 310\n", "\n", " accuracy 0.93 1157\n", " macro avg 0.95 0.88 0.90 1157\n", "weighted avg 0.93 0.93 0.93 1157\n", "\n" ] } ], "source": [ "y_pred = model.predict(X_test_tfidf)\n", "\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"Accuracy of Logistic Regression Model:\", accuracy)\n", "print(classification_report(y_test, y_pred))" ] }, { "cell_type": "markdown", "id": "613c0cdf", "metadata": {}, "source": [ "## Support Vector Machine (SVM) model" ] }, { "cell_type": "code", "execution_count": 21, "id": "706302c1", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.svm import SVC\n", "from sklearn.metrics import accuracy_score" ] }, { "cell_type": "code", "execution_count": 22, "id": "b0988ca4", "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=0.2, random_state=42\n", ")" ] }, { "cell_type": "code", "execution_count": 23, "id": "4f682c60", "metadata": {}, "outputs": [], "source": [ "tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n", "X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n", "X_test_tfidf = tfidf_vectorizer.transform(X_test)" ] }, { "cell_type": "code", "execution_count": 24, "id": "71ae91d9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<pre>SVC(kernel='linear')</pre>
" ], "text/plain": [ "SVC(kernel='linear')" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "svm_model = SVC(kernel=\"linear\")\n", "svm_model.fit(X_train_tfidf, y_train)" ] }, { "cell_type": "code", "execution_count": 25, "id": "2dc1b193", "metadata": {}, "outputs": [], "source": [ "y_pred = svm_model.predict(X_test_tfidf)" ] }, { "cell_type": "code", "execution_count": 26, "id": "92801e61", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of SVM model: 0.9524632670700086\n", " precision recall f1-score support\n", "\n", " FALSE 0.94 1.00 0.97 847\n", " TRUE 1.00 0.83 0.90 310\n", "\n", " accuracy 0.95 1157\n", " macro avg 0.97 0.91 0.94 1157\n", "weighted avg 0.96 0.95 0.95 1157\n", "\n" ] } ], "source": [ "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"Accuracy of SVM model:\", accuracy)\n", "print(classification_report(y_test, y_pred))" ] }, { "cell_type": "markdown", "id": "1d1f6ebd", "metadata": {}, "source": [ "## Random Forest Model" ] }, { "cell_type": "code", "execution_count": 27, "id": "9170c174", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import accuracy_score" ] }, { "cell_type": "code", "execution_count": 28, "id": "2092ca05", "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=0.2, random_state=42\n", ")" ] }, { "cell_type": "code", "execution_count": 29, "id": "206296ce", "metadata": {}, "outputs": [], "source": [ "tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n", "X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n", "X_test_tfidf = tfidf_vectorizer.transform(X_test)" ] }, { "cell_type": "code", "execution_count": 30, "id": "258bd78f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<pre>RandomForestClassifier(random_state=42)</pre>
" ], "text/plain": [ "RandomForestClassifier(random_state=42)" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n", "rf_model.fit(X_train_tfidf, y_train)" ] }, { "cell_type": "code", "execution_count": 31, "id": "0e2910f6", "metadata": {}, "outputs": [], "source": [ "y_pred = rf_model.predict(X_test_tfidf)" ] }, { "cell_type": "code", "execution_count": 32, "id": "f06900d3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of Random Forest Model: 0.9628349178910977\n", " precision recall f1-score support\n", "\n", " FALSE 0.96 1.00 0.98 847\n", " TRUE 0.99 0.87 0.93 310\n", "\n", " accuracy 0.96 1157\n", " macro avg 0.97 0.93 0.95 1157\n", "weighted avg 0.96 0.96 0.96 1157\n", "\n" ] } ], "source": [ "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"Accuracy of Random Forest Model:\", accuracy)\n", "print(classification_report(y_test, y_pred))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }