{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "46322fb5-5918-4b70-9689-9e0781439ac4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents\n", "loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "import sys\n", "from pathlib import Path\n", "\n", "# Resolve the project root only once per kernel. After the first run the cwd is\n", "# already the project root, so recomputing Path.cwd().parent on a re-run would\n", "# climb one directory too far and break every relative path used below.\n", "if \"workding_dir\" not in globals():\n", "    workding_dir = str(Path.cwd().parent)\n", "os.chdir(workding_dir)\n", "# Avoid stacking duplicate sys.path entries when the cell is executed again.\n", "if workding_dir not in sys.path:\n", "    sys.path.append(workding_dir)\n", "print(\"workding dir:\", workding_dir)\n", "\n", "from dotenv import find_dotenv, load_dotenv\n", "\n", "# Prefer a real .env file; fall back to the checked-in example when none is found.\n", "found_dotenv = find_dotenv(\".env\")\n", "\n", "if len(found_dotenv) == 0:\n", "    found_dotenv = find_dotenv(\".env.example\")\n", "print(f\"loading env vars from: {found_dotenv}\")\n", "load_dotenv(found_dotenv, override=True)" ] },
{ "cell_type": "code", "execution_count": 2, "id": "daf1e3d1-75ac-4299-8bed-2f413a49f9a6", "metadata": { "tags": [] }, "outputs": [], "source": [ "import nltk\n", "from nltk.tokenize import sent_tokenize\n", "from nltk.tokenize import word_tokenize\n", "\n", "import gensim\n", "from gensim import corpora\n", "from gensim import similarities\n", "from gensim import models\n", "from gensim.models import CoherenceModel\n", "\n", "# from wordcloud import WordCloud, ImageColorGenerator\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import pandas as pd\n", "import re\n", "import os\n", "import datetime\n", "\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "from pprint import pprint\n", "import pyLDAvis\n", "import pyLDAvis.gensim_models as gensimvis" ] },
{ "cell_type": "markdown", "id": "49e6de6b-71bd-4948-8827-52601406058f", "metadata": {}, "source": [ "# Import Data with only the news
headline and details" ] }, { "cell_type": "code", "execution_count": 6, "id": "49222182-7811-4fa6-8c0a-21d3a546863e", "metadata": {}, "outputs": [], "source": [ "df = pd.read_parquet(\"data/processed_data.parquet\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "3fb59a30", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idHeadlineDetailsSeverityCategoryRegionDatetimeYearlatlonmaritime_labelfound_portscontains_port_infoif_labeledMonthWeekHeadline_Detailscleaned_Headline_Detailsbinary_Headline_Detailsword_count
01.0Grasberg Mine- Grasberg mine workers extend st...Media sources indicate that workers at the Gra...ModerateMine Workers StrikeIndonesia28/5/17 17:082017.0-4.05608137.11302False['freeport']1.0False5.021.0Grasberg Mine- Grasberg mine workers extend st...[grasberg, grasberg, mine, worker, extend, str...[worker_grasberg_mine]1
12.0Indonesia: Undersea internet cables damaged by...News sources are stating that recent typhoons ...MinorTravel WarningIndonesia4/9/17 14:302017.0NaNNaNFalse['hong kong']1.0False4.014.0Indonesia: Undersea internet cables damaged by...[indonesia, undersea, internet, cable, damage,...[undersea_internet_cable]1
23.0Shanghai port congestion impacts terminals in ...The persisting port congestion at Shanghai’s Y...MinorPort CongestionChina27/4/17 9:162017.029.52000121.33190True['ningbo', 'qingdao', 'shanghai']1.0False4.017.0Shanghai port congestion impacts terminals in ...[shanghai, port, congestion, impact, terminal,...[]0
34.0UPDATE - Indonesia: Explosion at KP Terminal i...Updated local media sources from Jakarta indic...ExtremeBombing, Police OperationsIndonesia24/5/17 15:152017.0-6.22465106.86700True['jakarta']1.0False5.021.0UPDATE - Indonesia: Explosion at KP Terminal i...[update, indonesia, explosion, at, kp, termina...[]0
45.0UPDATE - Indonesia: Police confirm two explosi...According to local police in Jakarta, two expl...ExtremeBombing, Police OperationsIndonesia24/5/17 16:202017.0NaNNaNTrue['jakarta']1.0True5.021.0UPDATE - Indonesia: Police confirm two explosi...[update, indonesia, police, confirm, two, expl...[]0
\n", "
" ], "text/plain": [ " id Headline \\\n", "0 1.0 Grasberg Mine- Grasberg mine workers extend st... \n", "1 2.0 Indonesia: Undersea internet cables damaged by... \n", "2 3.0 Shanghai port congestion impacts terminals in ... \n", "3 4.0 UPDATE - Indonesia: Explosion at KP Terminal i... \n", "4 5.0 UPDATE - Indonesia: Police confirm two explosi... \n", "\n", " Details Severity \\\n", "0 Media sources indicate that workers at the Gra... Moderate \n", "1 News sources are stating that recent typhoons ... Minor \n", "2 The persisting port congestion at Shanghai’s Y... Minor \n", "3 Updated local media sources from Jakarta indic... Extreme \n", "4 According to local police in Jakarta, two expl... Extreme \n", "\n", " Category Region Datetime Year lat \\\n", "0 Mine Workers Strike Indonesia 28/5/17 17:08 2017.0 -4.05608 \n", "1 Travel Warning Indonesia 4/9/17 14:30 2017.0 NaN \n", "2 Port Congestion China 27/4/17 9:16 2017.0 29.52000 \n", "3 Bombing, Police Operations Indonesia 24/5/17 15:15 2017.0 -6.22465 \n", "4 Bombing, Police Operations Indonesia 24/5/17 16:20 2017.0 NaN \n", "\n", " lon maritime_label found_ports \\\n", "0 137.11302 False ['freeport'] \n", "1 NaN False ['hong kong'] \n", "2 121.33190 True ['ningbo', 'qingdao', 'shanghai'] \n", "3 106.86700 True ['jakarta'] \n", "4 NaN True ['jakarta'] \n", "\n", " contains_port_info if_labeled Month Week \\\n", "0 1.0 False 5.0 21.0 \n", "1 1.0 False 4.0 14.0 \n", "2 1.0 False 4.0 17.0 \n", "3 1.0 False 5.0 21.0 \n", "4 1.0 True 5.0 21.0 \n", "\n", " Headline_Details \\\n", "0 Grasberg Mine- Grasberg mine workers extend st... \n", "1 Indonesia: Undersea internet cables damaged by... \n", "2 Shanghai port congestion impacts terminals in ... \n", "3 UPDATE - Indonesia: Explosion at KP Terminal i... \n", "4 UPDATE - Indonesia: Police confirm two explosi... \n", "\n", " cleaned_Headline_Details \\\n", "0 [grasberg, grasberg, mine, worker, extend, str... \n", "1 [indonesia, undersea, internet, cable, damage,... 
\n", "2 [shanghai, port, congestion, impact, terminal,... \n", "3 [update, indonesia, explosion, at, kp, termina... \n", "4 [update, indonesia, police, confirm, two, expl... \n", "\n", " binary_Headline_Details word_count \n", "0 [worker_grasberg_mine] 1 \n", "1 [undersea_internet_cable] 1 \n", "2 [] 0 \n", "3 [] 0 \n", "4 [] 0 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 8, "id": "09113e88-66cc-414c-a953-da04db83c4ae", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5778, 20)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] },
{ "cell_type": "code", "execution_count": 26, "id": "be6fc947", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DataFrames are identical: True\n" ] } ], "source": [ "# Optional sanity check: the copy kept under IS424_Data_Mining/ should match the\n", "# frame loaded from data/. The check is skipped when that copy is absent, so a\n", "# full re-run of the notebook does not fail on machines without it.\n", "reference_path = Path(\"IS424_Data_Mining/code/LDA/processed_data.parquet\")\n", "if reference_path.exists():\n", "    df_reference = pd.read_parquet(reference_path)\n", "    # Check if two DataFrames are exactly the same\n", "    are_identical = df.equals(df_reference)\n", "    print(f\"DataFrames are identical: {are_identical}\")\n", "else:\n", "    print(f\"Reference copy not found, skipping check: {reference_path}\")" ] },
{ "cell_type": "markdown", "id": "037e74fc-bbcd-43e3-8346-799920cca8d8", "metadata": {}, "source": [ "# Vectorisation" ] }, { "cell_type": "markdown", "id": "d67cef3a-59fb-4dd8-adc8-2cf288b90728", "metadata": {}, "source": [ "NLP vectorization refers to the process of converting text data into numerical vectors that machine learning algorithms can understand and process. \n", "\n", "This notebook uses a Bag-of-Words (BoW) representation, which describes each document by the unique tokens it contains and their frequencies. Each token is assigned an integer index, and a document's vector holds the count of each token present in that document."
] }, { "cell_type": "code", "execution_count": 14, "id": "c95b7b8a-9767-469d-812d-c9a9d9fee0e9", "metadata": {}, "outputs": [], "source": [ "cleaned = df.copy()" ] }, { "cell_type": "code", "execution_count": 15, "id": "de71c523-a59e-44b2-aa96-5f17d872c9c6", "metadata": {}, "outputs": [], "source": [ "headline = cleaned[\"binary_Headline_Details\"]" ] }, { "cell_type": "code", "execution_count": 16, "id": "5b1e34e1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['worker_grasberg_mine'], dtype=object)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "headline[0]" ] },
{ "cell_type": "code", "execution_count": 17, "id": "677055b4-978e-4253-90f4-3f903662e225", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Build the token-id dictionary, then encode every document as a bag of words\n", "doc_dict = corpora.Dictionary(headline)\n", "docs_vecs = [doc_dict.doc2bow(tokens) for tokens in headline]" ] }, { "cell_type": "code", "execution_count": 18, "id": "a54d1768-b069-4936-a156-deaf0b506d93", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of unique tokens: 5319\n", "Number of articles: 5778\n" ] } ], "source": [ "print(\"Number of unique tokens: %d\" % len(doc_dict))\n", "print(\"Number of articles: %d\" % len(docs_vecs))" ] },
{ "cell_type": "code", "execution_count": 19, "id": "9147fa86-1503-4252-bd9b-92fea1e6a926", "metadata": { "scrolled": true, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('due_dense_fog', 20),\n", " ('strong_wind', 19),\n", " ('indicate_average', 19),\n", " ('day_port', 17),\n", " ('san_antonio', 17),\n", " ('vessel_port', 16),\n", " ('low_visibility', 15),\n", " ('average_wait', 13),\n", " ('port_qingdao', 12),\n", " ('east_coast_parkway', 12),\n", " ('port_shanghai', 11),\n", " ('port_hong_kong', 11),\n", " ('severe_wind', 11),\n", " ('blank_week_service', 11),\n", " ('congestion_port_manila', 10),\n", " ('wait_hour', 10),\n",
('day_situation', 9),\n", " ('port_ningbo', 9),\n", " ('coastal_area', 9),\n", " ('strong_wind_forecast', 9),\n", " ('vessel_arrival', 8),\n", " ('high_wind', 8),\n", " ('pomeranian_voivodeship', 8),\n", " ('wait_day', 8),\n", " ('previous_week', 8),\n", " ('vessel_wait', 8),\n", " ('average_wait_port', 8),\n", " ('affect_operation_port', 7),\n", " ('berth_manila_south', 7),\n", " ('day_port_saigon', 7),\n", " ('vessel_port_hong', 7),\n", " ('vessel_port_shanghai', 7),\n", " ('high_risk_port', 7),\n", " ('wait_hour_port', 7),\n", " ('qianwan_container', 7),\n", " ('day_port_shanghai', 6),\n", " ('international_container', 6),\n", " ('tropical_storm', 6),\n", " ('arrive_window', 6),\n", " ('moderate_risk_port', 6),\n", " ('vessel_port_qingdao', 6),\n", " ('landside_rail_operation', 6),\n", " ('dp_world_southampton', 6),\n", " ('high_yard_density', 6),\n", " ('vessel_port_ningbo', 6),\n", " ('port_busan', 6),\n", " ('strong_wind_area', 6),\n", " ('new_york_city', 6),\n", " ('duration_closure', 6),\n", " ('vessel_berth_port', 5),\n", " ('wait_decrease_day', 5),\n", " ('disrupt_operation_port', 5),\n", " ('waterside_landside_operation', 5),\n", " ('congest_vessel', 5),\n", " ('port_charleston', 5),\n", " ('san_antonio_puerto', 5),\n", " ('yantian_shipping', 5),\n", " ('port_saigon', 5),\n", " ('port_ho_chi', 5),\n", " ('engine_failure', 5),\n", " ('port_hong', 5),\n", " ('dense_fog', 5),\n", " ('disrupt_port_operation', 5),\n", " ('high_wind_port', 5),\n", " ('vessel_wait_day', 5),\n", " ('vessel_port_tanjung', 5),\n", " ('landside_waterside_operation', 5),\n", " ('port_durban', 5),\n", " ('port_manila', 5),\n", " ('next_day_morning', 5),\n", " ('strong_wind_halt', 5),\n", " ('houston_ship_channel', 5),\n", " ('low_visibility_forecast', 5),\n", " ('full_port', 5),\n", " ('tropical_storm_hanna', 5),\n", " ('likely_port_closure', 5),\n", " ('strong_wind_industry', 5),\n", " ('hour_qingdao_qianwan', 5),\n", " ('antonio_halt_operation', 5),\n", " ('high_chance_port', 
4),\n", " ('pilot_service', 4),\n", " ('low_productivity', 4),\n", " ('heavy_rain', 4),\n", " ('resume_operation', 4),\n", " ('terminal_indicate', 4),\n", " ('port_fuzhou', 4),\n", " ('disrupt_port', 4),\n", " ('nhava_sheva', 4),\n", " ('berth_day', 4),\n", " ('congestion_port', 4),\n", " ('indicate_waterside_operation', 4),\n", " ('injury_damage_incident', 4),\n", " ('due_fog', 4),\n", " ('wait_vessel_port', 4),\n", " ('kong_industry', 4),\n", " ('hong_kong', 4),\n", " ('wait_vessel_berth', 4),\n", " ('terminal_port', 4),\n", " ('wait_vessel', 4),\n", " ('affect_port_operation', 4)]\n" ] } ], "source": [ "# Calculate word frequencies\n", "word_frequencies = {doc_dict[word_id]: freq for word_id, freq in doc_dict.cfs.items()}\n", "sorted_words = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)\n", "\n", "pprint(sorted_words[:100])" ] }, { "cell_type": "markdown", "id": "5ed78239-2ce1-4784-a8f4-4c7438c8627b", "metadata": {}, "source": [ "# LDA Modelling" ] }, { "cell_type": "markdown", "id": "9db83273-461d-4f70-b23f-ec967579d94f", "metadata": {}, "source": [ "## Benchmark Model" ] }, { "cell_type": "code", "execution_count": 20, "id": "e6d577bd-9936-4d45-be90-345af2eb4827", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Build LDA benchmark model\n", "lda_model = gensim.models.LdaMulticore(\n", " corpus=docs_vecs,\n", " id2word=doc_dict,\n", " num_topics=4,\n", " random_state=42,\n", " chunksize=100,\n", " passes=10,\n", " per_word_topics=True,\n", ")" ] }, { "cell_type": "code", "execution_count": 21, "id": "c4f1521f-5f43-40d2-a3a3-a8ac2ca6fec2", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(0,\n", " '0.003*\"indicate_average\" + 0.002*\"coastal_area\" + 0.002*\"vessel_port_hong\" '\n", " '+ 0.002*\"east_coast_parkway\" + 0.002*\"port_charleston\" + '\n", " '0.002*\"average_wait_port\" + 0.002*\"severe_wind\" + '\n", " '0.002*\"pomeranian_voivodeship\" + 0.002*\"engine_failure\" + '\n",
" '0.002*\"due_dense_fog\"'),\n", " (1,\n", " '0.005*\"san_antonio\" + 0.003*\"blank_week_service\" + 0.003*\"wait_hour\" + '\n", " '0.003*\"day_port_saigon\" + 0.002*\"low_visibility\" + 0.002*\"high_wind\" + '\n", " '0.002*\"waterside_landside_operation\" + 0.002*\"tropical_storm\" + '\n", " '0.002*\"qianwan_container\" + 0.002*\"port_ningbo\"'),\n", " (2,\n", " '0.004*\"strong_wind\" + 0.002*\"port_shanghai\" + 0.002*\"port_qingdao\" + '\n", " '0.002*\"vessel_port_qingdao\" + 0.002*\"day_port_shanghai\" + '\n", " '0.002*\"indicate_average\" + 0.002*\"disrupt_operation_port\" + '\n", " '0.002*\"strong_wind_area\" + 0.002*\"port_ho_chi\" + '\n", " '0.001*\"operation_pier_port\"'),\n", " (3,\n", " '0.004*\"port_hong_kong\" + 0.003*\"vessel_port\" + 0.003*\"day_port\" + '\n", " '0.003*\"congestion_port_manila\" + 0.002*\"berth_manila_south\" + '\n", " '0.002*\"arrive_window\" + 0.002*\"landside_rail_operation\" + '\n", " '0.002*\"international_container\" + 0.002*\"day_situation\" + '\n", " '0.002*\"congestion_port\"')]\n" ] } ], "source": [ "# Print the top 10 keywords of each topic\n", "pprint(lda_model.print_topics())\n", "doc_lda = lda_model[docs_vecs]" ] }, { "cell_type": "code", "execution_count": 22, "id": "fd57b1f4-a6cd-41e8-964f-d8a1d30aa3c9", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Coherence Score LDAModel: 0.7011993291597081\n" ] } ], "source": [ "# Compute Benchmark Coherence Score\n", "coherence_model_lda = CoherenceModel(\n", " model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n", ")\n", "coherence_lda = coherence_model_lda.get_coherence()\n", "print(\"\\nCoherence Score LDAModel: \", coherence_lda)" ] },
{ "cell_type": "code", "execution_count": null, "id": "152e5a3a-7afe-4fb8-a02f-d7492ad80936", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Compute Benchmark Perplexity\n", "# log_perplexity() returns the per-word likelihood bound (a log2 value), not the\n", "# perplexity itself; a bound closer to 0 is better.\n", "# (The previous run of this cell reported a bound of -9.594271136114548.)\n", "perplex = lda_model.log_perplexity(docs_vecs, total_docs=None)  # For LDAModel\n", "# Perplexity proper is 2 ** (-bound); for this number, lower is better.\n", "perplexity = 2 ** (-perplex)\n", "\n", "print(\"\\nPer-word likelihood bound for LDAModel: \", perplex)\n", "print(\"Perplexity for LDAModel: \", perplexity)" ] },
{ "cell_type": "code", "execution_count": null, "id": "7dd3a60a-5c6f-4249-9868-30528a5b0ac8", "metadata": {}, "outputs": [], "source": [ "# pprint, pyLDAvis and gensimvis come from the imports cell at the top of the notebook.\n", "# feed the LDA model into the pyLDAvis instance\n", "pyLDAvis.enable_notebook()\n", "visual = gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n", "\n", "# Save the output to the html file\n", "pyLDAvis.save_html(visual, \"topic_viz_benchmark.html\")" ] },
{ "cell_type": "code", "execution_count": 25, "id": "48bd84a0-ce9b-4117-bf6e-d9afb9936147", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Topic KeywordsTopic ID
00.003*\"indicate_average\" + 0.002*\"coastal_area\" + 0.002*\"vessel_port_hong\" + 0.002*\"east_coast_parkway\" + 0.002*\"port_charleston\" + 0.002*\"average_wait_port\"0
10.005*\"san_antonio\" + 0.003*\"blank_week_service\" + 0.003*\"wait_hour\" + 0.003*\"day_port_saigon\" + 0.002*\"low_visibility\" + 0.002*\"high_wind\"1
20.004*\"strong_wind\" + 0.002*\"port_shanghai\" + 0.002*\"port_qingdao\" + 0.002*\"vessel_port_qingdao\" + 0.002*\"day_port_shanghai\" + 0.002*\"indicate_average\"2
30.004*\"port_hong_kong\" + 0.003*\"vessel_port\" + 0.003*\"day_port\" + 0.003*\"congestion_port_manila\" + 0.002*\"berth_manila_south\" + 0.002*\"arrive_window\"3
\n", "
" ], "text/plain": [ " Topic Keywords \\\n", "0 0.003*\"indicate_average\" + 0.002*\"coastal_area\" + 0.002*\"vessel_port_hong\" + 0.002*\"east_coast_parkway\" + 0.002*\"port_charleston\" + 0.002*\"average_wait_port\" \n", "1 0.005*\"san_antonio\" + 0.003*\"blank_week_service\" + 0.003*\"wait_hour\" + 0.003*\"day_port_saigon\" + 0.002*\"low_visibility\" + 0.002*\"high_wind\" \n", "2 0.004*\"strong_wind\" + 0.002*\"port_shanghai\" + 0.002*\"port_qingdao\" + 0.002*\"vessel_port_qingdao\" + 0.002*\"day_port_shanghai\" + 0.002*\"indicate_average\" \n", "3 0.004*\"port_hong_kong\" + 0.003*\"vessel_port\" + 0.003*\"day_port\" + 0.003*\"congestion_port_manila\" + 0.002*\"berth_manila_south\" + 0.002*\"arrive_window\" \n", "\n", " Topic ID \n", "0 0 \n", "1 1 \n", "2 2 \n", "3 3 " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.set_option(\"display.max_colwidth\", 200)\n", "# Get the topics and their top keywords into a dataframe\n", "topics = lda_model.show_topics(num_words=6)\n", "\n", "# Build the frame in one constructor call (indexed by topic id) instead of\n", "# growing an empty frame one cell at a time.\n", "topic_keywords = pd.DataFrame(\n", "    {\"Topic Keywords\": [keywords for _, keywords in topics]},\n", "    index=[topic_id for topic_id, _ in topics],\n", ")\n", "topic_keywords[\"Topic ID\"] = topic_keywords.index\n", "topic_keywords" ] },
{ "cell_type": "markdown", "id": "3247fe12", "metadata": {}, "source": [ "## Conclusion\n", "\n", "No significant insights were gained from the benchmark model: its keywords do not form identifiable topics. More data is probably needed, so the next step is to scrape the full news content after the mid-term." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }