{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "46322fb5-5918-4b70-9689-9e0781439ac4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents\n", "loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "import sys\n", "from pathlib import Path\n", "\n", "workding_dir = str(Path.cwd().parent)\n", "os.chdir(workding_dir)\n", "sys.path.append(workding_dir)\n", "print(\"workding dir:\", workding_dir)\n", "\n", "from dotenv import find_dotenv, load_dotenv\n", "\n", "found_dotenv = find_dotenv(\".env\")\n", "\n", "if len(found_dotenv) == 0:\n", " found_dotenv = find_dotenv(\".env.example\")\n", "print(f\"loading env vars from: {found_dotenv}\")\n", "load_dotenv(found_dotenv, override=True)" ] }, { "cell_type": "code", "execution_count": 2, "id": "daf1e3d1-75ac-4299-8bed-2f413a49f9a6", "metadata": { "tags": [] }, "outputs": [], "source": [ "import nltk\n", "from nltk.tokenize import sent_tokenize\n", "from nltk.tokenize import word_tokenize\n", "\n", "import gensim\n", "from gensim import corpora\n", "from gensim import similarities\n", "from gensim import models\n", "from gensim.models import CoherenceModel\n", "\n", "# from wordcloud import WordCloud, ImageColorGenerator\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import pandas as pd\n", "import re\n", "import os\n", "import datetime\n", "\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "from pprint import pprint\n", "import pyLDAvis\n", "import pyLDAvis.gensim_models as gensimvis" ] }, { "cell_type": "code", "execution_count": 3, "id": "c673c907-e1d8-4d64-9a73-c15c15b78e7f", "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "2024-06-30 15:39:16.255404\n" ] } ], "source": [ "print(datetime.datetime.now())" ] }, { "cell_type": "markdown", "id": "49e6de6b-71bd-4948-8827-52601406058f", "metadata": {}, "source": [ "# Import the data with full news content" ] }, { "cell_type": "code", "execution_count": 4, "id": "49222182-7811-4fa6-8c0a-21d3a546863e", "metadata": {}, "outputs": [], "source": [ "df = pd.read_parquet('data/processed_data2.parquet')" ] }, { "cell_type": "code", "execution_count": 5, "id": "3fb59a30", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idHeadlineDetailsSeverityCategoryRegionDatetimeYearlatlon...if_labeledMonthWeekHeadline_Detailsurltitlecontentcleaned_contentbinary_contentword_count
01Grasberg Mine- Grasberg mine workers extend st...Media sources indicate that workers at the Gra...ModerateMine Workers StrikeIndonesia28/5/17 17:082017.0-4.05608137.11302...False5.021.0Grasberg Mine- Grasberg mine workers extend st...https://news.google.com/rss/articles/CBMiZ2h0d...Freeport Indonesia mine workers extend strike ...Trucks are seen on a road in the Grasberg copp...[truck, be, see, on, road, in, grasberg, coppe...[adkerson_jakarta_try, agreement_freeport_indo...53
13Shanghai port congestion impacts terminals in ...The persisting port congestion at Shanghai’s Y...MinorPort CongestionChina27/4/17 9:162017.029.52000121.33190...False4.017.0Shanghai port congestion impacts terminals in ...https://news.google.com/rss/articles/CBMiVWh0d...Typhoon Muifa to shut China ports for second t...By Sam Whelan 13/09/2022\\n\\nAnother typhoon ha...[by, sam, whelan, typhoon, have, prompt, port,...[additional_ripple_effect, avoid_path_typhoon,...44
25UPDATE - Indonesia: Police confirm two explosi...According to local police in Jakarta, two expl...ExtremeBombing, Police OperationsIndonesia24/5/17 16:202017.0NaNNaN...True5.021.0UPDATE - Indonesia: Police confirm two explosi...https://news.google.com/rss/articles/CBMiZWh0d...Jakarta Police Receive 2 More Reports on Coldp...TEMPO.CO, Jakarta - South Jakarta Metro Police...[jakarta, south, jakarta, metro, police, recei...[actress_accord, available_day_concert, click_...24
36UPDATE - Indonesia: Severe winds damage infras...Severe winds have downed billboards and trees ...ModerateRoadway Closure / Disruption, Flooding, Severe...Indonesia19/4/17 9:102017.0-6.91264107.65700...True4.016.0UPDATE - Indonesia: Severe winds damage infras...https://news.google.com/rss/articles/CBMiSWh0d...Indonesia hit by some of strongest winds recordedA man stands near damaged houses following a t...[man, stand, near, damage, house, follow, torn...[bbc_indonesia, climatologist_government_resea...28
4142 miles E of Chesterfield - A tornado has touc...Government sources are reporting a tornado has...MinorTornadoUnited States17/9/18 19:552018.037.51000-77.61000...True9.038.02 miles E of Chesterfield - A tornado has touc...https://news.google.com/rss/articles/CBMigAFod...UPDATE: Number of homes without power down to ...More than 90,000 homes and businesses across t...[more, than, home, business, across, richmond,...[advise_seek_alternate, affect_richmond, alter...134
\n", "

5 rows × 23 columns

\n", "
" ], "text/plain": [ " id Headline \\\n", "0 1 Grasberg Mine- Grasberg mine workers extend st... \n", "1 3 Shanghai port congestion impacts terminals in ... \n", "2 5 UPDATE - Indonesia: Police confirm two explosi... \n", "3 6 UPDATE - Indonesia: Severe winds damage infras... \n", "4 14 2 miles E of Chesterfield - A tornado has touc... \n", "\n", " Details Severity \\\n", "0 Media sources indicate that workers at the Gra... Moderate \n", "1 The persisting port congestion at Shanghai’s Y... Minor \n", "2 According to local police in Jakarta, two expl... Extreme \n", "3 Severe winds have downed billboards and trees ... Moderate \n", "4 Government sources are reporting a tornado has... Minor \n", "\n", " Category Region \\\n", "0 Mine Workers Strike Indonesia \n", "1 Port Congestion China \n", "2 Bombing, Police Operations Indonesia \n", "3 Roadway Closure / Disruption, Flooding, Severe... Indonesia \n", "4 Tornado United States \n", "\n", " Datetime Year lat lon ... if_labeled Month Week \\\n", "0 28/5/17 17:08 2017.0 -4.05608 137.11302 ... False 5.0 21.0 \n", "1 27/4/17 9:16 2017.0 29.52000 121.33190 ... False 4.0 17.0 \n", "2 24/5/17 16:20 2017.0 NaN NaN ... True 5.0 21.0 \n", "3 19/4/17 9:10 2017.0 -6.91264 107.65700 ... True 4.0 16.0 \n", "4 17/9/18 19:55 2018.0 37.51000 -77.61000 ... True 9.0 38.0 \n", "\n", " Headline_Details \\\n", "0 Grasberg Mine- Grasberg mine workers extend st... \n", "1 Shanghai port congestion impacts terminals in ... \n", "2 UPDATE - Indonesia: Police confirm two explosi... \n", "3 UPDATE - Indonesia: Severe winds damage infras... \n", "4 2 miles E of Chesterfield - A tornado has touc... \n", "\n", " url \\\n", "0 https://news.google.com/rss/articles/CBMiZ2h0d... \n", "1 https://news.google.com/rss/articles/CBMiVWh0d... \n", "2 https://news.google.com/rss/articles/CBMiZWh0d... \n", "3 https://news.google.com/rss/articles/CBMiSWh0d... \n", "4 https://news.google.com/rss/articles/CBMigAFod... 
\n", "\n", " title \\\n", "0 Freeport Indonesia mine workers extend strike ... \n", "1 Typhoon Muifa to shut China ports for second t... \n", "2 Jakarta Police Receive 2 More Reports on Coldp... \n", "3 Indonesia hit by some of strongest winds recorded \n", "4 UPDATE: Number of homes without power down to ... \n", "\n", " content \\\n", "0 Trucks are seen on a road in the Grasberg copp... \n", "1 By Sam Whelan 13/09/2022\\n\\nAnother typhoon ha... \n", "2 TEMPO.CO, Jakarta - South Jakarta Metro Police... \n", "3 A man stands near damaged houses following a t... \n", "4 More than 90,000 homes and businesses across t... \n", "\n", " cleaned_content \\\n", "0 [truck, be, see, on, road, in, grasberg, coppe... \n", "1 [by, sam, whelan, typhoon, have, prompt, port,... \n", "2 [jakarta, south, jakarta, metro, police, recei... \n", "3 [man, stand, near, damage, house, follow, torn... \n", "4 [more, than, home, business, across, richmond,... \n", "\n", " binary_content word_count \n", "0 [adkerson_jakarta_try, agreement_freeport_indo... 53 \n", "1 [additional_ripple_effect, avoid_path_typhoon,... 44 \n", "2 [actress_accord, available_day_concert, click_... 24 \n", "3 [bbc_indonesia, climatologist_government_resea... 28 \n", "4 [advise_seek_alternate, affect_richmond, alter... 
134 \n", "\n", "[5 rows x 23 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 6, "id": "09113e88-66cc-414c-a953-da04db83c4ae", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(3681, 23)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "markdown", "id": "037e74fc-bbcd-43e3-8346-799920cca8d8", "metadata": {}, "source": [ "# Vectorisation" ] }, { "cell_type": "markdown", "id": "d67cef3a-59fb-4dd8-adc8-2cf288b90728", "metadata": {}, "source": [ "NLP vectorization refers to the process of converting text data into numerical vectors that machine learning algorithms can understand and process. \n", "\n", "Bag-of-Words (BoW) is used here that represents text as a collection of unique words along with their frequencies. Each word is assigned an index, and the vector contains the count of each word present in the document." 
] }, { "cell_type": "code", "execution_count": 7, "id": "c95b7b8a-9767-469d-812d-c9a9d9fee0e9", "metadata": {}, "outputs": [], "source": [ "df_copy = df.copy()" ] }, { "cell_type": "code", "execution_count": 8, "id": "dfb2001e-04c1-49dc-b423-a64ea47af5a9", "metadata": {}, "outputs": [], "source": [ "# choose only the extreme and severe cases for modelling\n", "cleaned = df_copy[df_copy['Severity'].isin(['Extreme', 'Severe'])]\n", "cleaned.reset_index(drop=True, inplace=True)" ] }, { "cell_type": "code", "execution_count": 9, "id": "de71c523-a59e-44b2-aa96-5f17d872c9c6", "metadata": {}, "outputs": [], "source": [ "headline = cleaned.binary_content" ] }, { "cell_type": "code", "execution_count": 10, "id": "5b1e34e1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['number_container', 'accord_detective_llamas', 'anyone_talk_crime',\n", " 'arizmendi_girlfriend_become', 'auto_theft_robbery',\n", " 'clothing_makeup_shoe', 'clue_loot', 'decode_container_stack',\n", " 'detective_chavez', 'detective_put', 'electric_bicycle',\n", " 'empire_farther_south', 'freight_train_repeat',\n", " 'google_placard_lock', 'hard_drive_tablet', 'homicide_drug_gang',\n", " 'inside_container_secure', 'llama_straight_tell',\n", " 'llama_work_connie', 'metal_lock_size', 'mile_east',\n", " 'motel_room_storage', 'plentiful_tv_beer',\n", " 'southern_california_couple', 'succumb_bolt_cutter', 'sure_sign',\n", " 'upgraded_lock'], dtype=object)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "headline[5]" ] }, { "cell_type": "code", "execution_count": 11, "id": "677055b4-978e-4253-90f4-3f903662e225", "metadata": { "tags": [] }, "outputs": [], "source": [ "# vectorise the words\n", "doc_dict = gensim.corpora.Dictionary(headline)\n", "docs_vecs = [doc_dict.doc2bow(doc) for doc in headline]" ] }, { "cell_type": "code", "execution_count": 12, "id": "a54d1768-b069-4936-a156-deaf0b506d93", "metadata": { "tags": [] }, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "Number of unique tokens: 30464\n", "Number of articles: 300\n" ] } ], "source": [ "print('Number of unique tokens: %d' % len(doc_dict)) \n", "print('Number of articles: %d' % len(docs_vecs)) " ] }, { "cell_type": "code", "execution_count": 13, "id": "9147fa86-1503-4252-bd9b-92fea1e6a926", "metadata": { "scrolled": true, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('heavy_rain', 15),\n", " ('global_supply_chain', 15),\n", " ('national_hurricane_center', 13),\n", " ('heavy_rainfall', 12),\n", " ('port_los', 12),\n", " ('hong_kong', 12),\n", " ('united_state', 11),\n", " ('critical_destination_port', 11),\n", " ('global_port_tracker', 11),\n", " ('global_shipping_disruption', 11),\n", " ('sign_confidence_consumer', 11),\n", " ('upgrade_import_forecast', 11),\n", " ('national_weather_service', 10),\n", " ('social_medium', 10),\n", " ('moment_exception_request', 10),\n", " ('sorry_site', 10),\n", " ('technical_difficulty_please', 10),\n", " ('trade_statement', 9),\n", " ('tropical_storm', 9),\n", " ('help_business', 9),\n", " ('meet_firm', 9),\n", " ('website_see_service', 9),\n", " ('supply_chain', 8),\n", " ('strong_wind', 8),\n", " ('coastal_area', 7),\n", " ('geological_survey', 7),\n", " ('asian_country', 7),\n", " ('day_trade_asia', 7),\n", " ('global_demand', 7),\n", " ('global_economy', 7),\n", " ('high_yard_density', 7),\n", " ('inch_rain', 7),\n", " ('coast_port', 7),\n", " ('empty_container', 7),\n", " ('union_worker', 7),\n", " ('many_area', 6),\n", " ('customer_demand', 6),\n", " ('economic_growth', 6),\n", " ('free_day', 6),\n", " ('full_network', 6),\n", " ('import_volume', 6),\n", " ('major_economy', 6),\n", " ('negative_impact', 6),\n", " ('vertical_insight', 6),\n", " ('supply_chain_issue', 6),\n", " ('economic_recovery', 6),\n", " ('death_toll', 6),\n", " ('new_york_city', 6),\n", " ('america_trade_statement', 6),\n", " ('average_day', 6),\n", " ('late_issue', 
5),\n", " ('paul_brashier_vice', 5),\n", " ('president_drayage', 5),\n", " ('strike_action', 5),\n", " ('large_number', 5),\n", " ('current_situation', 5),\n", " ('high_inflation', 5),\n", " ('severe_weather_event', 5),\n", " ('accurate_quote_market', 5),\n", " ('adapt_supply_chain', 5),\n", " ('america_space', 5),\n", " ('apapa_tin_tema', 5),\n", " ('asia_day', 5),\n", " ('china_area_chb', 5),\n", " ('china_area_warehouse', 5),\n", " ('company_face', 5),\n", " ('critical_resource', 5),\n", " ('date_change', 5),\n", " ('energy_price', 5),\n", " ('energy_price_fall', 5),\n", " ('export_volume', 5),\n", " ('fourth_quarter', 5),\n", " ('full_truck', 5),\n", " ('future_consumer_demand', 5),\n", " ('general_administration_custom', 5),\n", " ('high_flexibility_transit', 5),\n", " ('high_interest_rate', 5),\n", " ('high_inventory', 5),\n", " ('increase_capacity', 5),\n", " ('india_freight_cost', 5),\n", " ('indirect_service', 5),\n", " ('january_trade_maersk', 5),\n", " ('load_truck_box', 5),\n", " ('low_confidence', 5),\n", " ('low_figure', 5),\n", " ('main_product_continue', 5),\n", " ('main_route_area', 5),\n", " ('many_company', 5),\n", " ('market_average', 5),\n", " ('matadi_cape_town', 5),\n", " ('mile_service', 5),\n", " ('monetary_fund', 5),\n", " ('new_air', 5),\n", " ('new_law_limit', 5),\n", " ('new_sea_rail', 5),\n", " ('ocean_market', 5),\n", " ('ocean_network', 5),\n", " ('ok_day', 5),\n", " ('operational_disruption', 5),\n", " ('relevant_rate', 5)]\n" ] } ], "source": [ "# Calculate word frequencies\n", "word_frequencies = {doc_dict[word_id]: freq for word_id, freq in doc_dict.cfs.items()}\n", "sorted_words = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)\n", "\n", "pprint(sorted_words[:100])" ] }, { "cell_type": "markdown", "id": "5ed78239-2ce1-4784-a8f4-4c7438c8627b", "metadata": {}, "source": [ "# LDA Modelling" ] }, { "cell_type": "markdown", "id": "aacc2eb2-bce9-462f-b048-cc26baa2383d", "metadata": {}, "source": [ "We initially 
selected a fixed topic number for model pipeline development and benchmark model setup. Then we used the full dataset for fine-tuning and evaluation." ] }, { "cell_type": "markdown", "id": "9db83273-461d-4f70-b23f-ec967579d94f", "metadata": {}, "source": [ "## Benchmark Model" ] }, { "cell_type": "code", "execution_count": 14, "id": "e6d577bd-9936-4d45-be90-345af2eb4827", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Build LDA benchmark model\n", "lda_model = gensim.models.LdaMulticore(corpus=docs_vecs,\n", " id2word=doc_dict,\n", " num_topics=4, \n", " random_state=42,\n", " chunksize=100,\n", " passes=10,\n", " per_word_topics=True)" ] }, { "cell_type": "code", "execution_count": 15, "id": "c4f1521f-5f43-40d2-a3a3-a8ac2ca6fec2", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(0,\n", " '0.001*\"technical_difficulty_please\" + 0.001*\"moment_exception_request\" + '\n", " '0.001*\"sorry_site\" + 0.001*\"heavy_rain\" + 0.000*\"heavy_rainfall\" + '\n", " '0.000*\"port_los\" + 0.000*\"national_weather_service\" + 0.000*\"coast_port\" + '\n", " '0.000*\"united_state\" + 0.000*\"empty_container\"'),\n", " (1,\n", " '0.001*\"upgrade_import_forecast\" + 0.001*\"sign_confidence_consumer\" + '\n", " '0.001*\"global_shipping_disruption\" + 0.001*\"global_port_tracker\" + '\n", " '0.001*\"meet_firm\" + 0.001*\"website_see_service\" + 0.001*\"help_business\" + '\n", " '0.000*\"national_hurricane_center\" + 0.000*\"passenger_service\" + '\n", " '0.000*\"hong_kong\"'),\n", " (2,\n", " '0.001*\"global_supply_chain\" + 0.000*\"negative_impact\" + '\n", " '0.000*\"critical_destination_port\" + 0.000*\"trade_statement\" + '\n", " '0.000*\"warm_winter_china\" + 0.000*\"import_volume\" + 0.000*\"global_demand\" + '\n", " '0.000*\"day_trade_asia\" + 0.000*\"slow_react\" + 0.000*\"full_network\"'),\n", " (3,\n", " '0.000*\"critical_destination_port\" + 0.000*\"social_medium\" + '\n", " '0.000*\"meteorological_agency\" + 
0.000*\"hong_kong\" + 0.000*\"many_area\" + '\n", " '0.000*\"trade_statement\" + 0.000*\"prime_minister\" + '\n", " '0.000*\"america_trade_statement\" + 0.000*\"oakland_day\" + '\n", " '0.000*\"ready_rescue_operation\"')]\n" ] } ], "source": [ "from pprint import pprint\n", "\n", "# Print the Keyword in the 10 topics\n", "pprint(lda_model.print_topics())\n", "doc_lda = lda_model[docs_vecs]" ] }, { "cell_type": "code", "execution_count": 17, "id": "fd57b1f4-a6cd-41e8-964f-d8a1d30aa3c9", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Coherence Score LDAModel: 0.37181231277776183\n", "CPU times: user 216 ms, sys: 147 ms, total: 364 ms\n", "Wall time: 8min 58s\n" ] } ], "source": [ "%%time\n", "\n", "# Compute Benchmark Coherence Score\n", "coherence_model_lda = CoherenceModel(model=lda_model, texts=headline, dictionary=doc_dict, coherence='c_v')\n", "coherence_lda = coherence_model_lda.get_coherence()\n", "print('\\nCoherence Score LDAModel: ', coherence_lda)" ] }, { "cell_type": "code", "execution_count": 18, "id": "152e5a3a-7afe-4fb8-a02f-d7492ad80936", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Perplexity for LDAModel: -10.57378514568444\n" ] } ], "source": [ "# Compute Benchmark Perplexity\n", "perplex= lda_model.log_perplexity(docs_vecs, total_docs=None) #For LDAModel\n", " # a measure of how good the model is. 
lower the better.\n", "\n", "print('\\nPerplexity for LDAModel: ', perplex)" ] }, { "cell_type": "code", "execution_count": 19, "id": "7dd3a60a-5c6f-4249-9868-30528a5b0ac8", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may 
lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This 
process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n" ] } ], "source": [ "from pprint import pprint\n", "import pyLDAvis\n", "import pyLDAvis.gensim_models as gensimvis\n", "\n", "# feed the LDA model into the pyLDAvis instance\n", "pyLDAvis.enable_notebook()\n", "visual= gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n", "\n", "#Save the output to the html file\n", "pyLDAvis.save_html(visual, \"data/topic_viz_benchmark_severe.html\")" ] }, { "cell_type": "code", "execution_count": 20, "id": "3a5612f7-6358-49c8-aba9-8aa54e275c6f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Topic KeywordsTopic ID
00.001*\"technical_difficulty_please\" + 0.001*\"moment_exception_request\" + 0.001*\"sorry_site\" + 0.001*\"heavy_rain\" + 0.000*\"heavy_rainfall\" + 0.000*\"port_los\"0
10.001*\"upgrade_import_forecast\" + 0.001*\"sign_confidence_consumer\" + 0.001*\"global_shipping_disruption\" + 0.001*\"global_port_tracker\" + 0.001*\"meet_firm\" + 0.001*\"website_see_service\"1
20.001*\"global_supply_chain\" + 0.000*\"negative_impact\" + 0.000*\"critical_destination_port\" + 0.000*\"trade_statement\" + 0.000*\"warm_winter_china\" + 0.000*\"import_volume\"2
30.000*\"critical_destination_port\" + 0.000*\"social_medium\" + 0.000*\"meteorological_agency\" + 0.000*\"hong_kong\" + 0.000*\"many_area\" + 0.000*\"trade_statement\"3
\n", "
" ], "text/plain": [ " Topic Keywords \\\n", "0 0.001*\"technical_difficulty_please\" + 0.001*\"moment_exception_request\" + 0.001*\"sorry_site\" + 0.001*\"heavy_rain\" + 0.000*\"heavy_rainfall\" + 0.000*\"port_los\" \n", "1 0.001*\"upgrade_import_forecast\" + 0.001*\"sign_confidence_consumer\" + 0.001*\"global_shipping_disruption\" + 0.001*\"global_port_tracker\" + 0.001*\"meet_firm\" + 0.001*\"website_see_service\" \n", "2 0.001*\"global_supply_chain\" + 0.000*\"negative_impact\" + 0.000*\"critical_destination_port\" + 0.000*\"trade_statement\" + 0.000*\"warm_winter_china\" + 0.000*\"import_volume\" \n", "3 0.000*\"critical_destination_port\" + 0.000*\"social_medium\" + 0.000*\"meteorological_agency\" + 0.000*\"hong_kong\" + 0.000*\"many_area\" + 0.000*\"trade_statement\" \n", "\n", " Topic ID \n", "0 0 \n", "1 1 \n", "2 2 \n", "3 3 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.set_option('max_colwidth', 200)\n", "# Get the topics and their top keywords into a dataframe\n", "topics = lda_model.show_topics(num_words=6) \n", "\n", "topic_keywords = pd.DataFrame()\n", "for topic_id, topic in topics:\n", " topic_keywords.at[topic_id, 'Topic Keywords'] = topic\n", "\n", "topic_keywords['Topic ID'] = topic_keywords.index\n", "topic_keywords" ] }, { "cell_type": "code", "execution_count": 21, "id": "26da4eea-06a0-4ff7-ae14-2f40fa0db04b", "metadata": {}, "outputs": [], "source": [ "# break " ] }, { "cell_type": "markdown", "id": "1895598f-3e5f-4acd-83a6-4491cc90f695", "metadata": {}, "source": [ "# Hyper-Perameter Tuning and Evaluation" ] }, { "cell_type": "markdown", "id": "47136c89-ff7b-4ac9-840f-04122fe62160", "metadata": {}, "source": [ "Run the cells below only for re-modelling with new datasets, the whole tuning and evaluation process may take hours to run." 
] }, { "cell_type": "code", "execution_count": 22, "id": "c79ca5c4-e078-43ce-a430-8c1ed93dcd64", "metadata": {}, "outputs": [], "source": [ "# hyper-perameter tuning (alpha and beta)\n", "def compute_coherence_values(corpus, dictionary, k, a, b):\n", " \n", " lda_model = gensim.models.LdaMulticore(corpus=corpus,\n", " id2word=dictionary,\n", " num_topics=k, \n", " random_state=42,\n", " chunksize=100,\n", " passes=10,\n", " alpha=a,\n", " eta=b)\n", " \n", " coherence_model_lda = CoherenceModel(model=lda_model, texts=headline, dictionary=doc_dict, coherence='c_v')\n", " coherence = coherence_model_lda.get_coherence()\n", " perplex = lda_model.log_perplexity(docs_vecs, total_docs=None) \n", " \n", " return coherence, perplex" ] }, { "cell_type": "code", "execution_count": 23, "id": "1c3c8478-9336-40f2-bb30-a37db4243b67", "metadata": {}, "outputs": [], "source": [ "# setup\n", "import numpy as np\n", "\n", "from gensim.models import CoherenceModel\n", "\n", "model_list = []\n", "coherence_values = []\n", "perplexity_values = []\n", "model_topics = []\n", "alpha_result = []\n", "beta_result = []\n", "\n", "# topic ranges\n", "num_topics = range(4, 13)\n", "\n", "# Alpha parameter\n", "alpha = list(np.arange(0.31, 1, 0.3))\n", "alpha.append('symmetric')\n", "alpha.append('asymmetric')\n", "\n", "# Beta parameter\n", "beta = list(np.arange(0.31, 1, 0.3))\n", "beta.append('symmetric')" ] }, { "cell_type": "markdown", "id": "c7e6bc53-0b57-4858-879a-644eca54ddbc", "metadata": {}, "source": [ "Rational behind the alpha and eta: https://stats.stackexchange.com/questions/37405/natural-interpretation-for-lda-hyperparameters" ] }, { "cell_type": "code", "execution_count": 24, "id": "02877b81-32df-4168-8e62-4cbca2be100b", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Topic range: range(4, 13)\n", "Alpha: [0.31, 0.61, 0.9099999999999999, 'symmetric', 'asymmetric']\n", "Beta: [0.31, 0.61, 0.9099999999999999, 'symmetric']\n" ] } ], 
"source": [ "print(\"Topic range: \",num_topics)\n", "print(\"Alpha: \",alpha)\n", "print(\"Beta: \", beta)" ] }, { "cell_type": "code", "execution_count": 26, "id": "3c1f703c-4778-467f-a12e-0c18eeb274c5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2024-06-30 15:56:56.953954\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "#Topics: 4, CV Score: 0.3720156705867761, PV Score: -10.531015192970104, Alpha: 0.31, Beta: 0.31\n", "#Topics: 5, CV Score: 0.5104309491692648, PV Score: -10.501295581191243, Alpha: 0.31, Beta: 0.31\n", "#Topics: 6, CV Score: 0.4577416605401658, PV Score: -10.460472706228693, Alpha: 0.31, Beta: 0.31\n", "#Topics: 7, CV Score: 0.47905186758060786, PV Score: -10.45017220690116, Alpha: 0.31, Beta: 0.31\n", "#Topics: 8, CV Score: 0.5170931021465908, PV Score: -10.435198097218018, Alpha: 0.31, Beta: 0.31\n", "#Topics: 9, CV Score: 0.5418406648591022, PV Score: -10.41813577221336, Alpha: 0.31, Beta: 0.31\n", "#Topics: 10, CV Score: 0.6289688892634311, PV Score: -10.40650838076674, Alpha: 0.31, Beta: 0.31\n", "#Topics: 11, CV Score: 0.615861099169618, PV Score: -10.400571085548444, Alpha: 0.31, Beta: 0.31\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Traceback (most recent call last):\n", " File \"\", line 1, in \n", " File \"/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/multiprocessing/spawn.py\", line 122, in spawn_main\n", " exitcode = _main(fd, parent_sentinel)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/multiprocessing/spawn.py\", line 132, in _main\n", " self = reduction.pickle.load(from_parent)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", "_pickle.UnpicklingError: pickle data was truncated\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "#Topics: 12, CV Score: 0.5821904373179804, PV Score: -10.405657307982493, Alpha: 0.31, Beta: 0.31\n", "#Topics: 4, CV Score: 0.39197924000152073, PV Score: 
-10.431108130993747, Alpha: 0.31, Beta: 0.61\n", "#Topics: 5, CV Score: 0.5158552240339984, PV Score: -10.415847323918724, Alpha: 0.31, Beta: 0.61\n", "#Topics: 6, CV Score: 0.4398095546006567, PV Score: -10.395709424729047, Alpha: 0.31, Beta: 0.61\n", "#Topics: 7, CV Score: 0.4759542844436549, PV Score: -10.390971943426882, Alpha: 0.31, Beta: 0.61\n", "#Topics: 8, CV Score: 0.5228046057671669, PV Score: -10.390099173623508, Alpha: 0.31, Beta: 0.61\n", "#Topics: 9, CV Score: 0.534380007483663, PV Score: -10.383173065174224, Alpha: 0.31, Beta: 0.61\n", "#Topics: 10, CV Score: 0.601346262577239, PV Score: -10.38283548973593, Alpha: 0.31, Beta: 0.61\n", "#Topics: 11, CV Score: 0.6182871521157967, PV Score: -10.381370037404881, Alpha: 0.31, Beta: 0.61\n", "#Topics: 12, CV Score: 0.6301666636692548, PV Score: -10.388156880830003, Alpha: 0.31, Beta: 0.61\n", "#Topics: 4, CV Score: 0.3906162608542371, PV Score: -10.401723128237816, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 5, CV Score: 0.5939884088380677, PV Score: -10.387492626618993, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 6, CV Score: 0.4508756651213514, PV Score: -10.37800267053485, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 7, CV Score: 0.5023247846073803, PV Score: -10.378610286427143, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 8, CV Score: 0.5338912827400732, PV Score: -10.382219717893106, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 9, CV Score: 0.5350215871400255, PV Score: -10.378352577214379, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 10, CV Score: 0.5843943719668163, PV Score: -10.379151839595481, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 11, CV Score: 0.6022597255391141, PV Score: -10.381816469366692, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 12, CV Score: 0.6549152327008515, PV Score: -10.3871327637851, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 4, CV Score: 0.38133564373441475, PV Score: -10.57910573063879, Alpha: 0.31, 
Beta: symmetric\n", "#Topics: 5, CV Score: 0.504405181495512, PV Score: -10.591716275575331, Alpha: 0.31, Beta: symmetric\n", "#Topics: 6, CV Score: 0.4434166489578631, PV Score: -10.577016836067129, Alpha: 0.31, Beta: symmetric\n", "#Topics: 7, CV Score: 0.4650127509674876, PV Score: -10.59307567259223, Alpha: 0.31, Beta: symmetric\n", "#Topics: 8, CV Score: 0.48085842979187243, PV Score: -10.595749214509778, Alpha: 0.31, Beta: symmetric\n", "#Topics: 9, CV Score: 0.5229987979769645, PV Score: -10.572445904678828, Alpha: 0.31, Beta: symmetric\n", "#Topics: 10, CV Score: 0.6011951298180502, PV Score: -10.580254153107184, Alpha: 0.31, Beta: symmetric\n", "#Topics: 11, CV Score: 0.5816514196292673, PV Score: -10.572981018751022, Alpha: 0.31, Beta: symmetric\n", "#Topics: 12, CV Score: 0.54353358800824, PV Score: -10.58723522762571, Alpha: 0.31, Beta: symmetric\n", "#Topics: 4, CV Score: 0.3747888014696905, PV Score: -10.553273801572038, Alpha: 0.61, Beta: 0.31\n", "#Topics: 5, CV Score: 0.41949099152429126, PV Score: -10.529284579653147, Alpha: 0.61, Beta: 0.31\n", "#Topics: 6, CV Score: 0.42792555699640306, PV Score: -10.493600231954503, Alpha: 0.61, Beta: 0.31\n", "#Topics: 7, CV Score: 0.4414682530110495, PV Score: -10.49167181275119, Alpha: 0.61, Beta: 0.31\n", "#Topics: 8, CV Score: 0.40257655373677936, PV Score: -10.480723050906821, Alpha: 0.61, Beta: 0.31\n", "#Topics: 9, CV Score: 0.46588662192698593, PV Score: -10.462647038151925, Alpha: 0.61, Beta: 0.31\n", "#Topics: 10, CV Score: 0.6342207771608466, PV Score: -10.459408243710644, Alpha: 0.61, Beta: 0.31\n", "#Topics: 11, CV Score: 0.5957477827868781, PV Score: -10.457942472137361, Alpha: 0.61, Beta: 0.31\n", "#Topics: 12, CV Score: 0.5069147689054222, PV Score: -10.464993667751155, Alpha: 0.61, Beta: 0.31\n", "#Topics: 4, CV Score: 0.3819785352237406, PV Score: -10.457655853911954, Alpha: 0.61, Beta: 0.61\n", "#Topics: 5, CV Score: 0.521805619062306, PV Score: -10.44289355248452, Alpha: 0.61, Beta: 
0.61\n", "#Topics: 6, CV Score: 0.4718959465808612, PV Score: -10.42865218169654, Alpha: 0.61, Beta: 0.61\n", "#Topics: 7, CV Score: 0.4666775368816065, PV Score: -10.435293881017774, Alpha: 0.61, Beta: 0.61\n", "#Topics: 8, CV Score: 0.4161034826078284, PV Score: -10.43039536604531, Alpha: 0.61, Beta: 0.61\n", "#Topics: 9, CV Score: 0.5356741464147949, PV Score: -10.430215877864402, Alpha: 0.61, Beta: 0.61\n", "#Topics: 10, CV Score: 0.6500234328720327, PV Score: -10.433181454609361, Alpha: 0.61, Beta: 0.61\n", "#Topics: 11, CV Score: 0.5724584031569051, PV Score: -10.436418913727316, Alpha: 0.61, Beta: 0.61\n", "#Topics: 12, CV Score: 0.47510144373255264, PV Score: -10.447173694241018, Alpha: 0.61, Beta: 0.61\n", "#Topics: 4, CV Score: 0.38858871959573177, PV Score: -10.422449139190153, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 5, CV Score: 0.5168433414143676, PV Score: -10.415463384416736, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 6, CV Score: 0.4623710938296017, PV Score: -10.410332603528257, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 7, CV Score: 0.47872425272635516, PV Score: -10.414848707310925, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 8, CV Score: 0.4916000111100134, PV Score: -10.42081190532211, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 9, CV Score: 0.5266551042307853, PV Score: -10.423109042698925, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 10, CV Score: 0.6308106489770785, PV Score: -10.429297527391322, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 11, CV Score: 0.5824504074419617, PV Score: -10.435341814119317, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 12, CV Score: 0.5655237958130822, PV Score: -10.446175294783924, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 4, CV Score: 0.38066789853933136, PV Score: -10.600860392033141, Alpha: 0.61, Beta: symmetric\n", "#Topics: 5, CV Score: 0.4230110894509031, PV Score: -10.624957958665691, Alpha: 0.61, Beta: symmetric\n", "#Topics: 
6, CV Score: 0.46006178600015324, PV Score: -10.612290512343902, Alpha: 0.61, Beta: symmetric\n", "#Topics: 7, CV Score: 0.4303435103117806, PV Score: -10.629508186597825, Alpha: 0.61, Beta: symmetric\n", "#Topics: 8, CV Score: 0.43866952185453756, PV Score: -10.634503260588486, Alpha: 0.61, Beta: symmetric\n", "#Topics: 9, CV Score: 0.4702721241778767, PV Score: -10.621032221974806, Alpha: 0.61, Beta: symmetric\n", "#Topics: 10, CV Score: 0.6087872469897164, PV Score: -10.634684373455967, Alpha: 0.61, Beta: symmetric\n", "#Topics: 11, CV Score: 0.5297501274565263, PV Score: -10.641694238787153, Alpha: 0.61, Beta: symmetric\n", "#Topics: 12, CV Score: 0.5432630625800524, PV Score: -10.654685933857545, Alpha: 0.61, Beta: symmetric\n", "#Topics: 4, CV Score: 0.3980327518942032, PV Score: -10.571774729929455, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 5, CV Score: 0.47100281301765873, PV Score: -10.551206821119235, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 6, CV Score: 0.45585476488442933, PV Score: -10.5221789742425, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 7, CV Score: 0.42741889953329854, PV Score: -10.520073750838211, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 8, CV Score: 0.4445519399417415, PV Score: -10.513500785821066, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 9, CV Score: 0.5316093514063003, PV Score: -10.502280711961715, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 10, CV Score: 0.6182445479757168, PV Score: -10.506699966303506, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 11, CV Score: 0.5706965784614643, PV Score: -10.509385494327319, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 12, CV Score: 0.5244982477634946, PV Score: -10.518817895245245, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 4, CV Score: 0.34370448027723666, PV Score: -10.473583266573756, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 5, CV Score: 0.42177545406640793, PV Score: -10.466430482135557, Alpha: 
0.9099999999999999, Beta: 0.61\n", "#Topics: 6, CV Score: 0.48870590420062365, PV Score: -10.45681790473505, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 7, CV Score: 0.4818942174006588, PV Score: -10.462492853055563, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 8, CV Score: 0.49058139371286213, PV Score: -10.464079562560514, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 9, CV Score: 0.5284476302536344, PV Score: -10.469358208054853, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 10, CV Score: 0.5992842730225252, PV Score: -10.474371922798735, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 11, CV Score: 0.5550594565914921, PV Score: -10.482537823118147, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 12, CV Score: 0.5130408610330546, PV Score: -10.496326340379174, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 4, CV Score: 0.3680605610706724, PV Score: -10.437968210441094, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 5, CV Score: 0.39834507895056426, PV Score: -10.434366957326267, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 6, CV Score: 0.5122840918935663, PV Score: -10.435732920054527, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 7, CV Score: 0.5242817034609356, PV Score: -10.444687489964636, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 8, CV Score: 0.525222502346446, PV Score: -10.450911057789357, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 9, CV Score: 0.5855489762461916, PV Score: -10.459363210197557, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 10, CV Score: 0.6033482162866756, PV Score: -10.46709651310468, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 11, CV Score: 0.534181138841984, PV Score: -10.478818718345869, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 12, CV Score: 0.5044175544841009, PV Score: -10.49287461972145, Alpha: 0.9099999999999999, Beta: 
0.9099999999999999\n", "#Topics: 4, CV Score: 0.4030162857954225, PV Score: -10.61948175515634, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 5, CV Score: 0.47100281301765873, PV Score: -10.643778224222777, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 6, CV Score: 0.44431460450915367, PV Score: -10.640983049810698, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 7, CV Score: 0.44194061169816934, PV Score: -10.6627668410305, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 8, CV Score: 0.43220156375405006, PV Score: -10.66934582156036, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 9, CV Score: 0.5246794752098919, PV Score: -10.666334218302142, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 10, CV Score: 0.5839917253272302, PV Score: -10.683233193788631, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 11, CV Score: 0.5812551603138659, PV Score: -10.68512322593847, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 12, CV Score: 0.5267098171572085, PV Score: -10.712358225009568, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 4, CV Score: 0.3720156705867761, PV Score: -10.526282588811258, Alpha: symmetric, Beta: 0.31\n", "#Topics: 5, CV Score: 0.49933029874610535, PV Score: -10.48844257371801, Alpha: symmetric, Beta: 0.31\n", "#Topics: 6, CV Score: 0.47205899846058585, PV Score: -10.439636548970395, Alpha: symmetric, Beta: 0.31\n", "#Topics: 7, CV Score: 0.48269249518744994, PV Score: -10.421634188787365, Alpha: symmetric, Beta: 0.31\n", "#Topics: 8, CV Score: 0.538335613683992, PV Score: -10.399271817666595, Alpha: symmetric, Beta: 0.31\n", "#Topics: 9, CV Score: 0.5761042276388716, PV Score: -10.381594889094856, Alpha: symmetric, Beta: 0.31\n", "#Topics: 10, CV Score: 0.6148075051567828, PV Score: -10.361131747776065, Alpha: symmetric, Beta: 0.31\n", "#Topics: 11, CV Score: 0.5588504038246359, PV Score: -10.345500948521796, Alpha: symmetric, Beta: 0.31\n", "#Topics: 12, CV Score: 
0.5448010772010606, PV Score: -10.345617505586791, Alpha: symmetric, Beta: 0.31\n", "#Topics: 4, CV Score: 0.39936348065904637, PV Score: -10.426149669454736, Alpha: symmetric, Beta: 0.61\n", "#Topics: 5, CV Score: 0.5090823271966699, PV Score: -10.404214549545916, Alpha: symmetric, Beta: 0.61\n", "#Topics: 6, CV Score: 0.4278461684376491, PV Score: -10.376087516260217, Alpha: symmetric, Beta: 0.61\n", "#Topics: 7, CV Score: 0.5044713577089992, PV Score: -10.365065223661983, Alpha: symmetric, Beta: 0.61\n", "#Topics: 8, CV Score: 0.5585090181447581, PV Score: -10.357648533539216, Alpha: symmetric, Beta: 0.61\n", "#Topics: 9, CV Score: 0.5308131994078027, PV Score: -10.343761327293674, Alpha: symmetric, Beta: 0.61\n", "#Topics: 10, CV Score: 0.5933548111304747, PV Score: -10.333543849654015, Alpha: symmetric, Beta: 0.61\n", "#Topics: 11, CV Score: 0.5157930553152481, PV Score: -10.326695408594487, Alpha: symmetric, Beta: 0.61\n", "#Topics: 12, CV Score: 0.6500731393929028, PV Score: -10.325261572586404, Alpha: symmetric, Beta: 0.61\n", "#Topics: 4, CV Score: 0.3724208362940652, PV Score: -10.397307737784152, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 5, CV Score: 0.6047343451735429, PV Score: -10.37632434171058, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 6, CV Score: 0.40146108571917843, PV Score: -10.358709138199426, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 7, CV Score: 0.5478418889572643, PV Score: -10.352775652604763, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 8, CV Score: 0.4965510774725434, PV Score: -10.35076287539615, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 9, CV Score: 0.5184439894201485, PV Score: -10.338520514214467, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 10, CV Score: 0.6022496614184635, PV Score: -10.33271452286526, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 11, CV Score: 0.4966577290320975, PV Score: -10.328283322344763, Alpha: symmetric, Beta: 
0.9099999999999999\n", "#Topics: 12, CV Score: 0.6261571238431066, PV Score: -10.32532208498972, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 4, CV Score: 0.37181231277776183, PV Score: -10.5737851457114, Alpha: symmetric, Beta: symmetric\n", "#Topics: 5, CV Score: 0.49982175351658986, PV Score: -10.57991503850104, Alpha: symmetric, Beta: symmetric\n", "#Topics: 6, CV Score: 0.4623746453802937, PV Score: -10.558772260964222, Alpha: symmetric, Beta: symmetric\n", "#Topics: 7, CV Score: 0.4884610769859329, PV Score: -10.570459578130873, Alpha: symmetric, Beta: symmetric\n", "#Topics: 8, CV Score: 0.5131836632736474, PV Score: -10.559869552593637, Alpha: symmetric, Beta: symmetric\n", "#Topics: 9, CV Score: 0.5414297105648862, PV Score: -10.529852182357658, Alpha: symmetric, Beta: symmetric\n", "#Topics: 10, CV Score: 0.6785044360557178, PV Score: -10.535124275209565, Alpha: symmetric, Beta: symmetric\n", "#Topics: 11, CV Score: 0.5657397622843224, PV Score: -10.52133404122993, Alpha: symmetric, Beta: symmetric\n", "#Topics: 12, CV Score: 0.582327359714131, PV Score: -10.526610298202412, Alpha: symmetric, Beta: symmetric\n", "#Topics: 4, CV Score: 0.43912942092956697, PV Score: -10.522848808310528, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 5, CV Score: 0.40751184962270537, PV Score: -10.48541066901554, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 6, CV Score: 0.4542494682857962, PV Score: -10.442773054340485, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 7, CV Score: 0.47041200319916493, PV Score: -10.420479103000723, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 8, CV Score: 0.49233261299212405, PV Score: -10.403763191353837, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 9, CV Score: 0.45907092550739237, PV Score: -10.3786293878649, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 10, CV Score: 0.6217993088001614, PV Score: -10.36148775185113, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 11, CV Score: 0.5185376069390376, PV Score: -10.349232810275204, Alpha: 
asymmetric, Beta: 0.31\n", "#Topics: 12, CV Score: 0.5034307005049391, PV Score: -10.34076712188869, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 4, CV Score: 0.515628236666662, PV Score: -10.422196504217826, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 5, CV Score: 0.423162935257951, PV Score: -10.398368311505568, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 6, CV Score: 0.45944499325883426, PV Score: -10.374638816393565, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 7, CV Score: 0.4820831491535998, PV Score: -10.36267991668515, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 8, CV Score: 0.4036989725279696, PV Score: -10.354531424153524, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 9, CV Score: 0.45919112117643734, PV Score: -10.342620369274787, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 10, CV Score: 0.6451794871459123, PV Score: -10.330084574702694, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 11, CV Score: 0.572154524993651, PV Score: -10.324677755836108, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 12, CV Score: 0.6194824233122164, PV Score: -10.323963042788398, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 4, CV Score: 0.519154732232678, PV Score: -10.389407917509194, Alpha: asymmetric, Beta: 0.9099999999999999\n", "#Topics: 5, CV Score: 0.4151559388052012, PV Score: -10.373488513541524, Alpha: asymmetric, Beta: 0.9099999999999999\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "File \u001b[0;32m:10\u001b[0m\n", "Cell \u001b[0;32mIn[22], line 14\u001b[0m, in \u001b[0;36mcompute_coherence_values\u001b[0;34m(corpus, dictionary, k, a, b)\u001b[0m\n\u001b[1;32m 4\u001b[0m lda_model \u001b[38;5;241m=\u001b[39m gensim\u001b[38;5;241m.\u001b[39mmodels\u001b[38;5;241m.\u001b[39mLdaMulticore(corpus\u001b[38;5;241m=\u001b[39mcorpus,\n\u001b[1;32m 5\u001b[0m 
id2word\u001b[38;5;241m=\u001b[39mdictionary,\n\u001b[1;32m 6\u001b[0m num_topics\u001b[38;5;241m=\u001b[39mk, \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 10\u001b[0m alpha\u001b[38;5;241m=\u001b[39ma,\n\u001b[1;32m 11\u001b[0m eta\u001b[38;5;241m=\u001b[39mb)\n\u001b[1;32m 13\u001b[0m coherence_model_lda \u001b[38;5;241m=\u001b[39m CoherenceModel(model\u001b[38;5;241m=\u001b[39mlda_model, texts\u001b[38;5;241m=\u001b[39mheadline, dictionary\u001b[38;5;241m=\u001b[39mdoc_dict, coherence\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc_v\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 14\u001b[0m coherence \u001b[38;5;241m=\u001b[39m \u001b[43mcoherence_model_lda\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_coherence\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 15\u001b[0m perplex \u001b[38;5;241m=\u001b[39m lda_model\u001b[38;5;241m.\u001b[39mlog_perplexity(docs_vecs, total_docs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m) \n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m coherence, perplex\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/coherencemodel.py:614\u001b[0m, in \u001b[0;36mCoherenceModel.get_coherence\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 605\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_coherence\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 606\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Get coherence value based on pipeline parameters.\u001b[39;00m\n\u001b[1;32m 607\u001b[0m \n\u001b[1;32m 608\u001b[0m \u001b[38;5;124;03m Returns\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 612\u001b[0m \n\u001b[1;32m 613\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 614\u001b[0m confirmed_measures \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_coherence_per_topic\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 615\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maggregate_measures(confirmed_measures)\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/coherencemodel.py:574\u001b[0m, in \u001b[0;36mCoherenceModel.get_coherence_per_topic\u001b[0;34m(self, segmented_topics, with_std, with_support)\u001b[0m\n\u001b[1;32m 572\u001b[0m segmented_topics \u001b[38;5;241m=\u001b[39m measure\u001b[38;5;241m.\u001b[39mseg(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtopics)\n\u001b[1;32m 573\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_accumulator \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 574\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mestimate_probabilities\u001b[49m\u001b[43m(\u001b[49m\u001b[43msegmented_topics\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 576\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(with_std\u001b[38;5;241m=\u001b[39mwith_std, with_support\u001b[38;5;241m=\u001b[39mwith_support)\n\u001b[1;32m 577\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcoherence \u001b[38;5;129;01min\u001b[39;00m BOOLEAN_DOCUMENT_BASED \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcoherence \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc_w2v\u001b[39m\u001b[38;5;124m'\u001b[39m:\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/coherencemodel.py:546\u001b[0m, in \u001b[0;36mCoherenceModel.estimate_probabilities\u001b[0;34m(self, segmented_topics)\u001b[0m\n\u001b[1;32m 543\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcoherence \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc_w2v\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 544\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeyed_vectors\n\u001b[0;32m--> 546\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_accumulator \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmeasure\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprob\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 548\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_accumulator\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/topic_coherence/probability_estimation.py:156\u001b[0m, in \u001b[0;36mp_boolean_sliding_window\u001b[0;34m(texts, segmented_topics, dictionary, window_size, processes)\u001b[0m\n\u001b[1;32m 154\u001b[0m accumulator \u001b[38;5;241m=\u001b[39m ParallelWordOccurrenceAccumulator(processes, top_ids, dictionary)\n\u001b[1;32m 155\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124musing \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m to estimate probabilities from sliding windows\u001b[39m\u001b[38;5;124m\"\u001b[39m, accumulator)\n\u001b[0;32m--> 156\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43maccumulator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maccumulate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtexts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwindow_size\u001b[49m\u001b[43m)\u001b[49m\n", "File 
\u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/topic_coherence/text_analysis.py:437\u001b[0m, in \u001b[0;36mParallelWordOccurrenceAccumulator.accumulate\u001b[0;34m(self, texts, window_size)\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21maccumulate\u001b[39m(\u001b[38;5;28mself\u001b[39m, texts, window_size):\n\u001b[0;32m--> 437\u001b[0m workers, input_q, output_q \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstart_workers\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwindow_size\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 438\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 439\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mqueue_all_texts(input_q, texts, window_size)\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/topic_coherence/text_analysis.py:471\u001b[0m, in \u001b[0;36mParallelWordOccurrenceAccumulator.start_workers\u001b[0;34m(self, window_size)\u001b[0m\n\u001b[1;32m 469\u001b[0m accumulator \u001b[38;5;241m=\u001b[39m PatchedWordOccurrenceAccumulator(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrelevant_ids, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdictionary)\n\u001b[1;32m 470\u001b[0m worker \u001b[38;5;241m=\u001b[39m AccumulatingWorker(input_q, output_q, accumulator, window_size)\n\u001b[0;32m--> 471\u001b[0m \u001b[43mworker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstart\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 472\u001b[0m workers\u001b[38;5;241m.\u001b[39mappend(worker)\n\u001b[1;32m 474\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m workers, input_q, output_q\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/process.py:121\u001b[0m, in \u001b[0;36mBaseProcess.start\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m 
\u001b[38;5;129;01mnot\u001b[39;00m _current_process\u001b[38;5;241m.\u001b[39m_config\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdaemon\u001b[39m\u001b[38;5;124m'\u001b[39m), \\\n\u001b[1;32m 119\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdaemonic processes are not allowed to have children\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 120\u001b[0m _cleanup()\n\u001b[0;32m--> 121\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_popen \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_Popen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 122\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sentinel \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_popen\u001b[38;5;241m.\u001b[39msentinel\n\u001b[1;32m 123\u001b[0m \u001b[38;5;66;03m# Avoid a refcycle if the target function holds an indirect\u001b[39;00m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;66;03m# reference to the process object (see bpo-30775)\u001b[39;00m\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/context.py:224\u001b[0m, in \u001b[0;36mProcess._Popen\u001b[0;34m(process_obj)\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_Popen\u001b[39m(process_obj):\n\u001b[0;32m--> 224\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_default_context\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_context\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mProcess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_Popen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocess_obj\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/context.py:289\u001b[0m, in 
\u001b[0;36mSpawnProcess._Popen\u001b[0;34m(process_obj)\u001b[0m\n\u001b[1;32m 286\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_Popen\u001b[39m(process_obj):\n\u001b[1;32m 288\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpopen_spawn_posix\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Popen\n\u001b[0;32m--> 289\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mPopen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocess_obj\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/popen_spawn_posix.py:32\u001b[0m, in \u001b[0;36mPopen.__init__\u001b[0;34m(self, process_obj)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, process_obj):\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fds \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m---> 32\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mprocess_obj\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/popen_fork.py:19\u001b[0m, in \u001b[0;36mPopen.__init__\u001b[0;34m(self, process_obj)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfinalizer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m---> 19\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_launch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocess_obj\u001b[49m\u001b[43m)\u001b[49m\n", "File 
\u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/popen_spawn_posix.py:62\u001b[0m, in \u001b[0;36mPopen._launch\u001b[0;34m(self, process_obj)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msentinel \u001b[38;5;241m=\u001b[39m parent_r\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(parent_w, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwb\u001b[39m\u001b[38;5;124m'\u001b[39m, closefd\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m---> 62\u001b[0m \u001b[43mf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetbuffer\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 64\u001b[0m fds_to_close \u001b[38;5;241m=\u001b[39m []\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "%%time\n", "\n", "import datetime\n", "import numpy as np\n", "from gensim.models import CoherenceModel\n", "\n", "print(datetime.datetime.now())\n", "\n", "for a in alpha:\n", " for b in beta:\n", " for num in num_topics:\n", " cv, pv = compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=num, a=a, b=b) \n", "\n", " model_topics.append(num) \n", " coherence_values.append(cv) \n", " perplexity_values.append(pv)\n", " alpha_result.append(a)\n", " beta_result.append(b)\n", " print(\"#Topics: \" + str(num) + \", CV Score: \" + str(coherence_values[-1]) + \", PV Score: \" + str(perplexity_values[-1]) + \", Alpha: \" + str(alpha_result[-1]) + \", Beta: \" + str(beta_result[-1]))\n", " \n", "print(datetime.datetime.now())" ] }, { "cell_type": "markdown", "id": "364ff6d5-e3da-4dde-a2c8-5375fc5d711f", "metadata": {}, "source": [ "The table below reveals the top 20 fine tuned models with best combinations 
of coherence score and perplexity score. It was sorted by the coherence score in descending order as a higher coherence score indicates a better model, and sorted the perplexity score in ascending order as a lower perplexity score indicates a better model. While coherence score evaluates the quality of the topics, the perplexity score evaluates the overall performance of the model in predicting new documents. Usually, the coherence score is a better metric to use if the goal is to obtain topics that are semantically coherent and interpretable. Perplexity score, on the other hand, is a better metric to use if the goal is to build a model that generalises well to new data, in other words, how confident the model is in predicting the new data (Sánchez-Aguayo, et al., 2022). Ultimately, we aim to get a balance between the perplexity value and coherence score when determining our final model." ] }, { "cell_type": "code", "execution_count": null, "id": "78a60032-a4d7-44d4-841c-a1bd3740d5dd", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." ] } ], "source": [ "# Find the top 20 combinations based on Coherence Score and Perplexity Score\n", "result = pd.DataFrame(\n", " {'Topics': model_topics,\n", " 'Coherence Score': coherence_values,\n", " 'Perplexity Score': perplexity_values,\n", " 'Alpha': alpha_result,\n", " 'Beta': beta_result\n", " })\n", "result.sort_values(by=['Coherence Score', 'Perplexity Score'], ascending=[False, True]).head(20)" ] }, { "cell_type": "code", "execution_count": null, "id": "3461df57-c069-4ad2-80d7-8890dec9438e", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. 
\n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." ] } ], "source": [ "result.to_csv('data/lda_fine_tuning_result_severe.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "800e5a4b-7302-42e8-97b0-5b598c1c80ae", "metadata": { "scrolled": true }, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." ] } ], "source": [ "# Show graph Topics vs Coherence Score\n", "result.groupby('Alpha').plot(x='Topics', y='Coherence Score', legend = True)" ] }, { "cell_type": "code", "execution_count": null, "id": "26996b89-0e7a-4f2d-8cf7-c4a716569bc2", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." ] } ], "source": [ "# Show graph Topics vs Coherence Score\n", "\n", "plt.plot(model_topics, coherence_values)\n", "plt.xlabel(\"Num Topics\")\n", "plt.ylabel(\"Coherence Score\")\n", "plt.legend((\"Coherence Score\"), loc='best')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "91d2f4c1-de77-44b6-b41b-fcc9a07233e8", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details."
] } ], "source": [ "# Show graph Topics vs Perplexity Score\n", "\n", "plt.plot(model_topics, perplexity_values)\n", "plt.xlabel(\"Num Topics\")\n", "plt.ylabel(\"Perplexity score\")\n", "plt.legend((\"perplexity_values\"), loc='best')\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "cdc3ddd2-f743-4e5b-b6c6-2656e0b77aec", "metadata": {}, "source": [ "## Final Model" ] }, { "cell_type": "markdown", "id": "8a4196d2-0f8a-4b0b-a6dd-ead9441af44e", "metadata": {}, "source": [ "Topic 4, 8, 9 and 10 were selected for further evaluation using the visual graphs, considering that the best combination does not always yield the best result since a model with a higher number of topics tends to have a better measurable result but may not fit the data the most. \n", "\n", "However, note that even though the random_state was preset and all other parameters were fixed, there is still some randomness involved, so the model may produce inconsistent output each time. " ] }, { "cell_type": "markdown", "id": "df1c00ad-ba54-4686-ac75-ef1033066dce", "metadata": {}, "source": [ "Unfortunately, altering the number of topics has little effect on the results, and the news articles are not clustered into relevant topics properly. Also, most topics are stacked together, indicating high similarity and ambiguity among them due to the multi-aspect nature of the news contents. As a result, LDA may not be a suitable solution for this kind of news content. The same result holds for the moderate and minor categories." ] }, { "cell_type": "code", "execution_count": null, "id": "490734ed-077c-4fb0-930c-0b42f4f63c94", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details."
] } ], "source": [ "# realised that there may be some overlaps for 8 topics, thus 4-6 topics are optimal\n", "k = 8\n", "a = 'asymmetric'\n", "# a = 0.91\n", "# b = 0.61\n", "b = 'symmetric'\n", "\n", "\n", "\n", "final_model = gensim.models.LdaMulticore(corpus=docs_vecs,\n", " id2word=doc_dict,\n", " num_topics=k, \n", " random_state=42,\n", " chunksize=100,\n", " passes=10,\n", " alpha=a,\n", " eta=b)" ] }, { "cell_type": "code", "execution_count": null, "id": "afe8abf0-2d12-414e-92be-a655865addb1", "metadata": { "tags": [] }, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." ] } ], "source": [ "compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=k, a=a, b=b) " ] }, { "cell_type": "code", "execution_count": null, "id": "8430a827-6dbb-4737-8ccc-78ed17a01234", "metadata": { "tags": [] }, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." ] } ], "source": [ "#Set up the environment to display the graphical outputs\n", "# feed the LDA model into the pyLDAvis instance\n", "pyLDAvis.enable_notebook()\n", "visual= gensimvis.prepare(final_model, docs_vecs, doc_dict)\n", "\n", "#Save the output to the html file\n", "pyLDAvis.save_html(visual, \"data/topic_viz8_severe_training.html\")" ] }, { "cell_type": "code", "execution_count": null, "id": "5e30d71a-a3c7-40c7-94c0-7eea1bedc887", "metadata": { "tags": [] }, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. 
\n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." ] } ], "source": [ "final_model.print_topics(num_words=30)" ] }, { "cell_type": "markdown", "id": "4fc5e753-e0e7-4520-9e7d-10e26e4d580d", "metadata": {}, "source": [ "This allows easy access to the trained model for future prediction work." ] }, { "cell_type": "code", "execution_count": null, "id": "84eb2746-173a-4283-bca5-681f77548698", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." ] } ], "source": [ "#Save a model to disk, or reload a pre-trained model\n", "# naming convention: final_model_topic_alpha_eta\n", "final_model.save(\"final_model_8_asym_sym\")" ] }, { "cell_type": "markdown", "id": "466c3952-69d1-4d19-b49c-d4b1e1844572", "metadata": {}, "source": [ "Get dominant topics and topic percentage contribution.\n", "Made use of Gensim LDA's built-in function: https://radimrehurek.com/gensim/models/ldamodel.html" ] }, { "cell_type": "code", "execution_count": null, "id": "cd88034c-2fb8-4f1f-a4e8-85d09b4fc1dc", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details."
] } ], "source": [ "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "def format_topics_sentences(ldamodel, corpus, data):\n", " # Preallocate memory for the DataFrame\n", " num_docs = len(corpus)\n", " sent_topics = {'Dominant_Topic': [0] * num_docs, 'Perc_Contribution': [0.0] * num_docs, 'Topic_Distribution': [()] * num_docs}\n", " \n", " # Get main topic in each document\n", " for i, row in enumerate(ldamodel[corpus]):\n", " row = sorted(row, key=lambda x: (x[1]), reverse=True)\n", " if row:\n", " # Get the Dominant topic, Perc Contribution and Keywords for each document\n", " dominant_topic, perc_contribution = row[0]\n", " topic_distribution = row\n", " sent_topics['Dominant_Topic'][i] = int(dominant_topic)\n", " sent_topics['Perc_Contribution'][i] = round(perc_contribution, 4)\n", " sent_topics['Topic_Distribution'][i] = topic_distribution\n", "\n", " # Create the DataFrame\n", " sent_topics_df = pd.DataFrame(sent_topics)\n", " sent_topics_df['Text'] = data\n", "\n", " return sent_topics_df" ] }, { "cell_type": "code", "execution_count": null, "id": "24d3ff60-035e-4133-9ffd-88cce5cdccb1", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." ] } ], "source": [ "df_topic_sents_keywords = format_topics_sentences(ldamodel=final_model, corpus=docs_vecs, data=cleaned.Headline_Details)" ] }, { "cell_type": "code", "execution_count": null, "id": "c88b088b", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." 
] } ], "source": [ "# Format\n", "df_dominant_topic = df_topic_sents_keywords.reset_index()\n", "df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Topic_Distribution', 'Text']\n", "\n", "# Show\n", "df_dominant_topic.head(10)" ] }, { "cell_type": "markdown", "id": "560da382-aa86-4df1-8b85-56b057a27cd4", "metadata": {}, "source": [ "# Result Analysis" ] }, { "cell_type": "code", "execution_count": null, "id": "4fe6b40b-6922-4de3-8d9e-dac7474b6303", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." ] } ], "source": [ "df_dominant_topic[\"Dominant_Topic\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "b9917340-31cf-48af-871f-b481128fdf22", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." 
] } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "# Get value counts of each topic\n", "topic_counts = df_dominant_topic[\"Dominant_Topic\"].value_counts()\n", "\n", "# Create a bar plot\n", "plt.figure(figsize=(8, 6))\n", "topic_counts.plot(kind=\"bar\", color=\"skyblue\")\n", "\n", "# Add labels to the bars\n", "for i, count in enumerate(topic_counts):\n", " plt.text(i, count, str(count), ha=\"center\", va=\"bottom\")\n", "\n", "# Add labels and title\n", "plt.xlabel(\"Topics\")\n", "plt.ylabel(\"Number of News\")\n", "plt.title(\"Topic Distribution\")\n", "\n", "# Show the plot\n", "plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n", "plt.tight_layout()\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "fffa1e57-f975-4469-a42b-19d76c60fb66", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." ] } ], "source": [ "df_dominant_topic.sort_values(by='Topic_Perc_Contrib', ascending=False).head(20)" ] }, { "cell_type": "code", "execution_count": null, "id": "8510f506-141f-4382-b668-251df1afc95f", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." 
] } ], "source": [ "# Sample 100 rows, can change the random_state for different samples\n", "sampled_data = df_dominant_topic.sample(n=100, random_state=42) \n", "sampled_df = pd.DataFrame(sampled_data).reset_index()\n", "sampled_df.to_csv('data/sample_severe.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "16388596-a1d6-4509-acac-6dd57220554a", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to start the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." ] } ], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }