{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "46322fb5-5918-4b70-9689-9e0781439ac4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents\n", "loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "import sys\n", "from pathlib import Path\n", "\n", "workding_dir = str(Path.cwd().parent)\n", "os.chdir(workding_dir)\n", "sys.path.append(workding_dir)\n", "print(\"workding dir:\", workding_dir)\n", "\n", "from dotenv import find_dotenv, load_dotenv\n", "\n", "found_dotenv = find_dotenv(\".env\")\n", "\n", "if len(found_dotenv) == 0:\n", " found_dotenv = find_dotenv(\".env.example\")\n", "print(f\"loading env vars from: {found_dotenv}\")\n", "load_dotenv(found_dotenv, override=True)" ] }, { "cell_type": "code", "execution_count": 2, "id": "daf1e3d1-75ac-4299-8bed-2f413a49f9a6", "metadata": { "tags": [] }, "outputs": [], "source": [ "import nltk\n", "from nltk.tokenize import sent_tokenize\n", "from nltk.tokenize import word_tokenize\n", "\n", "import gensim\n", "from gensim import corpora\n", "from gensim import similarities\n", "from gensim import models\n", "from gensim.models import CoherenceModel\n", "\n", "# from wordcloud import WordCloud, ImageColorGenerator\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import pandas as pd\n", "import re\n", "import os\n", "import datetime\n", "\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "from pprint import pprint\n", "import pyLDAvis\n", "import pyLDAvis.gensim_models as gensimvis" ] }, { "cell_type": "markdown", "id": "49e6de6b-71bd-4948-8827-52601406058f", "metadata": {}, "source": [ "# Import Data" ] }, { "cell_type": 
"code", "execution_count": 3, "id": "49222182-7811-4fa6-8c0a-21d3a546863e", "metadata": {}, "outputs": [], "source": [ "df = pd.read_parquet('data/processed_data2.parquet')" ] }, { "cell_type": "code", "execution_count": 4, "id": "3fb59a30", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idHeadlineDetailsSeverityCategoryRegionDatetimeYearlatlon...if_labeledMonthWeekHeadline_Detailsurltitlecontentcleaned_contentbinary_contentword_count
01Grasberg Mine- Grasberg mine workers extend st...Media sources indicate that workers at the Gra...ModerateMine Workers StrikeIndonesia28/5/17 17:082017.0-4.05608137.11302...False5.021.0Grasberg Mine- Grasberg mine workers extend st...https://news.google.com/rss/articles/CBMiZ2h0d...Freeport Indonesia mine workers extend strike ...Trucks are seen on a road in the Grasberg copp...[truck, be, see, on, road, in, grasberg, coppe...[adkerson_jakarta_try, agreement_freeport_indo...53
13Shanghai port congestion impacts terminals in ...The persisting port congestion at Shanghai’s Y...MinorPort CongestionChina27/4/17 9:162017.029.52000121.33190...False4.017.0Shanghai port congestion impacts terminals in ...https://news.google.com/rss/articles/CBMiVWh0d...Typhoon Muifa to shut China ports for second t...By Sam Whelan 13/09/2022\\n\\nAnother typhoon ha...[by, sam, whelan, typhoon, have, prompt, port,...[additional_ripple_effect, avoid_path_typhoon,...44
25UPDATE - Indonesia: Police confirm two explosi...According to local police in Jakarta, two expl...ExtremeBombing, Police OperationsIndonesia24/5/17 16:202017.0NaNNaN...True5.021.0UPDATE - Indonesia: Police confirm two explosi...https://news.google.com/rss/articles/CBMiZWh0d...Jakarta Police Receive 2 More Reports on Coldp...TEMPO.CO, Jakarta - South Jakarta Metro Police...[jakarta, south, jakarta, metro, police, recei...[actress_accord, available_day_concert, click_...24
36UPDATE - Indonesia: Severe winds damage infras...Severe winds have downed billboards and trees ...ModerateRoadway Closure / Disruption, Flooding, Severe...Indonesia19/4/17 9:102017.0-6.91264107.65700...True4.016.0UPDATE - Indonesia: Severe winds damage infras...https://news.google.com/rss/articles/CBMiSWh0d...Indonesia hit by some of strongest winds recordedA man stands near damaged houses following a t...[man, stand, near, damage, house, follow, torn...[bbc_indonesia, climatologist_government_resea...28
4142 miles E of Chesterfield - A tornado has touc...Government sources are reporting a tornado has...MinorTornadoUnited States17/9/18 19:552018.037.51000-77.61000...True9.038.02 miles E of Chesterfield - A tornado has touc...https://news.google.com/rss/articles/CBMigAFod...UPDATE: Number of homes without power down to ...More than 90,000 homes and businesses across t...[more, than, home, business, across, richmond,...[advise_seek_alternate, affect_richmond, alter...134
\n", "

5 rows × 23 columns

\n", "
" ], "text/plain": [ " id Headline \\\n", "0 1 Grasberg Mine- Grasberg mine workers extend st... \n", "1 3 Shanghai port congestion impacts terminals in ... \n", "2 5 UPDATE - Indonesia: Police confirm two explosi... \n", "3 6 UPDATE - Indonesia: Severe winds damage infras... \n", "4 14 2 miles E of Chesterfield - A tornado has touc... \n", "\n", " Details Severity \\\n", "0 Media sources indicate that workers at the Gra... Moderate \n", "1 The persisting port congestion at Shanghai’s Y... Minor \n", "2 According to local police in Jakarta, two expl... Extreme \n", "3 Severe winds have downed billboards and trees ... Moderate \n", "4 Government sources are reporting a tornado has... Minor \n", "\n", " Category Region \\\n", "0 Mine Workers Strike Indonesia \n", "1 Port Congestion China \n", "2 Bombing, Police Operations Indonesia \n", "3 Roadway Closure / Disruption, Flooding, Severe... Indonesia \n", "4 Tornado United States \n", "\n", " Datetime Year lat lon ... if_labeled Month Week \\\n", "0 28/5/17 17:08 2017.0 -4.05608 137.11302 ... False 5.0 21.0 \n", "1 27/4/17 9:16 2017.0 29.52000 121.33190 ... False 4.0 17.0 \n", "2 24/5/17 16:20 2017.0 NaN NaN ... True 5.0 21.0 \n", "3 19/4/17 9:10 2017.0 -6.91264 107.65700 ... True 4.0 16.0 \n", "4 17/9/18 19:55 2018.0 37.51000 -77.61000 ... True 9.0 38.0 \n", "\n", " Headline_Details \\\n", "0 Grasberg Mine- Grasberg mine workers extend st... \n", "1 Shanghai port congestion impacts terminals in ... \n", "2 UPDATE - Indonesia: Police confirm two explosi... \n", "3 UPDATE - Indonesia: Severe winds damage infras... \n", "4 2 miles E of Chesterfield - A tornado has touc... \n", "\n", " url \\\n", "0 https://news.google.com/rss/articles/CBMiZ2h0d... \n", "1 https://news.google.com/rss/articles/CBMiVWh0d... \n", "2 https://news.google.com/rss/articles/CBMiZWh0d... \n", "3 https://news.google.com/rss/articles/CBMiSWh0d... \n", "4 https://news.google.com/rss/articles/CBMigAFod... 
\n", "\n", " title \\\n", "0 Freeport Indonesia mine workers extend strike ... \n", "1 Typhoon Muifa to shut China ports for second t... \n", "2 Jakarta Police Receive 2 More Reports on Coldp... \n", "3 Indonesia hit by some of strongest winds recorded \n", "4 UPDATE: Number of homes without power down to ... \n", "\n", " content \\\n", "0 Trucks are seen on a road in the Grasberg copp... \n", "1 By Sam Whelan 13/09/2022\\n\\nAnother typhoon ha... \n", "2 TEMPO.CO, Jakarta - South Jakarta Metro Police... \n", "3 A man stands near damaged houses following a t... \n", "4 More than 90,000 homes and businesses across t... \n", "\n", " cleaned_content \\\n", "0 [truck, be, see, on, road, in, grasberg, coppe... \n", "1 [by, sam, whelan, typhoon, have, prompt, port,... \n", "2 [jakarta, south, jakarta, metro, police, recei... \n", "3 [man, stand, near, damage, house, follow, torn... \n", "4 [more, than, home, business, across, richmond,... \n", "\n", " binary_content word_count \n", "0 [adkerson_jakarta_try, agreement_freeport_indo... 53 \n", "1 [additional_ripple_effect, avoid_path_typhoon,... 44 \n", "2 [actress_accord, available_day_concert, click_... 24 \n", "3 [bbc_indonesia, climatologist_government_resea... 28 \n", "4 [advise_seek_alternate, affect_richmond, alter... 
134 \n", "\n", "[5 rows x 23 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "09113e88-66cc-414c-a953-da04db83c4ae", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(3681, 23)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "markdown", "id": "037e74fc-bbcd-43e3-8346-799920cca8d8", "metadata": {}, "source": [ "# Vectorisation" ] }, { "cell_type": "markdown", "id": "d67cef3a-59fb-4dd8-adc8-2cf288b90728", "metadata": {}, "source": [ "NLP vectorization refers to the process of converting text data into numerical vectors that machine learning algorithms can understand and process. \n", "\n", "Bag-of-Words (BoW) is used here that represents text as a collection of unique words along with their frequencies. Each word is assigned an index, and the vector contains the count of each word present in the document." 
] }, { "cell_type": "code", "execution_count": 6, "id": "c95b7b8a-9767-469d-812d-c9a9d9fee0e9", "metadata": {}, "outputs": [], "source": [ "df_copy = df.copy()" ] }, { "cell_type": "code", "execution_count": 7, "id": "dfb2001e-04c1-49dc-b423-a64ea47af5a9", "metadata": {}, "outputs": [], "source": [ "# keep only the 'Minor' severity incidents for this model\n", "# (reset_index is chained so the filtered slice is never mutated in place)\n", "cleaned = df_copy[df_copy['Severity'].isin(['Minor'])].reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 8, "id": "3da09b6a-65c6-4f40-9a21-e0b798318ca5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1620, 23)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cleaned.shape" ] }, { "cell_type": "code", "execution_count": 9, "id": "de71c523-a59e-44b2-aa96-5f17d872c9c6", "metadata": {}, "outputs": [], "source": [ "headline = cleaned.cleaned_content" ] }, { "cell_type": "code", "execution_count": 10, "id": "5b1e34e1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['man', 'be', 'seriously', 'injure', 'after', 'boat', 'catch',\n", " 'fire', 'in', 'sydney', 'marina', 'today', 'emergency', 'service',\n", " 'be', 'call', 'birkenhead', 'point', 'marina', 'in', 'drummoyne',\n", " 'shortly', 'after', 'follow', 'report', 'of', 'explosion', 'on',\n", " 'boat', 'careflight', 's', 'rapid', 'response', 'helicopter', 'be',\n", " 'task', 'at', 'crew', 'include', 'doctor', 'intensive', 'care',\n", " 'paramedic', 'fly', 'drummoyne', 'land', 'in', 'nearby', 'brett',\n", " 'park', 'just', 'minute', 'late', 'careflight', 'boat', 'catch',\n", " 'fire', 'near', 'birkenhead', 'point', 'shopping', 'outlet',\n", " 'simon', 'r', 'supply', 'reader', 'image', 'of', 'boat', 'on',\n", " 'fire', 'at', 'marina', 'in', 'drummoyne', 'simon', 'r', 'supply',\n", " 'three', 'nsw', 'ambulance', 'crew', 'careflight', 'chopper',\n", " 'attend', 'scene', 'find', 'man', 'suffer', 'serious', 'burn',\n", " 'legs', 'arm', 'shoulder', 
'man', 'suffer', 'burn', 'percent',\n", " 'of', 'body', 'boat', 'reportedly', 'explode', 'into', 'flame',\n", " 'shortly', 'before', 'morning', 'paramedic', 'treat', 'man', 'at',\n", " 'scene', 'with', 'burn', 'percent', 'of', 'body', 'firefighting',\n", " 'boat', 'be', 'deploy', 'help', 'extinguish', 'blaze', 'man', 'be',\n", " 'treat', 'on', 'scene', 'be', 'surround', 'by', 'family', 'be',\n", " 'not', 'onboard', 'boat', 'at', 'time'], dtype=object)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "headline[5]" ] }, { "cell_type": "code", "execution_count": 11, "id": "677055b4-978e-4253-90f4-3f903662e225", "metadata": { "tags": [] }, "outputs": [], "source": [ "# vectorise the words\n", "doc_dict = gensim.corpora.Dictionary(headline)\n", "docs_vecs = [doc_dict.doc2bow(doc) for doc in headline]" ] }, { "cell_type": "code", "execution_count": 12, "id": "a54d1768-b069-4936-a156-deaf0b506d93", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of unique tokens: 35192\n", "Number of articles: 1620\n" ] } ], "source": [ "print('Number of unique tokens: %d' % len(doc_dict)) \n", "print('Number of articles: %d' % len(docs_vecs)) " ] }, { "cell_type": "code", "execution_count": 13, "id": "9147fa86-1503-4252-bd9b-92fea1e6a926", "metadata": { "scrolled": true, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('be', 58147),\n", " ('of', 53324),\n", " ('in', 42710),\n", " ('for', 19846),\n", " ('on', 18899),\n", " ('have', 18769),\n", " ('with', 13029),\n", " ('as', 12079),\n", " ('port', 11548),\n", " ('from', 11033),\n", " ('at', 10589),\n", " ('by', 10527),\n", " ('s', 8527),\n", " ('will', 8465),\n", " ('that', 7825),\n", " ('say', 7752),\n", " ('not', 5664),\n", " ('china', 5654),\n", " ('day', 5304),\n", " ('more', 5266),\n", " ('new', 5198),\n", " ('service', 5001),\n", " ('also', 4707),\n", " ('ship', 4693),\n", " ('time', 4651),\n", " 
('than', 3673),\n", " ('year', 3669),\n", " ('trade', 3518),\n", " ('can', 3504),\n", " ('state', 3378),\n", " ('between', 3226),\n", " ('include', 3220),\n", " ('over', 3173),\n", " ('supply', 3094),\n", " ('do', 3083),\n", " ('vessel', 3079),\n", " ('report', 2988),\n", " ('country', 2979),\n", " ('after', 2962),\n", " ('area', 2913),\n", " ('cargo', 2908),\n", " ('one', 2905),\n", " ('people', 2868),\n", " ('other', 2866),\n", " ('increase', 2852),\n", " ('continue', 2787),\n", " ('market', 2766),\n", " ('container', 2765),\n", " ('strike', 2713),\n", " ('chain', 2708),\n", " ('expect', 2656),\n", " ('work', 2624),\n", " ('while', 2590),\n", " ('high', 2579),\n", " ('about', 2514),\n", " ('http', 2489),\n", " ('into', 2470),\n", " ('would', 2461),\n", " ('remain', 2448),\n", " ('pm', 2448),\n", " ('government', 2437),\n", " ('take', 2425),\n", " ('two', 2349),\n", " ('terminal', 2348),\n", " ('may', 2332),\n", " ('due', 2325),\n", " ('use', 2313),\n", " ('update', 2310),\n", " ('city', 2295),\n", " ('train', 2290),\n", " ('make', 2278),\n", " ('info', 2253),\n", " ('demand', 2240),\n", " ('world', 2219),\n", " ('asia', 2194),\n", " ('south', 2160),\n", " ('company', 2148),\n", " ('march', 2125),\n", " ('see', 2101),\n", " ('large', 2097),\n", " ('customer', 2081),\n", " ('through', 2042),\n", " ('most', 2041),\n", " ('plan', 2038),\n", " ('global', 2031),\n", " ('first', 2011),\n", " ('line', 1962),\n", " ('north', 1961),\n", " ('could', 1948),\n", " ('operation', 1931),\n", " ('good', 1928),\n", " ('if', 1906),\n", " ('across', 1893),\n", " ('coast', 1887),\n", " ('business', 1863),\n", " ('when', 1825),\n", " ('week', 1813),\n", " ('during', 1805),\n", " ('last', 1771),\n", " ('million', 1770)]\n" ] } ], "source": [ "# Calculate word frequencies\n", "word_frequencies = {doc_dict[word_id]: freq for word_id, freq in doc_dict.cfs.items()}\n", "sorted_words = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)\n", "\n", "pprint(sorted_words[:100])" 
] }, { "cell_type": "markdown", "id": "5ed78239-2ce1-4784-a8f4-4c7438c8627b", "metadata": {}, "source": [ "# LDA Modelling" ] }, { "cell_type": "markdown", "id": "9db83273-461d-4f70-b23f-ec967579d94f", "metadata": {}, "source": [ "## Benchmark Model" ] }, { "cell_type": "code", "execution_count": 14, "id": "e6d577bd-9936-4d45-be90-345af2eb4827", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Build LDA benchmark model\n", "lda_model = gensim.models.LdaMulticore(corpus=docs_vecs,\n", " id2word=doc_dict,\n", " num_topics=4, \n", " random_state=42,\n", " chunksize=100,\n", " passes=10,\n", " per_word_topics=True)" ] }, { "cell_type": "code", "execution_count": 15, "id": "c4f1521f-5f43-40d2-a3a3-a8ac2ca6fec2", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(0,\n", " '0.042*\"of\" + 0.030*\"in\" + 0.026*\"be\" + 0.012*\"china\" + 0.012*\"on\" + '\n", " '0.011*\"s\" + 0.011*\"for\" + 0.011*\"have\" + 0.010*\"that\" + 0.009*\"as\"'),\n", " (1,\n", " '0.028*\"be\" + 0.028*\"of\" + 0.018*\"in\" + 0.011*\"de\" + 0.009*\"y\" + 0.009*\"for\" '\n", " '+ 0.008*\"by\" + 0.007*\"http\" + 0.007*\"on\" + 0.007*\"have\"'),\n", " (2,\n", " '0.041*\"be\" + 0.033*\"of\" + 0.028*\"in\" + 0.015*\"on\" + 0.013*\"have\" + '\n", " '0.013*\"for\" + 0.009*\"at\" + 0.009*\"say\" + 0.009*\"as\" + 0.008*\"with\"'),\n", " (3,\n", " '0.039*\"be\" + 0.029*\"of\" + 0.026*\"in\" + 0.018*\"port\" + 0.013*\"for\" + '\n", " '0.011*\"have\" + 0.009*\"on\" + 0.009*\"with\" + 0.009*\"day\" + 0.008*\"from\"')]\n" ] } ], "source": [ "from pprint import pprint\n", "\n", "# Print the top 10 keywords of each of the 4 topics\n", "pprint(lda_model.print_topics())\n", "doc_lda = lda_model[docs_vecs]" ] }, { "cell_type": "code", "execution_count": 16, "id": "fd57b1f4-a6cd-41e8-964f-d8a1d30aa3c9", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Coherence Score LDAModel: 0.2757800922361482\n" ] } ], "source": [ "# 
Compute Benchmark Coherence Score\n", "coherence_model_lda = CoherenceModel(model=lda_model, texts=headline, dictionary=doc_dict, coherence='c_v')\n", "coherence_lda = coherence_model_lda.get_coherence()\n", "print('\\nCoherence Score LDAModel: ', coherence_lda)" ] }, { "cell_type": "code", "execution_count": 17, "id": "152e5a3a-7afe-4fb8-a02f-d7492ad80936", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Perplexity for LDAModel: -7.407722316633226\n" ] } ], "source": [ "# Compute Benchmark Perplexity\n", "perplex= lda_model.log_perplexity(docs_vecs, total_docs=None) #For LDAModel\n", "# NOTE: log_perplexity returns the per-word likelihood bound, not the perplexity itself.\n", "# perplexity = 2**(-bound), so a higher (less negative) bound means a better fit.\n", "\n", "print('\\nPerplexity for LDAModel: ', perplex)" ] }, { "cell_type": "code", "execution_count": 18, "id": "7dd3a60a-5c6f-4249-9868-30528a5b0ac8", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", 
"/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may 
lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n", "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n", " pid = os.fork()\n" ] } ], "source": [ "from pprint import pprint\n", "import pyLDAvis\n", "import pyLDAvis.gensim_models as gensimvis\n", "\n", "# feed the LDA model into the pyLDAvis instance\n", "pyLDAvis.enable_notebook()\n", "visual= gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n", "\n", "# Save the output to the html file\n", "pyLDAvis.save_html(visual, \"data/topic_viz_benchmark_minor.html\")" ] }, { "cell_type": "markdown", "id": "1895598f-3e5f-4acd-83a6-4491cc90f695", "metadata": {}, "source": [ "# Hyperparameter Tuning and Evaluation" ] }, { "cell_type": "markdown", "id": "47136c89-ff7b-4ac9-840f-04122fe62160", "metadata": {}, "source": [ "Run the cells below only when re-modelling with a new dataset; the whole tuning and evaluation process may take hours to run." 
] }, { "cell_type": "code", "execution_count": 19, "id": "c79ca5c4-e078-43ce-a430-8c1ed93dcd64", "metadata": {}, "outputs": [], "source": [ "# hyperparameter tuning (alpha and beta)\n", "def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):\n", "    \"\"\"Train one LDA model and score it on the data it was given.\n", "\n", "    Args:\n", "        corpus: bag-of-words vectors used for training and for the bound.\n", "        dictionary: gensim Dictionary that produced `corpus`.\n", "        k: number of topics.\n", "        a: document-topic prior (alpha), a float or 'symmetric'/'asymmetric'.\n", "        b: topic-word prior (eta), a float or 'symmetric'.\n", "        texts: tokenised documents for c_v coherence; defaults to the\n", "            notebook-level `headline` so five-argument calls still work.\n", "\n", "    Returns:\n", "        (coherence, bound): c_v coherence and gensim's per-word likelihood\n", "        bound from `log_perplexity` (higher, i.e. closer to 0, is better).\n", "    \"\"\"\n", "    if texts is None:\n", "        texts = headline\n", "\n", "    model = gensim.models.LdaMulticore(corpus=corpus,\n", "                                       id2word=dictionary,\n", "                                       num_topics=k,\n", "                                       random_state=42,\n", "                                       chunksize=100,\n", "                                       passes=10,\n", "                                       alpha=a,\n", "                                       eta=b)\n", "\n", "    # Score with the arguments rather than the notebook globals, so the\n", "    # metrics always describe the corpus/dictionary the model was trained on.\n", "    coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')\n", "    coherence = coherence_model.get_coherence()\n", "    perplex = model.log_perplexity(corpus, total_docs=None)\n", "\n", "    return coherence, perplex" ] }, { "cell_type": "code", "execution_count": 20, "id": "1c3c8478-9336-40f2-bb30-a37db4243b67", "metadata": {}, "outputs": [], "source": [ "# setup\n", "import numpy as np\n", "\n", "from gensim.models import CoherenceModel\n", "\n", "model_list = []\n", "coherence_values = []\n", "perplexity_values = []\n", "model_topics = []\n", "alpha_result = []\n", "beta_result = []\n", "\n", "# topic ranges\n", "num_topics = range(4, 13)\n", "\n", "# Alpha parameter\n", "alpha = list(np.arange(0.31, 1, 0.3))\n", "alpha.append('symmetric')\n", "alpha.append('asymmetric')\n", "\n", "# Beta parameter\n", "beta = list(np.arange(0.31, 1, 0.3))\n", "beta.append('symmetric')" ] }, { "cell_type": "markdown", "id": "c7e6bc53-0b57-4858-879a-644eca54ddbc", "metadata": {}, "source": [ "Rationale behind the alpha and eta: https://stats.stackexchange.com/questions/37405/natural-interpretation-for-lda-hyperparameters" ] }, { "cell_type": "code", "execution_count": 21, "id": "02877b81-32df-4168-8e62-4cbca2be100b", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Topic range: range(4, 13)\n", "Alpha: [0.31, 0.61, 0.9099999999999999, 'symmetric', 'asymmetric']\n", "Beta: [0.31, 0.61, 0.9099999999999999, 'symmetric']\n" ] } ], 
"source": [ "print(\"Topic range: \",num_topics)\n", "print(\"Alpha: \",alpha)\n", "print(\"Beta: \", beta)" ] }, { "cell_type": "code", "execution_count": 23, "id": "3c1f703c-4778-467f-a12e-0c18eeb274c5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2024-06-30 11:52:46.026823\n", "#Topics: 4, CV Score: 0.26143167370104753, PV Score: -7.394241913115894, Alpha: 0.31, Beta: 0.31\n", "#Topics: 5, CV Score: 0.29008917581081944, PV Score: -7.35821379289522, Alpha: 0.31, Beta: 0.31\n", "#Topics: 6, CV Score: 0.3075550457683199, PV Score: -7.348746753666222, Alpha: 0.31, Beta: 0.31\n", "#Topics: 7, CV Score: 0.27068500671158163, PV Score: -7.332448354134598, Alpha: 0.31, Beta: 0.31\n", "#Topics: 8, CV Score: 0.2949752069963174, PV Score: -7.355618761173559, Alpha: 0.31, Beta: 0.31\n", "#Topics: 9, CV Score: 0.2883770109840239, PV Score: -7.345756037675643, Alpha: 0.31, Beta: 0.31\n", "#Topics: 10, CV Score: 0.30169838729877146, PV Score: -7.334658077515814, Alpha: 0.31, Beta: 0.31\n", "#Topics: 11, CV Score: 0.28399502160009293, PV Score: -7.337863652882048, Alpha: 0.31, Beta: 0.31\n", "#Topics: 12, CV Score: 0.3098908373731854, PV Score: -7.317278302545894, Alpha: 0.31, Beta: 0.31\n", "#Topics: 4, CV Score: 0.26324058496974073, PV Score: -7.421960949456933, Alpha: 0.31, Beta: 0.61\n", "#Topics: 5, CV Score: 0.28653242913009425, PV Score: -7.406328200019341, Alpha: 0.31, Beta: 0.61\n", "#Topics: 6, CV Score: 0.29614945599419024, PV Score: -7.405829958529572, Alpha: 0.31, Beta: 0.61\n", "#Topics: 7, CV Score: 0.26469167967575336, PV Score: -7.409695513631366, Alpha: 0.31, Beta: 0.61\n", "#Topics: 8, CV Score: 0.29743885397540426, PV Score: -7.416190442593165, Alpha: 0.31, Beta: 0.61\n", "#Topics: 9, CV Score: 0.29464381439032383, PV Score: -7.402408965099985, Alpha: 0.31, Beta: 0.61\n", "#Topics: 10, CV Score: 0.34823725193043276, PV Score: -7.398638060374257, Alpha: 0.31, Beta: 0.61\n", "#Topics: 11, CV Score: 0.31843503124612416, 
PV Score: -7.4064374372258275, Alpha: 0.31, Beta: 0.61\n", "#Topics: 12, CV Score: 0.31444697036706293, PV Score: -7.405945850874236, Alpha: 0.31, Beta: 0.61\n", "#Topics: 4, CV Score: 0.2712735526187385, PV Score: -7.4600931582453756, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 5, CV Score: 0.30311968074865997, PV Score: -7.460908353343469, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 6, CV Score: 0.29829568606376594, PV Score: -7.453422852489877, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 7, CV Score: 0.29621387204862615, PV Score: -7.46166337203672, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 8, CV Score: 0.32127327516432214, PV Score: -7.467013507843407, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 9, CV Score: 0.31985991254557, PV Score: -7.431396910423797, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 10, CV Score: 0.3224915541388905, PV Score: -7.448438398378084, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 11, CV Score: 0.3381018616555288, PV Score: -7.457876704009056, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 12, CV Score: 0.30836100224079815, PV Score: -7.45460656674408, Alpha: 0.31, Beta: 0.9099999999999999\n", "#Topics: 4, CV Score: 0.2643367431201753, PV Score: -7.390794744679982, Alpha: 0.31, Beta: symmetric\n", "#Topics: 5, CV Score: 0.29008917581081944, PV Score: -7.341357499253523, Alpha: 0.31, Beta: symmetric\n", "#Topics: 6, CV Score: 0.27735837350398757, PV Score: -7.323864546372206, Alpha: 0.31, Beta: symmetric\n", "#Topics: 7, CV Score: 0.27207166533136756, PV Score: -7.298831876196072, Alpha: 0.31, Beta: symmetric\n", "#Topics: 8, CV Score: 0.29681559070931296, PV Score: -7.286549304258785, Alpha: 0.31, Beta: symmetric\n", "#Topics: 9, CV Score: 0.29778017634331727, PV Score: -7.271904110257489, Alpha: 0.31, Beta: symmetric\n", "#Topics: 10, CV Score: 0.3012188333925358, PV Score: -7.263682235965553, Alpha: 0.31, Beta: symmetric\n", "#Topics: 11, CV Score: 0.30281626874462, PV 
Score: -7.267991091418852, Alpha: 0.31, Beta: symmetric\n", "#Topics: 12, CV Score: 0.31307471307490264, PV Score: -7.263340871172593, Alpha: 0.31, Beta: symmetric\n", "#Topics: 4, CV Score: 0.2631472484692373, PV Score: -7.391225160526899, Alpha: 0.61, Beta: 0.31\n", "#Topics: 5, CV Score: 0.28117696184010554, PV Score: -7.358283237848617, Alpha: 0.61, Beta: 0.31\n", "#Topics: 6, CV Score: 0.2946692074353743, PV Score: -7.350030009796271, Alpha: 0.61, Beta: 0.31\n", "#Topics: 7, CV Score: 0.2695848086251494, PV Score: -7.332704133455395, Alpha: 0.61, Beta: 0.31\n", "#Topics: 8, CV Score: 0.2873386785526998, PV Score: -7.365743826339281, Alpha: 0.61, Beta: 0.31\n", "#Topics: 9, CV Score: 0.28209952470043215, PV Score: -7.352132999419965, Alpha: 0.61, Beta: 0.31\n", "#Topics: 10, CV Score: 0.29783175477393864, PV Score: -7.343942952645343, Alpha: 0.61, Beta: 0.31\n", "#Topics: 11, CV Score: 0.2786007151015459, PV Score: -7.345863425524115, Alpha: 0.61, Beta: 0.31\n", "#Topics: 12, CV Score: 0.31534621656869705, PV Score: -7.331805220457104, Alpha: 0.61, Beta: 0.31\n", "#Topics: 4, CV Score: 0.2505923687565112, PV Score: -7.416873289619847, Alpha: 0.61, Beta: 0.61\n", "#Topics: 5, CV Score: 0.28706700596105156, PV Score: -7.403540595451488, Alpha: 0.61, Beta: 0.61\n", "#Topics: 6, CV Score: 0.29473065024977163, PV Score: -7.4106494490711015, Alpha: 0.61, Beta: 0.61\n", "#Topics: 7, CV Score: 0.26820275442412866, PV Score: -7.412576639292029, Alpha: 0.61, Beta: 0.61\n", "#Topics: 8, CV Score: 0.29025456691114637, PV Score: -7.41531726726418, Alpha: 0.61, Beta: 0.61\n", "#Topics: 9, CV Score: 0.29873861091584003, PV Score: -7.410097381301386, Alpha: 0.61, Beta: 0.61\n", "#Topics: 10, CV Score: 0.33489681733700966, PV Score: -7.414464176932596, Alpha: 0.61, Beta: 0.61\n", "#Topics: 11, CV Score: 0.3136515342313888, PV Score: -7.414824058449246, Alpha: 0.61, Beta: 0.61\n", "#Topics: 12, CV Score: 0.29901180359094387, PV Score: -7.425251302616095, Alpha: 0.61, Beta: 
0.61\n", "#Topics: 4, CV Score: 0.25606628747612015, PV Score: -7.461432644283255, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 5, CV Score: 0.2981919886915416, PV Score: -7.461173440540555, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 6, CV Score: 0.3155490657279846, PV Score: -7.456975825811057, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 7, CV Score: 0.3083085761113492, PV Score: -7.463721193947859, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 8, CV Score: 0.2987637794108595, PV Score: -7.463974937352514, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 9, CV Score: 0.30637199754319, PV Score: -7.447224084336919, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 10, CV Score: 0.34064164600584956, PV Score: -7.4608295203390345, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 11, CV Score: 0.33431924954293984, PV Score: -7.468074450247606, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 12, CV Score: 0.3021823876546624, PV Score: -7.474199589973487, Alpha: 0.61, Beta: 0.9099999999999999\n", "#Topics: 4, CV Score: 0.2592664104701984, PV Score: -7.380958290626252, Alpha: 0.61, Beta: symmetric\n", "#Topics: 5, CV Score: 0.29008917581081944, PV Score: -7.34406277883799, Alpha: 0.61, Beta: symmetric\n", "#Topics: 6, CV Score: 0.2856139513419668, PV Score: -7.332074766991939, Alpha: 0.61, Beta: symmetric\n", "#Topics: 7, CV Score: 0.2721558660968534, PV Score: -7.297199516455353, Alpha: 0.61, Beta: symmetric\n", "#Topics: 8, CV Score: 0.28791066408428134, PV Score: -7.314898967963879, Alpha: 0.61, Beta: symmetric\n", "#Topics: 9, CV Score: 0.2972722790111482, PV Score: -7.283806173931616, Alpha: 0.61, Beta: symmetric\n", "#Topics: 10, CV Score: 0.3082121035138333, PV Score: -7.29151183400522, Alpha: 0.61, Beta: symmetric\n", "#Topics: 11, CV Score: 0.30375150084375463, PV Score: -7.280134176808763, Alpha: 0.61, Beta: symmetric\n", "#Topics: 12, CV Score: 0.3026143223444013, PV Score: -7.277233833271221, Alpha: 0.61, Beta: 
symmetric\n", "#Topics: 4, CV Score: 0.2587849246611012, PV Score: -7.391745121978334, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 5, CV Score: 0.29232718870337143, PV Score: -7.359033621013408, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 6, CV Score: 0.298114743775785, PV Score: -7.351153460394994, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 7, CV Score: 0.26903864999105614, PV Score: -7.334371538653122, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 8, CV Score: 0.28622468428206693, PV Score: -7.372271837301099, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 9, CV Score: 0.29758804437495606, PV Score: -7.358497856224319, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 10, CV Score: 0.29746253535912703, PV Score: -7.343247363065808, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 11, CV Score: 0.2842073825386278, PV Score: -7.3531033741821386, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 12, CV Score: 0.32301209366405703, PV Score: -7.344357219935557, Alpha: 0.9099999999999999, Beta: 0.31\n", "#Topics: 4, CV Score: 0.2620085857568131, PV Score: -7.431829725624846, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 5, CV Score: 0.3148122601459173, PV Score: -7.4118133900365955, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 6, CV Score: 0.29369633574952747, PV Score: -7.411592442393478, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 7, CV Score: 0.2702663182798373, PV Score: -7.408869634701454, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 8, CV Score: 0.29626015190055904, PV Score: -7.4218524805240165, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 9, CV Score: 0.3029984127461481, PV Score: -7.413846845692629, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 10, CV Score: 0.3328224542610766, PV Score: -7.42022182252732, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 11, CV Score: 0.30475127610761077, PV Score: -7.422669034180523, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 12, 
CV Score: 0.30231311065405014, PV Score: -7.434846448528838, Alpha: 0.9099999999999999, Beta: 0.61\n", "#Topics: 4, CV Score: 0.258672617830039, PV Score: -7.469954125650798, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 5, CV Score: 0.3141300753700526, PV Score: -7.465439773550294, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 6, CV Score: 0.33646277127762847, PV Score: -7.459430389322742, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 7, CV Score: 0.2989500059387701, PV Score: -7.464860494675141, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 8, CV Score: 0.3067664143272131, PV Score: -7.460823334723379, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 9, CV Score: 0.32230471519058956, PV Score: -7.462203609695669, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 10, CV Score: 0.355651395118116, PV Score: -7.466906313924779, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 11, CV Score: 0.3074563046801128, PV Score: -7.4762246058984125, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 12, CV Score: 0.292184655691141, PV Score: -7.48462689762355, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n", "#Topics: 4, CV Score: 0.27425502082773556, PV Score: -7.381959277922089, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 5, CV Score: 0.2792475123330335, PV Score: -7.340967173529193, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 6, CV Score: 0.2748288743412546, PV Score: -7.338412973991607, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 7, CV Score: 0.27272711338157735, PV Score: -7.304992530741422, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 8, CV Score: 0.27875956141406333, PV Score: -7.310583890409706, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 9, CV Score: 0.2958113176625343, PV Score: -7.290099223904081, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 
10, CV Score: 0.30039391830183415, PV Score: -7.29361844332684, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 11, CV Score: 0.30238837300731736, PV Score: -7.293498894544746, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 12, CV Score: 0.30057296940845457, PV Score: -7.286728893856696, Alpha: 0.9099999999999999, Beta: symmetric\n", "#Topics: 4, CV Score: 0.28045576981660336, PV Score: -7.3810882171558, Alpha: symmetric, Beta: 0.31\n", "#Topics: 5, CV Score: 0.27516707115009786, PV Score: -7.348322538492822, Alpha: symmetric, Beta: 0.31\n", "#Topics: 6, CV Score: 0.30174281503830513, PV Score: -7.357661750753335, Alpha: symmetric, Beta: 0.31\n", "#Topics: 7, CV Score: 0.26452009940043947, PV Score: -7.332995538212589, Alpha: symmetric, Beta: 0.31\n", "#Topics: 8, CV Score: 0.27957941981373957, PV Score: -7.34520072112036, Alpha: symmetric, Beta: 0.31\n", "#Topics: 9, CV Score: 0.2921761447063516, PV Score: -7.3174465018975985, Alpha: symmetric, Beta: 0.31\n", "#Topics: 10, CV Score: 0.2819456892021096, PV Score: -7.3271012408060425, Alpha: symmetric, Beta: 0.31\n", "#Topics: 11, CV Score: 0.2858145693331318, PV Score: -7.324243303702109, Alpha: symmetric, Beta: 0.31\n", "#Topics: 12, CV Score: 0.3162268304900056, PV Score: -7.301149412893058, Alpha: symmetric, Beta: 0.31\n", "#Topics: 4, CV Score: 0.26200858575681313, PV Score: -7.435025163734385, Alpha: symmetric, Beta: 0.61\n", "#Topics: 5, CV Score: 0.28653242913009425, PV Score: -7.402808127824116, Alpha: symmetric, Beta: 0.61\n", "#Topics: 6, CV Score: 0.30034116357398466, PV Score: -7.403041768564758, Alpha: symmetric, Beta: 0.61\n", "#Topics: 7, CV Score: 0.27713631118988463, PV Score: -7.401296493610201, Alpha: symmetric, Beta: 0.61\n", "#Topics: 8, CV Score: 0.305005574283436, PV Score: -7.403063894303076, Alpha: symmetric, Beta: 0.61\n", "#Topics: 9, CV Score: 0.319284842602941, PV Score: -7.381097776990529, Alpha: symmetric, Beta: 0.61\n", "#Topics: 10, CV Score: 0.3187835569771548, 
PV Score: -7.392442067244151, Alpha: symmetric, Beta: 0.61\n", "#Topics: 11, CV Score: 0.31619116554114374, PV Score: -7.398110003036577, Alpha: symmetric, Beta: 0.61\n", "#Topics: 12, CV Score: 0.30620936485014044, PV Score: -7.389060738256993, Alpha: symmetric, Beta: 0.61\n", "#Topics: 4, CV Score: 0.25861742292536233, PV Score: -7.463031931556786, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 5, CV Score: 0.30612131816590366, PV Score: -7.459187288641012, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 6, CV Score: 0.29856036681931464, PV Score: -7.451419283754892, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 7, CV Score: 0.2963932433750025, PV Score: -7.451642157595998, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 8, CV Score: 0.3175863739616889, PV Score: -7.460112275446344, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 9, CV Score: 0.30091910989824133, PV Score: -7.423042776887486, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 10, CV Score: 0.3221986755554972, PV Score: -7.4413791780149126, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 11, CV Score: 0.3400624797510634, PV Score: -7.4475561490332405, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 12, CV Score: 0.29577010557935146, PV Score: -7.438896552351397, Alpha: symmetric, Beta: 0.9099999999999999\n", "#Topics: 4, CV Score: 0.2875150010691004, PV Score: -7.398657443181138, Alpha: symmetric, Beta: symmetric\n", "#Topics: 5, CV Score: 0.2900891758108194, PV Score: -7.337054653335996, Alpha: symmetric, Beta: symmetric\n", "#Topics: 6, CV Score: 0.2872501529046281, PV Score: -7.319444085859869, Alpha: symmetric, Beta: symmetric\n", "#Topics: 7, CV Score: 0.27501057403010526, PV Score: -7.290723834382987, Alpha: symmetric, Beta: symmetric\n", "#Topics: 8, CV Score: 0.2923043477830799, PV Score: -7.2829244272999825, Alpha: symmetric, Beta: symmetric\n", "#Topics: 9, CV Score: 0.2978356724583504, PV Score: -7.263979371448642, 
Alpha: symmetric, Beta: symmetric\n", "#Topics: 10, CV Score: 0.3098726690096235, PV Score: -7.249251508569862, Alpha: symmetric, Beta: symmetric\n", "#Topics: 11, CV Score: 0.31266100984189077, PV Score: -7.252673302755199, Alpha: symmetric, Beta: symmetric\n", "#Topics: 12, CV Score: 0.31275264383294515, PV Score: -7.254777083051176, Alpha: symmetric, Beta: symmetric\n", "#Topics: 4, CV Score: 0.2602975588122612, PV Score: -7.381821385441839, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 5, CV Score: 0.2792475123330335, PV Score: -7.350616172255018, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 6, CV Score: 0.3015492641821192, PV Score: -7.349068241683937, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 7, CV Score: 0.2668593619258581, PV Score: -7.334972757155578, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 8, CV Score: 0.2916111243636439, PV Score: -7.3504381054918735, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 9, CV Score: 0.3104237788212182, PV Score: -7.3330296284925875, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 10, CV Score: 0.3282155213609229, PV Score: -7.336602404389545, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 11, CV Score: 0.2878177794271231, PV Score: -7.319097172570247, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 12, CV Score: 0.309148501118687, PV Score: -7.299840665764544, Alpha: asymmetric, Beta: 0.31\n", "#Topics: 4, CV Score: 0.2643914598991108, PV Score: -7.439997400522905, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 5, CV Score: 0.30680350294176834, PV Score: -7.406369360079358, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 6, CV Score: 0.30273133375875494, PV Score: -7.409977374651039, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 7, CV Score: 0.2895606942065561, PV Score: -7.404088495718267, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 8, CV Score: 0.30220942123026345, PV Score: -7.41654700444938, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 9, CV Score: 0.30085511300217477, PV Score: -7.3893922736333995, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 
10, CV Score: 0.32576991286555973, PV Score: -7.400148584021936, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 11, CV Score: 0.30570350016100467, PV Score: -7.3930680136359115, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 12, CV Score: 0.27592390657639637, PV Score: -7.390687284583513, Alpha: asymmetric, Beta: 0.61\n", "#Topics: 4, CV Score: 0.26415862588678707, PV Score: -7.4660902988837075, Alpha: asymmetric, Beta: 0.9099999999999999\n", "#Topics: 5, CV Score: 0.3031196807486601, PV Score: -7.459922366408937, Alpha: asymmetric, Beta: 0.9099999999999999\n", "#Topics: 6, CV Score: 0.28267000810499315, PV Score: -7.446406611219757, Alpha: asymmetric, Beta: 0.9099999999999999\n", "#Topics: 7, CV Score: 0.30409252212111715, PV Score: -7.458109944307844, Alpha: asymmetric, Beta: 0.9099999999999999\n", "#Topics: 8, CV Score: 0.3186545711929555, PV Score: -7.460394137450762, Alpha: asymmetric, Beta: 0.9099999999999999\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "File \u001b[0;32m:10\u001b[0m\n", "Cell \u001b[0;32mIn[19], line 4\u001b[0m, in \u001b[0;36mcompute_coherence_values\u001b[0;34m(corpus, dictionary, k, a, b)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_coherence_values\u001b[39m(corpus, dictionary, k, a, b):\n\u001b[0;32m----> 4\u001b[0m lda_model \u001b[38;5;241m=\u001b[39m \u001b[43mgensim\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodels\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mLdaMulticore\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcorpus\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcorpus\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mid2word\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdictionary\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_topics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m42\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m100\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mpasses\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43malpha\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43ma\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43meta\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 13\u001b[0m coherence_model_lda \u001b[38;5;241m=\u001b[39m CoherenceModel(model\u001b[38;5;241m=\u001b[39mlda_model, texts\u001b[38;5;241m=\u001b[39mheadline, dictionary\u001b[38;5;241m=\u001b[39mdoc_dict, coherence\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc_v\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 14\u001b[0m coherence \u001b[38;5;241m=\u001b[39m coherence_model_lda\u001b[38;5;241m.\u001b[39mget_coherence()\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/ldamulticore.py:186\u001b[0m, in \u001b[0;36mLdaMulticore.__init__\u001b[0;34m(self, corpus, num_topics, id2word, workers, chunksize, passes, batch, alpha, eta, decay, offset, eval_every, iterations, gamma_threshold, random_state, minimum_probability, minimum_phi_value, per_word_topics, dtype)\u001b[0m\n\u001b[1;32m 183\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(alpha, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m alpha \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mauto\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mauto-tuning alpha not implemented in LdaMulticore; use plain LdaModel.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 186\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mLdaMulticore\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 187\u001b[0m \u001b[43m \u001b[49m\u001b[43mcorpus\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcorpus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_topics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_topics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 188\u001b[0m \u001b[43m \u001b[49m\u001b[43mid2word\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mid2word\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunksize\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpasses\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpasses\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43malpha\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43malpha\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meta\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 189\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecay\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43moffset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meval_every\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meval_every\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43miterations\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43miterations\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 190\u001b[0m \u001b[43m \u001b[49m\u001b[43mgamma_threshold\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgamma_threshold\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrandom_state\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mminimum_probability\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimum_probability\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 191\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimum_phi_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimum_phi_value\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mper_word_topics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mper_word_topics\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 192\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/ldamodel.py:521\u001b[0m, in \u001b[0;36mLdaModel.__init__\u001b[0;34m(self, corpus, num_topics, id2word, distributed, chunksize, passes, update_every, alpha, eta, decay, offset, eval_every, iterations, gamma_threshold, minimum_probability, random_state, ns_conf, minimum_phi_value, per_word_topics, callbacks, dtype)\u001b[0m\n\u001b[1;32m 519\u001b[0m use_numpy \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdispatcher \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 520\u001b[0m start \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 521\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mupdate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcorpus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunks_as_numpy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_numpy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 522\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39madd_lifecycle_event(\n\u001b[1;32m 523\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcreated\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 524\u001b[0m msg\u001b[38;5;241m=\u001b[39m\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrained \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtime\u001b[38;5;241m.\u001b[39mtime()\u001b[38;5;250m \u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;250m \u001b[39mstart\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m.2f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 525\u001b[0m )\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/ldamulticore.py:316\u001b[0m, in \u001b[0;36mLdaMulticore.update\u001b[0;34m(self, corpus, chunks_as_numpy)\u001b[0m\n\u001b[1;32m 312\u001b[0m \u001b[38;5;66;03m# endfor single corpus pass\u001b[39;00m\n\u001b[1;32m 313\u001b[0m \n\u001b[1;32m 314\u001b[0m \u001b[38;5;66;03m# wait for all outstanding jobs to finish\u001b[39;00m\n\u001b[1;32m 315\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m queue_size[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m--> 316\u001b[0m 
\u001b[43mprocess_result_queue\u001b[49m\u001b[43m(\u001b[49m\u001b[43mforce\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reallen \u001b[38;5;241m!=\u001b[39m lencorpus:\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput corpus size changed during training (don\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt use generators as input)\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/ldamulticore.py:274\u001b[0m, in \u001b[0;36mLdaMulticore.update..process_result_queue\u001b[0;34m(force)\u001b[0m\n\u001b[1;32m 268\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 269\u001b[0m \u001b[38;5;124;03mClear the result queue, merging all intermediate results, and update the\u001b[39;00m\n\u001b[1;32m 270\u001b[0m \u001b[38;5;124;03mLDA model if necessary.\u001b[39;00m\n\u001b[1;32m 271\u001b[0m \n\u001b[1;32m 272\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 273\u001b[0m merged_new \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m--> 274\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mresult_queue\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mempty\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 275\u001b[0m other\u001b[38;5;241m.\u001b[39mmerge(result_queue\u001b[38;5;241m.\u001b[39mget())\n\u001b[1;32m 276\u001b[0m queue_size[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/queues.py:129\u001b[0m, in \u001b[0;36mQueue.empty\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 
128\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mempty\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 129\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_poll\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/connection.py:257\u001b[0m, in \u001b[0;36m_ConnectionBase.poll\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_closed()\n\u001b[1;32m 256\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_readable()\n\u001b[0;32m--> 257\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_poll\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/connection.py:440\u001b[0m, in \u001b[0;36mConnection._poll\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_poll\u001b[39m(\u001b[38;5;28mself\u001b[39m, timeout):\n\u001b[0;32m--> 440\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 441\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mbool\u001b[39m(r)\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/connection.py:1130\u001b[0m, in \u001b[0;36mwait\u001b[0;34m(object_list, timeout)\u001b[0m\n\u001b[1;32m 1128\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _WaitSelector() \u001b[38;5;28;01mas\u001b[39;00m selector:\n\u001b[1;32m 1129\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m 
obj \u001b[38;5;129;01min\u001b[39;00m object_list:\n\u001b[0;32m-> 1130\u001b[0m \u001b[43mselector\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mregister\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselectors\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mEVENT_READ\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1132\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1133\u001b[0m deadline \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mmonotonic() \u001b[38;5;241m+\u001b[39m timeout\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/selectors.py:352\u001b[0m, in \u001b[0;36m_PollLikeSelector.register\u001b[0;34m(self, fileobj, events, data)\u001b[0m\n\u001b[1;32m 351\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mregister\u001b[39m(\u001b[38;5;28mself\u001b[39m, fileobj, events, data\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[0;32m--> 352\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mregister\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfileobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 353\u001b[0m poller_events \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 354\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m events \u001b[38;5;241m&\u001b[39m EVENT_READ:\n", "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/selectors.py:238\u001b[0m, in \u001b[0;36m_BaseSelectorImpl.register\u001b[0;34m(self, fileobj, events, data)\u001b[0m\n\u001b[1;32m 235\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\u001b[38;5;129;01mnot\u001b[39;00m events) 
\u001b[38;5;129;01mor\u001b[39;00m (events \u001b[38;5;241m&\u001b[39m \u001b[38;5;241m~\u001b[39m(EVENT_READ \u001b[38;5;241m|\u001b[39m EVENT_WRITE)):\n\u001b[1;32m 236\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid events: \u001b[39m\u001b[38;5;132;01m{!r}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(events))\n\u001b[0;32m--> 238\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[43mSelectorKey\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfileobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fileobj_lookup\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfileobj\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 240\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key\u001b[38;5;241m.\u001b[39mfd \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fd_to_key:\n\u001b[1;32m 241\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{!r}\u001b[39;00m\u001b[38;5;124m (FD \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m) is already registered\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;241m.\u001b[39mformat(fileobj, key\u001b[38;5;241m.\u001b[39mfd))\n", "File \u001b[0;32m:1\u001b[0m, in \u001b[0;36m\u001b[0;34m(_cls, fileobj, fd, events, data)\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "%%time\n", "\n", "import datetime\n", "import numpy as np\n", "from gensim.models import CoherenceModel\n", "\n", "print(datetime.datetime.now())\n", "\n", "for a in alpha:\n", " for b in beta:\n", " for num in num_topics:\n", " cv, pv = 
compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=num, a=a, b=b) \n", "\n", " model_topics.append(num) \n", " coherence_values.append(cv) \n", " perplexity_values.append(pv)\n", " alpha_result.append(a)\n", " beta_result.append(b)\n", " print(\"#Topics: \" + str(num) + \", CV Score: \" + str(coherence_values[-1]) + \", PV Score: \" + str(perplexity_values[-1]) + \", Alpha: \" + str(alpha_result[-1]) + \", Beta: \" + str(beta_result[-1]))\n", " \n", "print(datetime.datetime.now())" ] }, { "cell_type": "markdown", "id": "364ff6d5-e3da-4dde-a2c8-5375fc5d711f", "metadata": {}, "source": [ "The table below reveals the top 20 fine tuned models with best combinations of coherence score and perplexity score. It was sorted by the coherence score in descending order as a higher coherence score indicates a better model, and sorted the perplexity score in ascending order as a lower perplexity score indicates a better model. While coherence score evaluates the quality of the topics, the perplexity score evaluates the overall performance of the model in predicting new documents. Usually, the coherence score is a better metric to use if the goal is to obtain topics that are semantically coherent and interpretable. Perplexity score, on the other hand, is a better metric to use if the goal is to build a model that generalises well to new data, in other words, how confident the model is in predicting the new data (Sánchez-Aguayo, et al., 2022). Ultimately, we aim to get a balance between the perplexity value and coherence score when determining our final model." 
# Find the top 20 combinations based on Coherence Score and Perplexity Score
# (coherence descending first, perplexity score ascending as tie-breaker).
# NOTE(review): the logged PV scores are negative, which looks like a
# log-likelihood bound where higher is better — confirm the sort direction.
result = pd.DataFrame(
    {
        'Topics': model_topics,
        'Coherence Score': coherence_values,
        'Perplexity Score': perplexity_values,
        'Alpha': alpha_result,
        'Beta': beta_result,
    }
)
result.sort_values(by=['Coherence Score', 'Perplexity Score'], ascending=[False, True]).head(20)

# Persist the full (unsorted) grid-search table.
result.to_csv('data/lda_fine_tuning_result_minor.csv')

# Show graph Topics vs Coherence Score, one figure per Alpha value
result.groupby('Alpha').plot(x='Topics', y='Coherence Score', legend=True)

# Show graph Topics vs Coherence Score
# The label is attached to the line itself: passing ("Coherence Score") to
# plt.legend hands matplotlib a plain string (the parentheses do not make a
# tuple), and it then labels the line with the first character only.
plt.plot(model_topics, coherence_values, label="Coherence Score")
plt.xlabel("Num Topics")
plt.ylabel("Coherence Score")
plt.legend(loc='best')
plt.show()

# Show graph Topics vs Perplexity Score
plt.plot(model_topics, perplexity_values, label="perplexity_values")
plt.xlabel("Num Topics")
plt.ylabel("Perplexity score")
plt.legend(loc='best')
plt.show()
# realised that there may be some overlaps for 8 topics, thus 4-6 topics are optimal
# NOTE(review): k is set to 2 below, which is outside the 4-6 range stated
# above — confirm which one is intended.
k = 2
# a = 'asymmetric'
a = 0.31
# b = 0.31
b = 'symmetric'


final_model = gensim.models.LdaMulticore(corpus=docs_vecs,
                                         id2word=doc_dict,
                                         num_topics=k,
                                         random_state=42,
                                         chunksize=100,
                                         passes=10,
                                         alpha=a,
                                         eta=b)

# Score the chosen configuration with the same helper used for the grid search.
compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict, k=k, a=a, b=b)

# Set up the environment to display the graphical outputs
# feed the LDA model into the pyLDAvis instance
pyLDAvis.enable_notebook()
visual = gensimvis.prepare(final_model, docs_vecs, doc_dict)

# Save the output to the html file
pyLDAvis.save_html(visual, "data/topic_viz2_minor_training.html")

final_model.print_topics(num_words=30)

# Map the topic ID with appropriate topic names, this part should be updated accordingly whenever the model is updated
topic_mapping = {0: "finance", 1: "tech", 2: "education", 3: "sports", 4: "leisure"}

# Explicit checkpoint (replaces a bare `break` cell, which only stopped
# "Run All" by raising a SyntaxError): refuse to continue while the
# hand-written names do not cover exactly the topic ids of the fitted model.
if set(topic_mapping) != set(range(k)):
    raise ValueError(
        f"topic_mapping names topic ids {sorted(topic_mapping)} but the model has "
        f"topic ids {list(range(k))}; review the topics printed above and update "
        "topic_mapping before continuing."
    )

# Get the topics and their top keywords into a dataframe
topics = final_model.show_topics(num_words=30)

# Build the frame in one go (indexed by topic id) instead of growing an empty
# frame one cell at a time.
topic_keywords = pd.DataFrame(
    {'Topic Keywords': [keywords for _, keywords in topics]},
    index=[topic_id for topic_id, _ in topics],
)
topic_keywords['Topic ID'] = topic_keywords.index
# Look the name up per topic id rather than assigning the raw dict to a column.
topic_keywords['Topic Name'] = topic_keywords['Topic ID'].map(topic_mapping)
topic_keywords

#Save a model to disk, or reload a pre-trained model
# naming convention: final_model_topic_alpha_eta
# NOTE(review): the file name below does not match the k / a / b values set
# above — confirm the target path before overwriting an existing model.
final_model.save("models/final_model_5_asym_91")
def format_topics_sentences(ldamodel, corpus, data):
    """Return one row per document with its dominant topic and topic distribution.

    Uses the model's own per-document inference (``ldamodel[corpus]``); see
    https://radimrehurek.com/gensim/models/ldamodel.html

    Parameters
    ----------
    ldamodel
        Object for which ``ldamodel[corpus]`` yields, per document, an iterable
        of ``(topic_id, weight)`` pairs.
    corpus
        The vectorised documents handed to ``ldamodel[...]``.
    data
        The document texts, expected in the same order as ``corpus``. A pandas
        Series is attached by position, not by its index labels, so a filtered
        or re-ordered Series still lines up with the corpus.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic`` (int), ``Perc_Contribution`` (weight of the
        dominant topic, rounded to 4 decimals), ``Topic_Distribution`` (all
        pairs sorted by descending weight) and ``Text``. A document with no
        topic pairs keeps the placeholders ``0`` / ``0.0`` / ``()``.
    """
    dominant_topics = []
    contributions = []
    distributions = []

    for doc_topics in ldamodel[corpus]:
        # Strongest topic first.
        ranked = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
        if ranked:
            top_id, top_weight = ranked[0]
            dominant_topics.append(int(top_id))
            contributions.append(round(top_weight, 4))
            distributions.append(ranked)
        else:
            # No topic reported for this document: keep the placeholder values.
            dominant_topics.append(0)
            contributions.append(0.0)
            distributions.append(())

    sent_topics_df = pd.DataFrame({
        'Dominant_Topic': dominant_topics,
        'Perc_Contribution': contributions,
        'Topic_Distribution': distributions,
    })

    # Assigning a Series aligns on index labels. Texts taken from a filtered frame
    # (labels other than 0..n-1) would otherwise come out as NaN or land on the
    # wrong row, so drop the labels and attach by position.
    if isinstance(data, pd.Series):
        data = data.reset_index(drop=True)
    sent_topics_df['Text'] = data

    return sent_topics_df
# %% [markdown]
# # Result Analysis

# %%
df_dominant_topic["Dominant_Topic"].value_counts()

# %%
# Get value counts of each topic (pyplot is already imported in the top import cell).
topic_counts = df_dominant_topic["Dominant_Topic"].value_counts()

# Create a bar plot
plt.figure(figsize=(8, 6))
topic_counts.plot(kind="bar", color="skyblue")

# Write each count just above its bar; bars are drawn in `topic_counts` order,
# so the enumerate position is the bar's x coordinate.
for bar_pos, n_articles in enumerate(topic_counts):
    plt.text(bar_pos, n_articles, str(n_articles), ha="center", va="bottom")

# Add labels and title
plt.xlabel("Topics")
plt.ylabel("Number of News")
plt.title("Topic Distribution")

# Show the plot
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.tight_layout()
plt.show()

# %%
# Documents whose dominant topic has the lowest contribution.
df_dominant_topic.sort_values(by='Topic_Perc_Contrib', ascending=True).head(20)

# %%
# Sample up to 100 rows; change random_state for a different sample.
# Clamping to the frame length keeps the cell from raising when fewer than 100 rows exist.
sample_size = min(100, len(df_dominant_topic))
sampled_data = df_dominant_topic.sample(n=sample_size, random_state=42)
sampled_df = sampled_data.reset_index()
sampled_df.to_csv('data/sample_minor.csv')
"name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }