dh-mc committed on
Commit
0dec8a3
·
1 Parent(s): 1179be6

updated LFS

Browse files
.env.example ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY=
.gitattributes CHANGED
@@ -37,3 +37,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
37
  *.docx filter=lfs diff=lfs merge=lfs -text
38
  *.pptx filter=lfs diff=lfs merge=lfs -text
39
  *.xlsx filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
37
  *.docx filter=lfs diff=lfs merge=lfs -text
38
  *.pptx filter=lfs diff=lfs merge=lfs -text
39
  *.xlsx filter=lfs diff=lfs merge=lfs -text
40
+ data/label_list.xlsx filter=lfs diff=lfs merge=lfs -text
41
+ data/new_labels.xlsx filter=lfs diff=lfs merge=lfs -text
42
+ data/processed_data.parquet filter=lfs diff=lfs merge=lfs -text
43
+ data/scrapped_data1.parquet filter=lfs diff=lfs merge=lfs -text
44
+ data/scrapped_data2.csv filter=lfs diff=lfs merge=lfs -text
45
+ data/all_port_labelled.csv filter=lfs diff=lfs merge=lfs -text
46
+ data/cleaned_data.csv filter=lfs diff=lfs merge=lfs -text
47
+ data/cleaned_data_with_categories.csv filter=lfs diff=lfs merge=lfs -text
48
+ data/processed_data2.parquet filter=lfs diff=lfs merge=lfs -text
49
+ data/topic_viz_benchmark_moderate.html filter=lfs diff=lfs merge=lfs -text
50
+ data/topic_viz_benchmark_severe.html filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -148,3 +148,4 @@ qa_chain_test_results.csv
148
  data/logs/hftgi-2024-03-18.txt
149
  qa_chain_all_results.csv
150
  qa_batch_1_test_results.csv
 
 
148
  data/logs/hftgi-2024-03-18.txt
149
  qa_chain_all_results.csv
150
  qa_batch_1_test_results.csv
151
+ *.html
README.md CHANGED
@@ -1,12 +1,12 @@
1
- ---
2
- title: Global Incidents
3
- emoji: 💬
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 4.36.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
 
1
+ # DM Project
2
+
3
+ The project consists of three parts: classification, the GPT model and its evaluation (GPT Evaluation), and the news scraper with LDA modelling.
4
+
5
+ All the files can be run independently, without depending on one another or requiring any prior setup, except for those in the LDA folder.
6
+
7
+ For LDA,
8
+ the processed_data.parquet file, which contains the processed original data and is generated by the file basic_text_preprocessing, is used in topic_modelling_benchmark_using_headline;
9
+
10
+ the processed_data1.parquet file, which contains the processed scraped news content and is generated by the file basic_text_preprocessing_on_scraped_data, is used in the files topic_modelling_minor, topic_modelling_severe and topic_modelling_moderate.
11
+
12
+ However, be careful when running the GPT and news scraper files, as you may need your own API key for the GPT script to run properly. Also, the news scraper script takes a very long time to finish.
data/all_port_labelled.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f2a6c487806e6307d2763ffef90378092cbdd9682ed2a952cb129d5ec06f7a3
3
+ size 8767987
data/cleaned_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75438fa508739e03dcf9e9ad18d5fe2f01ffef0c9fd5b6b8caf9e0aff65aa18d
3
+ size 7998227
data/cleaned_data_with_categories.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be908d68560d66d238287fad1c3bacfb6e2d5a205b77822cc75ba9df57e55fa9
3
+ size 78688
data/label_list.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a47566ba13a2b4f97f0105bb0aba8f1abeecd5a1bf7f24af47cac44d4d579e5
3
+ size 6462
data/new_labels.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d2c27675a2408cfaa948f4030b19f67deb9fe72bfa3eb80f721ebb4386ea77a
3
+ size 10338
data/processed_data.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa419c0b692c27050df6059204dd9dc16570bcd2344b9be061ac30b06363112e
3
+ size 5081543
data/processed_data2.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41d62483447ae9377eb9c13ed383a09b6b0fa82b7f896c2d1033f6e9109b69d6
3
+ size 29146326
data/scrapped_data1.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57abf4a381c4d2501e154cd23bd1ca3200e08c446bee2eee14abde93204fcdb6
3
+ size 18114548
data/scrapped_data2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bd8a6b3ada152b3ebc8aae8b6036045b993f668015ccd73d675ecaa43954ce6
3
+ size 37502335
notebooks/01_Classification models on incident category.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/02_Classification models on maritime incident.ipynb ADDED
@@ -0,0 +1,2545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "e550a89c",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents\n",
14
+ "loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env\n"
15
+ ]
16
+ },
17
+ {
18
+ "data": {
19
+ "text/plain": [
20
+ "True"
21
+ ]
22
+ },
23
+ "execution_count": 1,
24
+ "metadata": {},
25
+ "output_type": "execute_result"
26
+ }
27
+ ],
28
+ "source": [
29
+ "%load_ext autoreload\n",
30
+ "%autoreload 2\n",
31
+ "\n",
32
+ "import os\n",
33
+ "import sys\n",
34
+ "from pathlib import Path\n",
35
+ "\n",
36
+ "workding_dir = str(Path.cwd().parent)\n",
37
+ "os.chdir(workding_dir)\n",
38
+ "sys.path.append(workding_dir)\n",
39
+ "print(\"workding dir:\", workding_dir)\n",
40
+ "\n",
41
+ "from dotenv import find_dotenv, load_dotenv\n",
42
+ "\n",
43
+ "found_dotenv = find_dotenv(\".env\")\n",
44
+ "\n",
45
+ "if len(found_dotenv) == 0:\n",
46
+ " found_dotenv = find_dotenv(\".env.example\")\n",
47
+ "print(f\"loading env vars from: {found_dotenv}\")\n",
48
+ "load_dotenv(found_dotenv, override=True)"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "markdown",
53
+ "id": "1fecbc87",
54
+ "metadata": {},
55
+ "source": [
56
+ "## Import Statement"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 2,
62
+ "id": "5169e3ee",
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "import pandas as pd"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "markdown",
71
+ "id": "76905f72",
72
+ "metadata": {},
73
+ "source": [
74
+ "### read the data"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 3,
80
+ "id": "b1043895",
81
+ "metadata": {},
82
+ "outputs": [],
83
+ "source": [
84
+ "df = pd.read_csv(\"data/all_port_labelled.csv\")"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 4,
90
+ "id": "2e40d90a",
91
+ "metadata": {
92
+ "scrolled": true
93
+ },
94
+ "outputs": [
95
+ {
96
+ "data": {
97
+ "text/html": [
98
+ "<div>\n",
99
+ "<style scoped>\n",
100
+ " .dataframe tbody tr th:only-of-type {\n",
101
+ " vertical-align: middle;\n",
102
+ " }\n",
103
+ "\n",
104
+ " .dataframe tbody tr th {\n",
105
+ " vertical-align: top;\n",
106
+ " }\n",
107
+ "\n",
108
+ " .dataframe thead th {\n",
109
+ " text-align: right;\n",
110
+ " }\n",
111
+ "</style>\n",
112
+ "<table border=\"1\" class=\"dataframe\">\n",
113
+ " <thead>\n",
114
+ " <tr style=\"text-align: right;\">\n",
115
+ " <th></th>\n",
116
+ " <th>Unnamed: 0</th>\n",
117
+ " <th>Index</th>\n",
118
+ " <th>Unnamed: 0.1</th>\n",
119
+ " <th>Headline</th>\n",
120
+ " <th>Details</th>\n",
121
+ " <th>Severity</th>\n",
122
+ " <th>Category</th>\n",
123
+ " <th>Region</th>\n",
124
+ " <th>Datetime</th>\n",
125
+ " <th>Year</th>\n",
126
+ " <th>...</th>\n",
127
+ " <th>IT</th>\n",
128
+ " <th>EP</th>\n",
129
+ " <th>NEW</th>\n",
130
+ " <th>CSD</th>\n",
131
+ " <th>RPE</th>\n",
132
+ " <th>MN</th>\n",
133
+ " <th>NM</th>\n",
134
+ " <th>if_labeled</th>\n",
135
+ " <th>Month</th>\n",
136
+ " <th>Week</th>\n",
137
+ " </tr>\n",
138
+ " </thead>\n",
139
+ " <tbody>\n",
140
+ " <tr>\n",
141
+ " <th>0</th>\n",
142
+ " <td>0.0</td>\n",
143
+ " <td>8.0</td>\n",
144
+ " <td>34.0</td>\n",
145
+ " <td>Grasberg Mine- Grasberg mine workers extend st...</td>\n",
146
+ " <td>Media sources indicate that workers at the Gra...</td>\n",
147
+ " <td>Moderate</td>\n",
148
+ " <td>Mine Workers Strike</td>\n",
149
+ " <td>Indonesia</td>\n",
150
+ " <td>28/5/17 17:08</td>\n",
151
+ " <td>2017.0</td>\n",
152
+ " <td>...</td>\n",
153
+ " <td>0.0</td>\n",
154
+ " <td>0.0</td>\n",
155
+ " <td>0.0</td>\n",
156
+ " <td>0.0</td>\n",
157
+ " <td>0.0</td>\n",
158
+ " <td>0.0</td>\n",
159
+ " <td>1.0</td>\n",
160
+ " <td>False</td>\n",
161
+ " <td>5.0</td>\n",
162
+ " <td>21.0</td>\n",
163
+ " </tr>\n",
164
+ " <tr>\n",
165
+ " <th>1</th>\n",
166
+ " <td>1.0</td>\n",
167
+ " <td>10.0</td>\n",
168
+ " <td>63.0</td>\n",
169
+ " <td>Indonesia: Undersea internet cables damaged by...</td>\n",
170
+ " <td>News sources are stating that recent typhoons ...</td>\n",
171
+ " <td>Minor</td>\n",
172
+ " <td>Travel Warning</td>\n",
173
+ " <td>Indonesia</td>\n",
174
+ " <td>4/9/17 14:30</td>\n",
175
+ " <td>2017.0</td>\n",
176
+ " <td>...</td>\n",
177
+ " <td>0.0</td>\n",
178
+ " <td>0.0</td>\n",
179
+ " <td>0.0</td>\n",
180
+ " <td>0.0</td>\n",
181
+ " <td>0.0</td>\n",
182
+ " <td>1.0</td>\n",
183
+ " <td>0.0</td>\n",
184
+ " <td>False</td>\n",
185
+ " <td>4.0</td>\n",
186
+ " <td>14.0</td>\n",
187
+ " </tr>\n",
188
+ " </tbody>\n",
189
+ "</table>\n",
190
+ "<p>2 rows × 46 columns</p>\n",
191
+ "</div>"
192
+ ],
193
+ "text/plain": [
194
+ " Unnamed: 0 Index Unnamed: 0.1 \\\n",
195
+ "0 0.0 8.0 34.0 \n",
196
+ "1 1.0 10.0 63.0 \n",
197
+ "\n",
198
+ " Headline \\\n",
199
+ "0 Grasberg Mine- Grasberg mine workers extend st... \n",
200
+ "1 Indonesia: Undersea internet cables damaged by... \n",
201
+ "\n",
202
+ " Details Severity \\\n",
203
+ "0 Media sources indicate that workers at the Gra... Moderate \n",
204
+ "1 News sources are stating that recent typhoons ... Minor \n",
205
+ "\n",
206
+ " Category Region Datetime Year ... IT EP NEW \\\n",
207
+ "0 Mine Workers Strike Indonesia 28/5/17 17:08 2017.0 ... 0.0 0.0 0.0 \n",
208
+ "1 Travel Warning Indonesia 4/9/17 14:30 2017.0 ... 0.0 0.0 0.0 \n",
209
+ "\n",
210
+ " CSD RPE MN NM if_labeled Month Week \n",
211
+ "0 0.0 0.0 0.0 1.0 False 5.0 21.0 \n",
212
+ "1 0.0 0.0 1.0 0.0 False 4.0 14.0 \n",
213
+ "\n",
214
+ "[2 rows x 46 columns]"
215
+ ]
216
+ },
217
+ "execution_count": 4,
218
+ "metadata": {},
219
+ "output_type": "execute_result"
220
+ }
221
+ ],
222
+ "source": [
223
+ "df.head(2)"
224
+ ]
225
+ },
226
+ {
227
+ "cell_type": "markdown",
228
+ "id": "643a7e40",
229
+ "metadata": {},
230
+ "source": [
231
+ "### Clean empty data"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": 5,
237
+ "id": "d6ee1fd7",
238
+ "metadata": {},
239
+ "outputs": [],
240
+ "source": [
241
+ "import nltk\n",
242
+ "from nltk.corpus import stopwords\n",
243
+ "from nltk.tokenize import word_tokenize\n",
244
+ "from nltk.stem import WordNetLemmatizer\n",
245
+ "import string\n",
246
+ "\n",
247
+ "# nltk.download('punkt')\n",
248
+ "# nltk.download('stopwords')\n",
249
+ "# nltk.download('wordnet')\n",
250
+ "\n",
251
+ "\n",
252
+ "def clean_text(text):\n",
253
+ " # Lowercase\n",
254
+ " text = text.lower()\n",
255
+ " # Tokenization\n",
256
+ " tokens = word_tokenize(text)\n",
257
+ " # Removing punctuation\n",
258
+ " tokens = [word for word in tokens if word not in string.punctuation]\n",
259
+ " # Removing stop words\n",
260
+ " stop_words = set(stopwords.words(\"english\"))\n",
261
+ " tokens = [word for word in tokens if word not in stop_words]\n",
262
+ " # Lemmatization\n",
263
+ " lemmatizer = WordNetLemmatizer()\n",
264
+ " tokens = [lemmatizer.lemmatize(word) for word in tokens]\n",
265
+ "\n",
266
+ " return \" \".join(tokens)"
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "code",
271
+ "execution_count": 6,
272
+ "id": "9e35b49a",
273
+ "metadata": {},
274
+ "outputs": [
275
+ {
276
+ "name": "stderr",
277
+ "output_type": "stream",
278
+ "text": [
279
+ "[nltk_data] Downloading package omw-1.4 to\n",
280
+ "[nltk_data] /Users/inflaton/nltk_data...\n",
281
+ "[nltk_data] Package omw-1.4 is already up-to-date!\n"
282
+ ]
283
+ },
284
+ {
285
+ "data": {
286
+ "text/plain": [
287
+ "True"
288
+ ]
289
+ },
290
+ "execution_count": 6,
291
+ "metadata": {},
292
+ "output_type": "execute_result"
293
+ }
294
+ ],
295
+ "source": [
296
+ "import nltk\n",
297
+ "\n",
298
+ "nltk.download(\"omw-1.4\")"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "markdown",
303
+ "id": "ca331c4b",
304
+ "metadata": {},
305
+ "source": [
306
+ "### The Details column has an issue\n",
307
+ "\n",
308
+ "Some of the values are of type float, and none of the text-processing functions can be applied to them, so we have to handle them separately."
309
+ ]
310
+ },
311
+ {
312
+ "cell_type": "code",
313
+ "execution_count": 7,
314
+ "id": "2438c58f",
315
+ "metadata": {},
316
+ "outputs": [
317
+ {
318
+ "name": "stdout",
319
+ "output_type": "stream",
320
+ "text": [
321
+ "<class 'pandas.core.frame.DataFrame'>\n",
322
+ "RangeIndex: 5782 entries, 0 to 5781\n",
323
+ "Data columns (total 2 columns):\n",
324
+ " # Column Non-Null Count Dtype \n",
325
+ "--- ------ -------------- ----- \n",
326
+ " 0 Details 5781 non-null object\n",
327
+ " 1 maritime_label 5781 non-null object\n",
328
+ "dtypes: object(2)\n",
329
+ "memory usage: 90.5+ KB\n",
330
+ "<class 'pandas.core.frame.DataFrame'>\n",
331
+ "RangeIndex: 5782 entries, 0 to 5781\n",
332
+ "Data columns (total 3 columns):\n",
333
+ " # Column Non-Null Count Dtype \n",
334
+ "--- ------ -------------- ----- \n",
335
+ " 0 Details 5781 non-null object\n",
336
+ " 1 maritime_label 5781 non-null object\n",
337
+ " 2 Details_cleaned 5781 non-null object\n",
338
+ "dtypes: object(3)\n",
339
+ "memory usage: 135.6+ KB\n"
340
+ ]
341
+ },
342
+ {
343
+ "name": "stderr",
344
+ "output_type": "stream",
345
+ "text": [
346
+ "/var/folders/7x/56svhln929zdh2xhr3mwqg4r0000gn/T/ipykernel_16237/2443564520.py:3: SettingWithCopyWarning: \n",
347
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
348
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
349
+ "\n",
350
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
351
+ " text_df['Details_cleaned'] = text_df['Details'].apply(lambda x: clean_text(x) if not isinstance(x, float) else None)\n"
352
+ ]
353
+ }
354
+ ],
355
+ "source": [
356
+ "text_df = df[[\"Details\", \"maritime_label\"]]\n",
357
+ "text_df.info()\n",
358
+ "text_df[\"Details_cleaned\"] = text_df[\"Details\"].apply(\n",
359
+ " lambda x: clean_text(x) if not isinstance(x, float) else None\n",
360
+ ")\n",
361
+ "# no_nan_df[no_nan_df[\"Details\"].apply(lambda x: print(type(x)))]\n",
362
+ "# cleaned_df = text_df[text_df[\"Details\"].apply(lambda x: clean_text(x))]\n",
363
+ "# cleaned_df = df['Details'][1:2]\n",
364
+ "# type(no_nan_df[\"Details\"][0])\n",
365
+ "# print(clean_text(no_nan_df[\"Details\"][0]))\n",
366
+ "text_df.info()"
367
+ ]
368
+ },
369
+ {
370
+ "cell_type": "code",
371
+ "execution_count": 8,
372
+ "id": "4d3b0011",
373
+ "metadata": {},
374
+ "outputs": [
375
+ {
376
+ "data": {
377
+ "text/html": [
378
+ "<div>\n",
379
+ "<style scoped>\n",
380
+ " .dataframe tbody tr th:only-of-type {\n",
381
+ " vertical-align: middle;\n",
382
+ " }\n",
383
+ "\n",
384
+ " .dataframe tbody tr th {\n",
385
+ " vertical-align: top;\n",
386
+ " }\n",
387
+ "\n",
388
+ " .dataframe thead th {\n",
389
+ " text-align: right;\n",
390
+ " }\n",
391
+ "</style>\n",
392
+ "<table border=\"1\" class=\"dataframe\">\n",
393
+ " <thead>\n",
394
+ " <tr style=\"text-align: right;\">\n",
395
+ " <th></th>\n",
396
+ " <th>Details</th>\n",
397
+ " <th>maritime_label</th>\n",
398
+ " <th>Details_cleaned</th>\n",
399
+ " </tr>\n",
400
+ " </thead>\n",
401
+ " <tbody>\n",
402
+ " <tr>\n",
403
+ " <th>0</th>\n",
404
+ " <td>Media sources indicate that workers at the Gra...</td>\n",
405
+ " <td>FALSE</td>\n",
406
+ " <td>medium source indicate worker grasberg mine ex...</td>\n",
407
+ " </tr>\n",
408
+ " <tr>\n",
409
+ " <th>1</th>\n",
410
+ " <td>News sources are stating that recent typhoons ...</td>\n",
411
+ " <td>FALSE</td>\n",
412
+ " <td>news source stating recent typhoon impact hong...</td>\n",
413
+ " </tr>\n",
414
+ " <tr>\n",
415
+ " <th>2</th>\n",
416
+ " <td>The persisting port congestion at Shanghai’s Y...</td>\n",
417
+ " <td>TRUE</td>\n",
418
+ " <td>persisting port congestion shanghai ’ yangshan...</td>\n",
419
+ " </tr>\n",
420
+ " <tr>\n",
421
+ " <th>3</th>\n",
422
+ " <td>Updated local media sources from Jakarta indic...</td>\n",
423
+ " <td>TRUE</td>\n",
424
+ " <td>updated local medium source jakarta indicate e...</td>\n",
425
+ " </tr>\n",
426
+ " <tr>\n",
427
+ " <th>4</th>\n",
428
+ " <td>According to local police in Jakarta, two expl...</td>\n",
429
+ " <td>TRUE</td>\n",
430
+ " <td>according local police jakarta two explosion c...</td>\n",
431
+ " </tr>\n",
432
+ " </tbody>\n",
433
+ "</table>\n",
434
+ "</div>"
435
+ ],
436
+ "text/plain": [
437
+ " Details maritime_label \\\n",
438
+ "0 Media sources indicate that workers at the Gra... FALSE \n",
439
+ "1 News sources are stating that recent typhoons ... FALSE \n",
440
+ "2 The persisting port congestion at Shanghai’s Y... TRUE \n",
441
+ "3 Updated local media sources from Jakarta indic... TRUE \n",
442
+ "4 According to local police in Jakarta, two expl... TRUE \n",
443
+ "\n",
444
+ " Details_cleaned \n",
445
+ "0 medium source indicate worker grasberg mine ex... \n",
446
+ "1 news source stating recent typhoon impact hong... \n",
447
+ "2 persisting port congestion shanghai ’ yangshan... \n",
448
+ "3 updated local medium source jakarta indicate e... \n",
449
+ "4 according local police jakarta two explosion c... "
450
+ ]
451
+ },
452
+ "execution_count": 8,
453
+ "metadata": {},
454
+ "output_type": "execute_result"
455
+ }
456
+ ],
457
+ "source": [
458
+ "processed_data = text_df.dropna()\n",
459
+ "processed_data.head()"
460
+ ]
461
+ },
462
+ {
463
+ "cell_type": "markdown",
464
+ "id": "3c4be609",
465
+ "metadata": {},
466
+ "source": [
467
+ "## Naive Bayes Model"
468
+ ]
469
+ },
470
+ {
471
+ "cell_type": "code",
472
+ "execution_count": 9,
473
+ "id": "5c660011",
474
+ "metadata": {},
475
+ "outputs": [],
476
+ "source": [
477
+ "from sklearn.model_selection import train_test_split\n",
478
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
479
+ "\n",
480
+ "# from sklearn.feature_extraction.text import CountVectorizer\n",
481
+ "from sklearn.naive_bayes import MultinomialNB\n",
482
+ "from sklearn.metrics import accuracy_score, classification_report"
483
+ ]
484
+ },
485
+ {
486
+ "cell_type": "code",
487
+ "execution_count": 10,
488
+ "id": "8f009a65",
489
+ "metadata": {},
490
+ "outputs": [],
491
+ "source": [
492
+ "X = processed_data[\"Details_cleaned\"]\n",
493
+ "y = processed_data[\"maritime_label\"]"
494
+ ]
495
+ },
496
+ {
497
+ "cell_type": "code",
498
+ "execution_count": 11,
499
+ "id": "0185a967",
500
+ "metadata": {},
501
+ "outputs": [],
502
+ "source": [
503
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
504
+ " X, y, test_size=0.2, random_state=42\n",
505
+ ")"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": 12,
511
+ "id": "d3c2de6b",
512
+ "metadata": {},
513
+ "outputs": [],
514
+ "source": [
515
+ "# vectorizer = CountVectorizer()\n",
516
+ "# X_train_vec = vectorizer.fit_transform(X_train)\n",
517
+ "# X_test_vec = vectorizer.transform(X_test)\n",
518
+ "\n",
519
+ "tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
520
+ "X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
521
+ "X_test_tfidf = tfidf_vectorizer.transform(X_test)"
522
+ ]
523
+ },
524
+ {
525
+ "cell_type": "code",
526
+ "execution_count": 13,
527
+ "id": "ead2fc7a",
528
+ "metadata": {},
529
+ "outputs": [
530
+ {
531
+ "data": {
532
+ "text/html": [
533
+ "<style>#sk-container-id-1 {\n",
534
+ " /* Definition of color scheme common for light and dark mode */\n",
535
+ " --sklearn-color-text: black;\n",
536
+ " --sklearn-color-line: gray;\n",
537
+ " /* Definition of color scheme for unfitted estimators */\n",
538
+ " --sklearn-color-unfitted-level-0: #fff5e6;\n",
539
+ " --sklearn-color-unfitted-level-1: #f6e4d2;\n",
540
+ " --sklearn-color-unfitted-level-2: #ffe0b3;\n",
541
+ " --sklearn-color-unfitted-level-3: chocolate;\n",
542
+ " /* Definition of color scheme for fitted estimators */\n",
543
+ " --sklearn-color-fitted-level-0: #f0f8ff;\n",
544
+ " --sklearn-color-fitted-level-1: #d4ebff;\n",
545
+ " --sklearn-color-fitted-level-2: #b3dbfd;\n",
546
+ " --sklearn-color-fitted-level-3: cornflowerblue;\n",
547
+ "\n",
548
+ " /* Specific color for light theme */\n",
549
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
550
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
551
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
552
+ " --sklearn-color-icon: #696969;\n",
553
+ "\n",
554
+ " @media (prefers-color-scheme: dark) {\n",
555
+ " /* Redefinition of color scheme for dark theme */\n",
556
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
557
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
558
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
559
+ " --sklearn-color-icon: #878787;\n",
560
+ " }\n",
561
+ "}\n",
562
+ "\n",
563
+ "#sk-container-id-1 {\n",
564
+ " color: var(--sklearn-color-text);\n",
565
+ "}\n",
566
+ "\n",
567
+ "#sk-container-id-1 pre {\n",
568
+ " padding: 0;\n",
569
+ "}\n",
570
+ "\n",
571
+ "#sk-container-id-1 input.sk-hidden--visually {\n",
572
+ " border: 0;\n",
573
+ " clip: rect(1px 1px 1px 1px);\n",
574
+ " clip: rect(1px, 1px, 1px, 1px);\n",
575
+ " height: 1px;\n",
576
+ " margin: -1px;\n",
577
+ " overflow: hidden;\n",
578
+ " padding: 0;\n",
579
+ " position: absolute;\n",
580
+ " width: 1px;\n",
581
+ "}\n",
582
+ "\n",
583
+ "#sk-container-id-1 div.sk-dashed-wrapped {\n",
584
+ " border: 1px dashed var(--sklearn-color-line);\n",
585
+ " margin: 0 0.4em 0.5em 0.4em;\n",
586
+ " box-sizing: border-box;\n",
587
+ " padding-bottom: 0.4em;\n",
588
+ " background-color: var(--sklearn-color-background);\n",
589
+ "}\n",
590
+ "\n",
591
+ "#sk-container-id-1 div.sk-container {\n",
592
+ " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
593
+ " but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
594
+ " so we also need the `!important` here to be able to override the\n",
595
+ " default hidden behavior on the sphinx rendered scikit-learn.org.\n",
596
+ " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
597
+ " display: inline-block !important;\n",
598
+ " position: relative;\n",
599
+ "}\n",
600
+ "\n",
601
+ "#sk-container-id-1 div.sk-text-repr-fallback {\n",
602
+ " display: none;\n",
603
+ "}\n",
604
+ "\n",
605
+ "div.sk-parallel-item,\n",
606
+ "div.sk-serial,\n",
607
+ "div.sk-item {\n",
608
+ " /* draw centered vertical line to link estimators */\n",
609
+ " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
610
+ " background-size: 2px 100%;\n",
611
+ " background-repeat: no-repeat;\n",
612
+ " background-position: center center;\n",
613
+ "}\n",
614
+ "\n",
615
+ "/* Parallel-specific style estimator block */\n",
616
+ "\n",
617
+ "#sk-container-id-1 div.sk-parallel-item::after {\n",
618
+ " content: \"\";\n",
619
+ " width: 100%;\n",
620
+ " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
621
+ " flex-grow: 1;\n",
622
+ "}\n",
623
+ "\n",
624
+ "#sk-container-id-1 div.sk-parallel {\n",
625
+ " display: flex;\n",
626
+ " align-items: stretch;\n",
627
+ " justify-content: center;\n",
628
+ " background-color: var(--sklearn-color-background);\n",
629
+ " position: relative;\n",
630
+ "}\n",
631
+ "\n",
632
+ "#sk-container-id-1 div.sk-parallel-item {\n",
633
+ " display: flex;\n",
634
+ " flex-direction: column;\n",
635
+ "}\n",
636
+ "\n",
637
+ "#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
638
+ " align-self: flex-end;\n",
639
+ " width: 50%;\n",
640
+ "}\n",
641
+ "\n",
642
+ "#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
643
+ " align-self: flex-start;\n",
644
+ " width: 50%;\n",
645
+ "}\n",
646
+ "\n",
647
+ "#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
648
+ " width: 0;\n",
649
+ "}\n",
650
+ "\n",
651
+ "/* Serial-specific style estimator block */\n",
652
+ "\n",
653
+ "#sk-container-id-1 div.sk-serial {\n",
654
+ " display: flex;\n",
655
+ " flex-direction: column;\n",
656
+ " align-items: center;\n",
657
+ " background-color: var(--sklearn-color-background);\n",
658
+ " padding-right: 1em;\n",
659
+ " padding-left: 1em;\n",
660
+ "}\n",
661
+ "\n",
662
+ "\n",
663
+ "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
664
+ "clickable and can be expanded/collapsed.\n",
665
+ "- Pipeline and ColumnTransformer use this feature and define the default style\n",
666
+ "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
667
+ "*/\n",
668
+ "\n",
669
+ "/* Pipeline and ColumnTransformer style (default) */\n",
670
+ "\n",
671
+ "#sk-container-id-1 div.sk-toggleable {\n",
672
+ " /* Default theme specific background. It is overwritten whether we have a\n",
673
+ " specific estimator or a Pipeline/ColumnTransformer */\n",
674
+ " background-color: var(--sklearn-color-background);\n",
675
+ "}\n",
676
+ "\n",
677
+ "/* Toggleable label */\n",
678
+ "#sk-container-id-1 label.sk-toggleable__label {\n",
679
+ " cursor: pointer;\n",
680
+ " display: block;\n",
681
+ " width: 100%;\n",
682
+ " margin-bottom: 0;\n",
683
+ " padding: 0.5em;\n",
684
+ " box-sizing: border-box;\n",
685
+ " text-align: center;\n",
686
+ "}\n",
687
+ "\n",
688
+ "#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
689
+ " /* Arrow on the left of the label */\n",
690
+ " content: \"▸\";\n",
691
+ " float: left;\n",
692
+ " margin-right: 0.25em;\n",
693
+ " color: var(--sklearn-color-icon);\n",
694
+ "}\n",
695
+ "\n",
696
+ "#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
697
+ " color: var(--sklearn-color-text);\n",
698
+ "}\n",
699
+ "\n",
700
+ "/* Toggleable content - dropdown */\n",
701
+ "\n",
702
+ "#sk-container-id-1 div.sk-toggleable__content {\n",
703
+ " max-height: 0;\n",
704
+ " max-width: 0;\n",
705
+ " overflow: hidden;\n",
706
+ " text-align: left;\n",
707
+ " /* unfitted */\n",
708
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
709
+ "}\n",
710
+ "\n",
711
+ "#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
712
+ " /* fitted */\n",
713
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
714
+ "}\n",
715
+ "\n",
716
+ "#sk-container-id-1 div.sk-toggleable__content pre {\n",
717
+ " margin: 0.2em;\n",
718
+ " border-radius: 0.25em;\n",
719
+ " color: var(--sklearn-color-text);\n",
720
+ " /* unfitted */\n",
721
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
722
+ "}\n",
723
+ "\n",
724
+ "#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
725
+ " /* unfitted */\n",
726
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
727
+ "}\n",
728
+ "\n",
729
+ "#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
730
+ " /* Expand drop-down */\n",
731
+ " max-height: 200px;\n",
732
+ " max-width: 100%;\n",
733
+ " overflow: auto;\n",
734
+ "}\n",
735
+ "\n",
736
+ "#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
737
+ " content: \"▾\";\n",
738
+ "}\n",
739
+ "\n",
740
+ "/* Pipeline/ColumnTransformer-specific style */\n",
741
+ "\n",
742
+ "#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
743
+ " color: var(--sklearn-color-text);\n",
744
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
745
+ "}\n",
746
+ "\n",
747
+ "#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
748
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
749
+ "}\n",
750
+ "\n",
751
+ "/* Estimator-specific style */\n",
752
+ "\n",
753
+ "/* Colorize estimator box */\n",
754
+ "#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
755
+ " /* unfitted */\n",
756
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
757
+ "}\n",
758
+ "\n",
759
+ "#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
760
+ " /* fitted */\n",
761
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
762
+ "}\n",
763
+ "\n",
764
+ "#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
765
+ "#sk-container-id-1 div.sk-label label {\n",
766
+ " /* The background is the default theme color */\n",
767
+ " color: var(--sklearn-color-text-on-default-background);\n",
768
+ "}\n",
769
+ "\n",
770
+ "/* On hover, darken the color of the background */\n",
771
+ "#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
772
+ " color: var(--sklearn-color-text);\n",
773
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
774
+ "}\n",
775
+ "\n",
776
+ "/* Label box, darken color on hover, fitted */\n",
777
+ "#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
778
+ " color: var(--sklearn-color-text);\n",
779
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
780
+ "}\n",
781
+ "\n",
782
+ "/* Estimator label */\n",
783
+ "\n",
784
+ "#sk-container-id-1 div.sk-label label {\n",
785
+ " font-family: monospace;\n",
786
+ " font-weight: bold;\n",
787
+ " display: inline-block;\n",
788
+ " line-height: 1.2em;\n",
789
+ "}\n",
790
+ "\n",
791
+ "#sk-container-id-1 div.sk-label-container {\n",
792
+ " text-align: center;\n",
793
+ "}\n",
794
+ "\n",
795
+ "/* Estimator-specific */\n",
796
+ "#sk-container-id-1 div.sk-estimator {\n",
797
+ " font-family: monospace;\n",
798
+ " border: 1px dotted var(--sklearn-color-border-box);\n",
799
+ " border-radius: 0.25em;\n",
800
+ " box-sizing: border-box;\n",
801
+ " margin-bottom: 0.5em;\n",
802
+ " /* unfitted */\n",
803
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
804
+ "}\n",
805
+ "\n",
806
+ "#sk-container-id-1 div.sk-estimator.fitted {\n",
807
+ " /* fitted */\n",
808
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
809
+ "}\n",
810
+ "\n",
811
+ "/* on hover */\n",
812
+ "#sk-container-id-1 div.sk-estimator:hover {\n",
813
+ " /* unfitted */\n",
814
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
815
+ "}\n",
816
+ "\n",
817
+ "#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
818
+ " /* fitted */\n",
819
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
820
+ "}\n",
821
+ "\n",
822
+ "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
823
+ "\n",
824
+ "/* Common style for \"i\" and \"?\" */\n",
825
+ "\n",
826
+ ".sk-estimator-doc-link,\n",
827
+ "a:link.sk-estimator-doc-link,\n",
828
+ "a:visited.sk-estimator-doc-link {\n",
829
+ " float: right;\n",
830
+ " font-size: smaller;\n",
831
+ " line-height: 1em;\n",
832
+ " font-family: monospace;\n",
833
+ " background-color: var(--sklearn-color-background);\n",
834
+ " border-radius: 1em;\n",
835
+ " height: 1em;\n",
836
+ " width: 1em;\n",
837
+ " text-decoration: none !important;\n",
838
+ " margin-left: 1ex;\n",
839
+ " /* unfitted */\n",
840
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
841
+ " color: var(--sklearn-color-unfitted-level-1);\n",
842
+ "}\n",
843
+ "\n",
844
+ ".sk-estimator-doc-link.fitted,\n",
845
+ "a:link.sk-estimator-doc-link.fitted,\n",
846
+ "a:visited.sk-estimator-doc-link.fitted {\n",
847
+ " /* fitted */\n",
848
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
849
+ " color: var(--sklearn-color-fitted-level-1);\n",
850
+ "}\n",
851
+ "\n",
852
+ "/* On hover */\n",
853
+ "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
854
+ ".sk-estimator-doc-link:hover,\n",
855
+ "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
856
+ ".sk-estimator-doc-link:hover {\n",
857
+ " /* unfitted */\n",
858
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
859
+ " color: var(--sklearn-color-background);\n",
860
+ " text-decoration: none;\n",
861
+ "}\n",
862
+ "\n",
863
+ "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
864
+ ".sk-estimator-doc-link.fitted:hover,\n",
865
+ "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
866
+ ".sk-estimator-doc-link.fitted:hover {\n",
867
+ " /* fitted */\n",
868
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
869
+ " color: var(--sklearn-color-background);\n",
870
+ " text-decoration: none;\n",
871
+ "}\n",
872
+ "\n",
873
+ "/* Span, style for the box shown on hovering the info icon */\n",
874
+ ".sk-estimator-doc-link span {\n",
875
+ " display: none;\n",
876
+ " z-index: 9999;\n",
877
+ " position: relative;\n",
878
+ " font-weight: normal;\n",
879
+ " right: .2ex;\n",
880
+ " padding: .5ex;\n",
881
+ " margin: .5ex;\n",
882
+ " width: min-content;\n",
883
+ " min-width: 20ex;\n",
884
+ " max-width: 50ex;\n",
885
+ " color: var(--sklearn-color-text);\n",
886
+ " box-shadow: 2pt 2pt 4pt #999;\n",
887
+ " /* unfitted */\n",
888
+ " background: var(--sklearn-color-unfitted-level-0);\n",
889
+ " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
890
+ "}\n",
891
+ "\n",
892
+ ".sk-estimator-doc-link.fitted span {\n",
893
+ " /* fitted */\n",
894
+ " background: var(--sklearn-color-fitted-level-0);\n",
895
+ " border: var(--sklearn-color-fitted-level-3);\n",
896
+ "}\n",
897
+ "\n",
898
+ ".sk-estimator-doc-link:hover span {\n",
899
+ " display: block;\n",
900
+ "}\n",
901
+ "\n",
902
+ "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
903
+ "\n",
904
+ "#sk-container-id-1 a.estimator_doc_link {\n",
905
+ " float: right;\n",
906
+ " font-size: 1rem;\n",
907
+ " line-height: 1em;\n",
908
+ " font-family: monospace;\n",
909
+ " background-color: var(--sklearn-color-background);\n",
910
+ " border-radius: 1rem;\n",
911
+ " height: 1rem;\n",
912
+ " width: 1rem;\n",
913
+ " text-decoration: none;\n",
914
+ " /* unfitted */\n",
915
+ " color: var(--sklearn-color-unfitted-level-1);\n",
916
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
917
+ "}\n",
918
+ "\n",
919
+ "#sk-container-id-1 a.estimator_doc_link.fitted {\n",
920
+ " /* fitted */\n",
921
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
922
+ " color: var(--sklearn-color-fitted-level-1);\n",
923
+ "}\n",
924
+ "\n",
925
+ "/* On hover */\n",
926
+ "#sk-container-id-1 a.estimator_doc_link:hover {\n",
927
+ " /* unfitted */\n",
928
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
929
+ " color: var(--sklearn-color-background);\n",
930
+ " text-decoration: none;\n",
931
+ "}\n",
932
+ "\n",
933
+ "#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
934
+ " /* fitted */\n",
935
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
936
+ "}\n",
937
+ "</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>MultinomialNB()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;MultinomialNB<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.naive_bayes.MultinomialNB.html\">?<span>Documentation for MultinomialNB</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>MultinomialNB()</pre></div> </div></div></div></div>"
938
+ ],
939
+ "text/plain": [
940
+ "MultinomialNB()"
941
+ ]
942
+ },
943
+ "execution_count": 13,
944
+ "metadata": {},
945
+ "output_type": "execute_result"
946
+ }
947
+ ],
948
+ "source": [
949
+ "naive_bayes = MultinomialNB()\n",
950
+ "naive_bayes.fit(X_train_tfidf, y_train)"
951
+ ]
952
+ },
953
+ {
954
+ "cell_type": "code",
955
+ "execution_count": 14,
956
+ "id": "74c5df68",
957
+ "metadata": {},
958
+ "outputs": [],
959
+ "source": [
960
+ "predictions = naive_bayes.predict(X_test_tfidf)"
961
+ ]
962
+ },
963
+ {
964
+ "cell_type": "code",
965
+ "execution_count": 15,
966
+ "id": "109e9456",
967
+ "metadata": {},
968
+ "outputs": [
969
+ {
970
+ "name": "stdout",
971
+ "output_type": "stream",
972
+ "text": [
973
+ "Accuracy of Naive Bayes model: 0.8582541054451167\n",
974
+ " precision recall f1-score support\n",
975
+ "\n",
976
+ " FALSE 0.88 0.94 0.91 847\n",
977
+ " TRUE 0.79 0.65 0.71 310\n",
978
+ "\n",
979
+ " accuracy 0.86 1157\n",
980
+ " macro avg 0.83 0.79 0.81 1157\n",
981
+ "weighted avg 0.85 0.86 0.85 1157\n",
982
+ "\n"
983
+ ]
984
+ }
985
+ ],
986
+ "source": [
987
+ "accuracy = accuracy_score(y_test, predictions)\n",
988
+ "print(\"Accuracy of Naive Bayes model:\", accuracy)\n",
989
+ "print(classification_report(y_test, predictions))"
990
+ ]
991
+ },
992
+ {
993
+ "cell_type": "markdown",
994
+ "id": "9518614a",
995
+ "metadata": {},
996
+ "source": [
997
+ "## Logistic Regression model"
998
+ ]
999
+ },
1000
+ {
1001
+ "cell_type": "code",
1002
+ "execution_count": 16,
1003
+ "id": "912ad7a6",
1004
+ "metadata": {},
1005
+ "outputs": [],
1006
+ "source": [
1007
+ "from sklearn.model_selection import train_test_split\n",
1008
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
1009
+ "from sklearn.linear_model import LogisticRegression\n",
1010
+ "from sklearn.metrics import accuracy_score"
1011
+ ]
1012
+ },
1013
+ {
1014
+ "cell_type": "code",
1015
+ "execution_count": 17,
1016
+ "id": "03eac734",
1017
+ "metadata": {},
1018
+ "outputs": [],
1019
+ "source": [
1020
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
1021
+ " X, y, test_size=0.2, random_state=42\n",
1022
+ ")"
1023
+ ]
1024
+ },
1025
+ {
1026
+ "cell_type": "code",
1027
+ "execution_count": 18,
1028
+ "id": "e84ff87c",
1029
+ "metadata": {},
1030
+ "outputs": [],
1031
+ "source": [
1032
+ "tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
1033
+ "X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
1034
+ "X_test_tfidf = tfidf_vectorizer.transform(X_test)"
1035
+ ]
1036
+ },
1037
+ {
1038
+ "cell_type": "code",
1039
+ "execution_count": 19,
1040
+ "id": "cedb263c",
1041
+ "metadata": {},
1042
+ "outputs": [
1043
+ {
1044
+ "data": {
1045
+ "text/html": [
1046
+ "<style>#sk-container-id-2 {\n",
1047
+ " /* Definition of color scheme common for light and dark mode */\n",
1048
+ " --sklearn-color-text: black;\n",
1049
+ " --sklearn-color-line: gray;\n",
1050
+ " /* Definition of color scheme for unfitted estimators */\n",
1051
+ " --sklearn-color-unfitted-level-0: #fff5e6;\n",
1052
+ " --sklearn-color-unfitted-level-1: #f6e4d2;\n",
1053
+ " --sklearn-color-unfitted-level-2: #ffe0b3;\n",
1054
+ " --sklearn-color-unfitted-level-3: chocolate;\n",
1055
+ " /* Definition of color scheme for fitted estimators */\n",
1056
+ " --sklearn-color-fitted-level-0: #f0f8ff;\n",
1057
+ " --sklearn-color-fitted-level-1: #d4ebff;\n",
1058
+ " --sklearn-color-fitted-level-2: #b3dbfd;\n",
1059
+ " --sklearn-color-fitted-level-3: cornflowerblue;\n",
1060
+ "\n",
1061
+ " /* Specific color for light theme */\n",
1062
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
1063
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
1064
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
1065
+ " --sklearn-color-icon: #696969;\n",
1066
+ "\n",
1067
+ " @media (prefers-color-scheme: dark) {\n",
1068
+ " /* Redefinition of color scheme for dark theme */\n",
1069
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
1070
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
1071
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
1072
+ " --sklearn-color-icon: #878787;\n",
1073
+ " }\n",
1074
+ "}\n",
1075
+ "\n",
1076
+ "#sk-container-id-2 {\n",
1077
+ " color: var(--sklearn-color-text);\n",
1078
+ "}\n",
1079
+ "\n",
1080
+ "#sk-container-id-2 pre {\n",
1081
+ " padding: 0;\n",
1082
+ "}\n",
1083
+ "\n",
1084
+ "#sk-container-id-2 input.sk-hidden--visually {\n",
1085
+ " border: 0;\n",
1086
+ " clip: rect(1px 1px 1px 1px);\n",
1087
+ " clip: rect(1px, 1px, 1px, 1px);\n",
1088
+ " height: 1px;\n",
1089
+ " margin: -1px;\n",
1090
+ " overflow: hidden;\n",
1091
+ " padding: 0;\n",
1092
+ " position: absolute;\n",
1093
+ " width: 1px;\n",
1094
+ "}\n",
1095
+ "\n",
1096
+ "#sk-container-id-2 div.sk-dashed-wrapped {\n",
1097
+ " border: 1px dashed var(--sklearn-color-line);\n",
1098
+ " margin: 0 0.4em 0.5em 0.4em;\n",
1099
+ " box-sizing: border-box;\n",
1100
+ " padding-bottom: 0.4em;\n",
1101
+ " background-color: var(--sklearn-color-background);\n",
1102
+ "}\n",
1103
+ "\n",
1104
+ "#sk-container-id-2 div.sk-container {\n",
1105
+ " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
1106
+ " but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
1107
+ " so we also need the `!important` here to be able to override the\n",
1108
+ " default hidden behavior on the sphinx rendered scikit-learn.org.\n",
1109
+ " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
1110
+ " display: inline-block !important;\n",
1111
+ " position: relative;\n",
1112
+ "}\n",
1113
+ "\n",
1114
+ "#sk-container-id-2 div.sk-text-repr-fallback {\n",
1115
+ " display: none;\n",
1116
+ "}\n",
1117
+ "\n",
1118
+ "div.sk-parallel-item,\n",
1119
+ "div.sk-serial,\n",
1120
+ "div.sk-item {\n",
1121
+ " /* draw centered vertical line to link estimators */\n",
1122
+ " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
1123
+ " background-size: 2px 100%;\n",
1124
+ " background-repeat: no-repeat;\n",
1125
+ " background-position: center center;\n",
1126
+ "}\n",
1127
+ "\n",
1128
+ "/* Parallel-specific style estimator block */\n",
1129
+ "\n",
1130
+ "#sk-container-id-2 div.sk-parallel-item::after {\n",
1131
+ " content: \"\";\n",
1132
+ " width: 100%;\n",
1133
+ " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
1134
+ " flex-grow: 1;\n",
1135
+ "}\n",
1136
+ "\n",
1137
+ "#sk-container-id-2 div.sk-parallel {\n",
1138
+ " display: flex;\n",
1139
+ " align-items: stretch;\n",
1140
+ " justify-content: center;\n",
1141
+ " background-color: var(--sklearn-color-background);\n",
1142
+ " position: relative;\n",
1143
+ "}\n",
1144
+ "\n",
1145
+ "#sk-container-id-2 div.sk-parallel-item {\n",
1146
+ " display: flex;\n",
1147
+ " flex-direction: column;\n",
1148
+ "}\n",
1149
+ "\n",
1150
+ "#sk-container-id-2 div.sk-parallel-item:first-child::after {\n",
1151
+ " align-self: flex-end;\n",
1152
+ " width: 50%;\n",
1153
+ "}\n",
1154
+ "\n",
1155
+ "#sk-container-id-2 div.sk-parallel-item:last-child::after {\n",
1156
+ " align-self: flex-start;\n",
1157
+ " width: 50%;\n",
1158
+ "}\n",
1159
+ "\n",
1160
+ "#sk-container-id-2 div.sk-parallel-item:only-child::after {\n",
1161
+ " width: 0;\n",
1162
+ "}\n",
1163
+ "\n",
1164
+ "/* Serial-specific style estimator block */\n",
1165
+ "\n",
1166
+ "#sk-container-id-2 div.sk-serial {\n",
1167
+ " display: flex;\n",
1168
+ " flex-direction: column;\n",
1169
+ " align-items: center;\n",
1170
+ " background-color: var(--sklearn-color-background);\n",
1171
+ " padding-right: 1em;\n",
1172
+ " padding-left: 1em;\n",
1173
+ "}\n",
1174
+ "\n",
1175
+ "\n",
1176
+ "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
1177
+ "clickable and can be expanded/collapsed.\n",
1178
+ "- Pipeline and ColumnTransformer use this feature and define the default style\n",
1179
+ "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
1180
+ "*/\n",
1181
+ "\n",
1182
+ "/* Pipeline and ColumnTransformer style (default) */\n",
1183
+ "\n",
1184
+ "#sk-container-id-2 div.sk-toggleable {\n",
1185
+ " /* Default theme specific background. It is overwritten whether we have a\n",
1186
+ " specific estimator or a Pipeline/ColumnTransformer */\n",
1187
+ " background-color: var(--sklearn-color-background);\n",
1188
+ "}\n",
1189
+ "\n",
1190
+ "/* Toggleable label */\n",
1191
+ "#sk-container-id-2 label.sk-toggleable__label {\n",
1192
+ " cursor: pointer;\n",
1193
+ " display: block;\n",
1194
+ " width: 100%;\n",
1195
+ " margin-bottom: 0;\n",
1196
+ " padding: 0.5em;\n",
1197
+ " box-sizing: border-box;\n",
1198
+ " text-align: center;\n",
1199
+ "}\n",
1200
+ "\n",
1201
+ "#sk-container-id-2 label.sk-toggleable__label-arrow:before {\n",
1202
+ " /* Arrow on the left of the label */\n",
1203
+ " content: \"▸\";\n",
1204
+ " float: left;\n",
1205
+ " margin-right: 0.25em;\n",
1206
+ " color: var(--sklearn-color-icon);\n",
1207
+ "}\n",
1208
+ "\n",
1209
+ "#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {\n",
1210
+ " color: var(--sklearn-color-text);\n",
1211
+ "}\n",
1212
+ "\n",
1213
+ "/* Toggleable content - dropdown */\n",
1214
+ "\n",
1215
+ "#sk-container-id-2 div.sk-toggleable__content {\n",
1216
+ " max-height: 0;\n",
1217
+ " max-width: 0;\n",
1218
+ " overflow: hidden;\n",
1219
+ " text-align: left;\n",
1220
+ " /* unfitted */\n",
1221
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
1222
+ "}\n",
1223
+ "\n",
1224
+ "#sk-container-id-2 div.sk-toggleable__content.fitted {\n",
1225
+ " /* fitted */\n",
1226
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
1227
+ "}\n",
1228
+ "\n",
1229
+ "#sk-container-id-2 div.sk-toggleable__content pre {\n",
1230
+ " margin: 0.2em;\n",
1231
+ " border-radius: 0.25em;\n",
1232
+ " color: var(--sklearn-color-text);\n",
1233
+ " /* unfitted */\n",
1234
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
1235
+ "}\n",
1236
+ "\n",
1237
+ "#sk-container-id-2 div.sk-toggleable__content.fitted pre {\n",
1238
+ " /* unfitted */\n",
1239
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
1240
+ "}\n",
1241
+ "\n",
1242
+ "#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
1243
+ " /* Expand drop-down */\n",
1244
+ " max-height: 200px;\n",
1245
+ " max-width: 100%;\n",
1246
+ " overflow: auto;\n",
1247
+ "}\n",
1248
+ "\n",
1249
+ "#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
1250
+ " content: \"▾\";\n",
1251
+ "}\n",
1252
+ "\n",
1253
+ "/* Pipeline/ColumnTransformer-specific style */\n",
1254
+ "\n",
1255
+ "#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1256
+ " color: var(--sklearn-color-text);\n",
1257
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
1258
+ "}\n",
1259
+ "\n",
1260
+ "#sk-container-id-2 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1261
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
1262
+ "}\n",
1263
+ "\n",
1264
+ "/* Estimator-specific style */\n",
1265
+ "\n",
1266
+ "/* Colorize estimator box */\n",
1267
+ "#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1268
+ " /* unfitted */\n",
1269
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
1270
+ "}\n",
1271
+ "\n",
1272
+ "#sk-container-id-2 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1273
+ " /* fitted */\n",
1274
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
1275
+ "}\n",
1276
+ "\n",
1277
+ "#sk-container-id-2 div.sk-label label.sk-toggleable__label,\n",
1278
+ "#sk-container-id-2 div.sk-label label {\n",
1279
+ " /* The background is the default theme color */\n",
1280
+ " color: var(--sklearn-color-text-on-default-background);\n",
1281
+ "}\n",
1282
+ "\n",
1283
+ "/* On hover, darken the color of the background */\n",
1284
+ "#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {\n",
1285
+ " color: var(--sklearn-color-text);\n",
1286
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
1287
+ "}\n",
1288
+ "\n",
1289
+ "/* Label box, darken color on hover, fitted */\n",
1290
+ "#sk-container-id-2 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
1291
+ " color: var(--sklearn-color-text);\n",
1292
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
1293
+ "}\n",
1294
+ "\n",
1295
+ "/* Estimator label */\n",
1296
+ "\n",
1297
+ "#sk-container-id-2 div.sk-label label {\n",
1298
+ " font-family: monospace;\n",
1299
+ " font-weight: bold;\n",
1300
+ " display: inline-block;\n",
1301
+ " line-height: 1.2em;\n",
1302
+ "}\n",
1303
+ "\n",
1304
+ "#sk-container-id-2 div.sk-label-container {\n",
1305
+ " text-align: center;\n",
1306
+ "}\n",
1307
+ "\n",
1308
+ "/* Estimator-specific */\n",
1309
+ "#sk-container-id-2 div.sk-estimator {\n",
1310
+ " font-family: monospace;\n",
1311
+ " border: 1px dotted var(--sklearn-color-border-box);\n",
1312
+ " border-radius: 0.25em;\n",
1313
+ " box-sizing: border-box;\n",
1314
+ " margin-bottom: 0.5em;\n",
1315
+ " /* unfitted */\n",
1316
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
1317
+ "}\n",
1318
+ "\n",
1319
+ "#sk-container-id-2 div.sk-estimator.fitted {\n",
1320
+ " /* fitted */\n",
1321
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
1322
+ "}\n",
1323
+ "\n",
1324
+ "/* on hover */\n",
1325
+ "#sk-container-id-2 div.sk-estimator:hover {\n",
1326
+ " /* unfitted */\n",
1327
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
1328
+ "}\n",
1329
+ "\n",
1330
+ "#sk-container-id-2 div.sk-estimator.fitted:hover {\n",
1331
+ " /* fitted */\n",
1332
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
1333
+ "}\n",
1334
+ "\n",
1335
+ "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
1336
+ "\n",
1337
+ "/* Common style for \"i\" and \"?\" */\n",
1338
+ "\n",
1339
+ ".sk-estimator-doc-link,\n",
1340
+ "a:link.sk-estimator-doc-link,\n",
1341
+ "a:visited.sk-estimator-doc-link {\n",
1342
+ " float: right;\n",
1343
+ " font-size: smaller;\n",
1344
+ " line-height: 1em;\n",
1345
+ " font-family: monospace;\n",
1346
+ " background-color: var(--sklearn-color-background);\n",
1347
+ " border-radius: 1em;\n",
1348
+ " height: 1em;\n",
1349
+ " width: 1em;\n",
1350
+ " text-decoration: none !important;\n",
1351
+ " margin-left: 1ex;\n",
1352
+ " /* unfitted */\n",
1353
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
1354
+ " color: var(--sklearn-color-unfitted-level-1);\n",
1355
+ "}\n",
1356
+ "\n",
1357
+ ".sk-estimator-doc-link.fitted,\n",
1358
+ "a:link.sk-estimator-doc-link.fitted,\n",
1359
+ "a:visited.sk-estimator-doc-link.fitted {\n",
1360
+ " /* fitted */\n",
1361
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
1362
+ " color: var(--sklearn-color-fitted-level-1);\n",
1363
+ "}\n",
1364
+ "\n",
1365
+ "/* On hover */\n",
1366
+ "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
1367
+ ".sk-estimator-doc-link:hover,\n",
1368
+ "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
1369
+ ".sk-estimator-doc-link:hover {\n",
1370
+ " /* unfitted */\n",
1371
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
1372
+ " color: var(--sklearn-color-background);\n",
1373
+ " text-decoration: none;\n",
1374
+ "}\n",
1375
+ "\n",
1376
+ "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
1377
+ ".sk-estimator-doc-link.fitted:hover,\n",
1378
+ "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
1379
+ ".sk-estimator-doc-link.fitted:hover {\n",
1380
+ " /* fitted */\n",
1381
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
1382
+ " color: var(--sklearn-color-background);\n",
1383
+ " text-decoration: none;\n",
1384
+ "}\n",
1385
+ "\n",
1386
+ "/* Span, style for the box shown on hovering the info icon */\n",
1387
+ ".sk-estimator-doc-link span {\n",
1388
+ " display: none;\n",
1389
+ " z-index: 9999;\n",
1390
+ " position: relative;\n",
1391
+ " font-weight: normal;\n",
1392
+ " right: .2ex;\n",
1393
+ " padding: .5ex;\n",
1394
+ " margin: .5ex;\n",
1395
+ " width: min-content;\n",
1396
+ " min-width: 20ex;\n",
1397
+ " max-width: 50ex;\n",
1398
+ " color: var(--sklearn-color-text);\n",
1399
+ " box-shadow: 2pt 2pt 4pt #999;\n",
1400
+ " /* unfitted */\n",
1401
+ " background: var(--sklearn-color-unfitted-level-0);\n",
1402
+ " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
1403
+ "}\n",
1404
+ "\n",
1405
+ ".sk-estimator-doc-link.fitted span {\n",
1406
+ " /* fitted */\n",
1407
+ " background: var(--sklearn-color-fitted-level-0);\n",
1408
+ " border: var(--sklearn-color-fitted-level-3);\n",
1409
+ "}\n",
1410
+ "\n",
1411
+ ".sk-estimator-doc-link:hover span {\n",
1412
+ " display: block;\n",
1413
+ "}\n",
1414
+ "\n",
1415
+ "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
1416
+ "\n",
1417
+ "#sk-container-id-2 a.estimator_doc_link {\n",
1418
+ " float: right;\n",
1419
+ " font-size: 1rem;\n",
1420
+ " line-height: 1em;\n",
1421
+ " font-family: monospace;\n",
1422
+ " background-color: var(--sklearn-color-background);\n",
1423
+ " border-radius: 1rem;\n",
1424
+ " height: 1rem;\n",
1425
+ " width: 1rem;\n",
1426
+ " text-decoration: none;\n",
1427
+ " /* unfitted */\n",
1428
+ " color: var(--sklearn-color-unfitted-level-1);\n",
1429
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
1430
+ "}\n",
1431
+ "\n",
1432
+ "#sk-container-id-2 a.estimator_doc_link.fitted {\n",
1433
+ " /* fitted */\n",
1434
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
1435
+ " color: var(--sklearn-color-fitted-level-1);\n",
1436
+ "}\n",
1437
+ "\n",
1438
+ "/* On hover */\n",
1439
+ "#sk-container-id-2 a.estimator_doc_link:hover {\n",
1440
+ " /* unfitted */\n",
1441
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
1442
+ " color: var(--sklearn-color-background);\n",
1443
+ " text-decoration: none;\n",
1444
+ "}\n",
1445
+ "\n",
1446
+ "#sk-container-id-2 a.estimator_doc_link.fitted:hover {\n",
1447
+ " /* fitted */\n",
1448
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
1449
+ "}\n",
1450
+ "</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LogisticRegression()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" checked><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;LogisticRegression<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>LogisticRegression()</pre></div> </div></div></div></div>"
1451
+ ],
1452
+ "text/plain": [
1453
+ "LogisticRegression()"
1454
+ ]
1455
+ },
1456
+ "execution_count": 19,
1457
+ "metadata": {},
1458
+ "output_type": "execute_result"
1459
+ }
1460
+ ],
1461
+ "source": [
1462
+ "model = LogisticRegression()\n",
1463
+ "model.fit(X_train_tfidf, y_train)"
1464
+ ]
1465
+ },
1466
+ {
1467
+ "cell_type": "code",
1468
+ "execution_count": 20,
1469
+ "id": "6f49fddb",
1470
+ "metadata": {},
1471
+ "outputs": [
1472
+ {
1473
+ "name": "stdout",
1474
+ "output_type": "stream",
1475
+ "text": [
1476
+ "Accuracy of Logistic Regression Model: 0.9308556611927399\n",
1477
+ " precision recall f1-score support\n",
1478
+ "\n",
1479
+ " FALSE 0.92 0.99 0.95 847\n",
1480
+ " TRUE 0.98 0.76 0.86 310\n",
1481
+ "\n",
1482
+ " accuracy 0.93 1157\n",
1483
+ " macro avg 0.95 0.88 0.90 1157\n",
1484
+ "weighted avg 0.93 0.93 0.93 1157\n",
1485
+ "\n"
1486
+ ]
1487
+ }
1488
+ ],
1489
+ "source": [
1490
+ "y_pred = model.predict(X_test_tfidf)\n",
1491
+ "\n",
1492
+ "accuracy = accuracy_score(y_test, y_pred)\n",
1493
+ "print(\"Accuracy of Logistic Regression Model:\", accuracy)\n",
1494
+ "print(classification_report(y_test, y_pred))"
1495
+ ]
1496
+ },
1497
+ {
1498
+ "cell_type": "markdown",
1499
+ "id": "613c0cdf",
1500
+ "metadata": {},
1501
+ "source": [
1502
+ "## Support Vector Machine (SVM) model"
1503
+ ]
1504
+ },
1505
+ {
1506
+ "cell_type": "code",
1507
+ "execution_count": 21,
1508
+ "id": "706302c1",
1509
+ "metadata": {},
1510
+ "outputs": [],
1511
+ "source": [
1512
+ "from sklearn.model_selection import train_test_split\n",
1513
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
1514
+ "from sklearn.svm import SVC\n",
1515
+ "from sklearn.metrics import accuracy_score"
1516
+ ]
1517
+ },
1518
+ {
1519
+ "cell_type": "code",
1520
+ "execution_count": 22,
1521
+ "id": "b0988ca4",
1522
+ "metadata": {},
1523
+ "outputs": [],
1524
+ "source": [
1525
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
1526
+ " X, y, test_size=0.2, random_state=42\n",
1527
+ ")"
1528
+ ]
1529
+ },
1530
+ {
1531
+ "cell_type": "code",
1532
+ "execution_count": 23,
1533
+ "id": "4f682c60",
1534
+ "metadata": {},
1535
+ "outputs": [],
1536
+ "source": [
1537
+ "tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
1538
+ "X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
1539
+ "X_test_tfidf = tfidf_vectorizer.transform(X_test)"
1540
+ ]
1541
+ },
1542
+ {
1543
+ "cell_type": "code",
1544
+ "execution_count": 24,
1545
+ "id": "71ae91d9",
1546
+ "metadata": {},
1547
+ "outputs": [
1548
+ {
1549
+ "data": {
1550
+ "text/html": [
1551
+ "<style>#sk-container-id-3 {\n",
1552
+ " /* Definition of color scheme common for light and dark mode */\n",
1553
+ " --sklearn-color-text: black;\n",
1554
+ " --sklearn-color-line: gray;\n",
1555
+ " /* Definition of color scheme for unfitted estimators */\n",
1556
+ " --sklearn-color-unfitted-level-0: #fff5e6;\n",
1557
+ " --sklearn-color-unfitted-level-1: #f6e4d2;\n",
1558
+ " --sklearn-color-unfitted-level-2: #ffe0b3;\n",
1559
+ " --sklearn-color-unfitted-level-3: chocolate;\n",
1560
+ " /* Definition of color scheme for fitted estimators */\n",
1561
+ " --sklearn-color-fitted-level-0: #f0f8ff;\n",
1562
+ " --sklearn-color-fitted-level-1: #d4ebff;\n",
1563
+ " --sklearn-color-fitted-level-2: #b3dbfd;\n",
1564
+ " --sklearn-color-fitted-level-3: cornflowerblue;\n",
1565
+ "\n",
1566
+ " /* Specific color for light theme */\n",
1567
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
1568
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
1569
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
1570
+ " --sklearn-color-icon: #696969;\n",
1571
+ "\n",
1572
+ " @media (prefers-color-scheme: dark) {\n",
1573
+ " /* Redefinition of color scheme for dark theme */\n",
1574
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
1575
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
1576
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
1577
+ " --sklearn-color-icon: #878787;\n",
1578
+ " }\n",
1579
+ "}\n",
1580
+ "\n",
1581
+ "#sk-container-id-3 {\n",
1582
+ " color: var(--sklearn-color-text);\n",
1583
+ "}\n",
1584
+ "\n",
1585
+ "#sk-container-id-3 pre {\n",
1586
+ " padding: 0;\n",
1587
+ "}\n",
1588
+ "\n",
1589
+ "#sk-container-id-3 input.sk-hidden--visually {\n",
1590
+ " border: 0;\n",
1591
+ " clip: rect(1px 1px 1px 1px);\n",
1592
+ " clip: rect(1px, 1px, 1px, 1px);\n",
1593
+ " height: 1px;\n",
1594
+ " margin: -1px;\n",
1595
+ " overflow: hidden;\n",
1596
+ " padding: 0;\n",
1597
+ " position: absolute;\n",
1598
+ " width: 1px;\n",
1599
+ "}\n",
1600
+ "\n",
1601
+ "#sk-container-id-3 div.sk-dashed-wrapped {\n",
1602
+ " border: 1px dashed var(--sklearn-color-line);\n",
1603
+ " margin: 0 0.4em 0.5em 0.4em;\n",
1604
+ " box-sizing: border-box;\n",
1605
+ " padding-bottom: 0.4em;\n",
1606
+ " background-color: var(--sklearn-color-background);\n",
1607
+ "}\n",
1608
+ "\n",
1609
+ "#sk-container-id-3 div.sk-container {\n",
1610
+ " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
1611
+ " but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
1612
+ " so we also need the `!important` here to be able to override the\n",
1613
+ " default hidden behavior on the sphinx rendered scikit-learn.org.\n",
1614
+ " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
1615
+ " display: inline-block !important;\n",
1616
+ " position: relative;\n",
1617
+ "}\n",
1618
+ "\n",
1619
+ "#sk-container-id-3 div.sk-text-repr-fallback {\n",
1620
+ " display: none;\n",
1621
+ "}\n",
1622
+ "\n",
1623
+ "div.sk-parallel-item,\n",
1624
+ "div.sk-serial,\n",
1625
+ "div.sk-item {\n",
1626
+ " /* draw centered vertical line to link estimators */\n",
1627
+ " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
1628
+ " background-size: 2px 100%;\n",
1629
+ " background-repeat: no-repeat;\n",
1630
+ " background-position: center center;\n",
1631
+ "}\n",
1632
+ "\n",
1633
+ "/* Parallel-specific style estimator block */\n",
1634
+ "\n",
1635
+ "#sk-container-id-3 div.sk-parallel-item::after {\n",
1636
+ " content: \"\";\n",
1637
+ " width: 100%;\n",
1638
+ " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
1639
+ " flex-grow: 1;\n",
1640
+ "}\n",
1641
+ "\n",
1642
+ "#sk-container-id-3 div.sk-parallel {\n",
1643
+ " display: flex;\n",
1644
+ " align-items: stretch;\n",
1645
+ " justify-content: center;\n",
1646
+ " background-color: var(--sklearn-color-background);\n",
1647
+ " position: relative;\n",
1648
+ "}\n",
1649
+ "\n",
1650
+ "#sk-container-id-3 div.sk-parallel-item {\n",
1651
+ " display: flex;\n",
1652
+ " flex-direction: column;\n",
1653
+ "}\n",
1654
+ "\n",
1655
+ "#sk-container-id-3 div.sk-parallel-item:first-child::after {\n",
1656
+ " align-self: flex-end;\n",
1657
+ " width: 50%;\n",
1658
+ "}\n",
1659
+ "\n",
1660
+ "#sk-container-id-3 div.sk-parallel-item:last-child::after {\n",
1661
+ " align-self: flex-start;\n",
1662
+ " width: 50%;\n",
1663
+ "}\n",
1664
+ "\n",
1665
+ "#sk-container-id-3 div.sk-parallel-item:only-child::after {\n",
1666
+ " width: 0;\n",
1667
+ "}\n",
1668
+ "\n",
1669
+ "/* Serial-specific style estimator block */\n",
1670
+ "\n",
1671
+ "#sk-container-id-3 div.sk-serial {\n",
1672
+ " display: flex;\n",
1673
+ " flex-direction: column;\n",
1674
+ " align-items: center;\n",
1675
+ " background-color: var(--sklearn-color-background);\n",
1676
+ " padding-right: 1em;\n",
1677
+ " padding-left: 1em;\n",
1678
+ "}\n",
1679
+ "\n",
1680
+ "\n",
1681
+ "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
1682
+ "clickable and can be expanded/collapsed.\n",
1683
+ "- Pipeline and ColumnTransformer use this feature and define the default style\n",
1684
+ "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
1685
+ "*/\n",
1686
+ "\n",
1687
+ "/* Pipeline and ColumnTransformer style (default) */\n",
1688
+ "\n",
1689
+ "#sk-container-id-3 div.sk-toggleable {\n",
1690
+ " /* Default theme specific background. It is overwritten whether we have a\n",
1691
+ " specific estimator or a Pipeline/ColumnTransformer */\n",
1692
+ " background-color: var(--sklearn-color-background);\n",
1693
+ "}\n",
1694
+ "\n",
1695
+ "/* Toggleable label */\n",
1696
+ "#sk-container-id-3 label.sk-toggleable__label {\n",
1697
+ " cursor: pointer;\n",
1698
+ " display: block;\n",
1699
+ " width: 100%;\n",
1700
+ " margin-bottom: 0;\n",
1701
+ " padding: 0.5em;\n",
1702
+ " box-sizing: border-box;\n",
1703
+ " text-align: center;\n",
1704
+ "}\n",
1705
+ "\n",
1706
+ "#sk-container-id-3 label.sk-toggleable__label-arrow:before {\n",
1707
+ " /* Arrow on the left of the label */\n",
1708
+ " content: \"▸\";\n",
1709
+ " float: left;\n",
1710
+ " margin-right: 0.25em;\n",
1711
+ " color: var(--sklearn-color-icon);\n",
1712
+ "}\n",
1713
+ "\n",
1714
+ "#sk-container-id-3 label.sk-toggleable__label-arrow:hover:before {\n",
1715
+ " color: var(--sklearn-color-text);\n",
1716
+ "}\n",
1717
+ "\n",
1718
+ "/* Toggleable content - dropdown */\n",
1719
+ "\n",
1720
+ "#sk-container-id-3 div.sk-toggleable__content {\n",
1721
+ " max-height: 0;\n",
1722
+ " max-width: 0;\n",
1723
+ " overflow: hidden;\n",
1724
+ " text-align: left;\n",
1725
+ " /* unfitted */\n",
1726
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
1727
+ "}\n",
1728
+ "\n",
1729
+ "#sk-container-id-3 div.sk-toggleable__content.fitted {\n",
1730
+ " /* fitted */\n",
1731
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
1732
+ "}\n",
1733
+ "\n",
1734
+ "#sk-container-id-3 div.sk-toggleable__content pre {\n",
1735
+ " margin: 0.2em;\n",
1736
+ " border-radius: 0.25em;\n",
1737
+ " color: var(--sklearn-color-text);\n",
1738
+ " /* unfitted */\n",
1739
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
1740
+ "}\n",
1741
+ "\n",
1742
+ "#sk-container-id-3 div.sk-toggleable__content.fitted pre {\n",
1743
+ " /* unfitted */\n",
1744
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
1745
+ "}\n",
1746
+ "\n",
1747
+ "#sk-container-id-3 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
1748
+ " /* Expand drop-down */\n",
1749
+ " max-height: 200px;\n",
1750
+ " max-width: 100%;\n",
1751
+ " overflow: auto;\n",
1752
+ "}\n",
1753
+ "\n",
1754
+ "#sk-container-id-3 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
1755
+ " content: \"▾\";\n",
1756
+ "}\n",
1757
+ "\n",
1758
+ "/* Pipeline/ColumnTransformer-specific style */\n",
1759
+ "\n",
1760
+ "#sk-container-id-3 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1761
+ " color: var(--sklearn-color-text);\n",
1762
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
1763
+ "}\n",
1764
+ "\n",
1765
+ "#sk-container-id-3 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1766
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
1767
+ "}\n",
1768
+ "\n",
1769
+ "/* Estimator-specific style */\n",
1770
+ "\n",
1771
+ "/* Colorize estimator box */\n",
1772
+ "#sk-container-id-3 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1773
+ " /* unfitted */\n",
1774
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
1775
+ "}\n",
1776
+ "\n",
1777
+ "#sk-container-id-3 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1778
+ " /* fitted */\n",
1779
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
1780
+ "}\n",
1781
+ "\n",
1782
+ "#sk-container-id-3 div.sk-label label.sk-toggleable__label,\n",
1783
+ "#sk-container-id-3 div.sk-label label {\n",
1784
+ " /* The background is the default theme color */\n",
1785
+ " color: var(--sklearn-color-text-on-default-background);\n",
1786
+ "}\n",
1787
+ "\n",
1788
+ "/* On hover, darken the color of the background */\n",
1789
+ "#sk-container-id-3 div.sk-label:hover label.sk-toggleable__label {\n",
1790
+ " color: var(--sklearn-color-text);\n",
1791
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
1792
+ "}\n",
1793
+ "\n",
1794
+ "/* Label box, darken color on hover, fitted */\n",
1795
+ "#sk-container-id-3 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
1796
+ " color: var(--sklearn-color-text);\n",
1797
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
1798
+ "}\n",
1799
+ "\n",
1800
+ "/* Estimator label */\n",
1801
+ "\n",
1802
+ "#sk-container-id-3 div.sk-label label {\n",
1803
+ " font-family: monospace;\n",
1804
+ " font-weight: bold;\n",
1805
+ " display: inline-block;\n",
1806
+ " line-height: 1.2em;\n",
1807
+ "}\n",
1808
+ "\n",
1809
+ "#sk-container-id-3 div.sk-label-container {\n",
1810
+ " text-align: center;\n",
1811
+ "}\n",
1812
+ "\n",
1813
+ "/* Estimator-specific */\n",
1814
+ "#sk-container-id-3 div.sk-estimator {\n",
1815
+ " font-family: monospace;\n",
1816
+ " border: 1px dotted var(--sklearn-color-border-box);\n",
1817
+ " border-radius: 0.25em;\n",
1818
+ " box-sizing: border-box;\n",
1819
+ " margin-bottom: 0.5em;\n",
1820
+ " /* unfitted */\n",
1821
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
1822
+ "}\n",
1823
+ "\n",
1824
+ "#sk-container-id-3 div.sk-estimator.fitted {\n",
1825
+ " /* fitted */\n",
1826
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
1827
+ "}\n",
1828
+ "\n",
1829
+ "/* on hover */\n",
1830
+ "#sk-container-id-3 div.sk-estimator:hover {\n",
1831
+ " /* unfitted */\n",
1832
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
1833
+ "}\n",
1834
+ "\n",
1835
+ "#sk-container-id-3 div.sk-estimator.fitted:hover {\n",
1836
+ " /* fitted */\n",
1837
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
1838
+ "}\n",
1839
+ "\n",
1840
+ "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
1841
+ "\n",
1842
+ "/* Common style for \"i\" and \"?\" */\n",
1843
+ "\n",
1844
+ ".sk-estimator-doc-link,\n",
1845
+ "a:link.sk-estimator-doc-link,\n",
1846
+ "a:visited.sk-estimator-doc-link {\n",
1847
+ " float: right;\n",
1848
+ " font-size: smaller;\n",
1849
+ " line-height: 1em;\n",
1850
+ " font-family: monospace;\n",
1851
+ " background-color: var(--sklearn-color-background);\n",
1852
+ " border-radius: 1em;\n",
1853
+ " height: 1em;\n",
1854
+ " width: 1em;\n",
1855
+ " text-decoration: none !important;\n",
1856
+ " margin-left: 1ex;\n",
1857
+ " /* unfitted */\n",
1858
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
1859
+ " color: var(--sklearn-color-unfitted-level-1);\n",
1860
+ "}\n",
1861
+ "\n",
1862
+ ".sk-estimator-doc-link.fitted,\n",
1863
+ "a:link.sk-estimator-doc-link.fitted,\n",
1864
+ "a:visited.sk-estimator-doc-link.fitted {\n",
1865
+ " /* fitted */\n",
1866
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
1867
+ " color: var(--sklearn-color-fitted-level-1);\n",
1868
+ "}\n",
1869
+ "\n",
1870
+ "/* On hover */\n",
1871
+ "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
1872
+ ".sk-estimator-doc-link:hover,\n",
1873
+ "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
1874
+ ".sk-estimator-doc-link:hover {\n",
1875
+ " /* unfitted */\n",
1876
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
1877
+ " color: var(--sklearn-color-background);\n",
1878
+ " text-decoration: none;\n",
1879
+ "}\n",
1880
+ "\n",
1881
+ "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
1882
+ ".sk-estimator-doc-link.fitted:hover,\n",
1883
+ "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
1884
+ ".sk-estimator-doc-link.fitted:hover {\n",
1885
+ " /* fitted */\n",
1886
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
1887
+ " color: var(--sklearn-color-background);\n",
1888
+ " text-decoration: none;\n",
1889
+ "}\n",
1890
+ "\n",
1891
+ "/* Span, style for the box shown on hovering the info icon */\n",
1892
+ ".sk-estimator-doc-link span {\n",
1893
+ " display: none;\n",
1894
+ " z-index: 9999;\n",
1895
+ " position: relative;\n",
1896
+ " font-weight: normal;\n",
1897
+ " right: .2ex;\n",
1898
+ " padding: .5ex;\n",
1899
+ " margin: .5ex;\n",
1900
+ " width: min-content;\n",
1901
+ " min-width: 20ex;\n",
1902
+ " max-width: 50ex;\n",
1903
+ " color: var(--sklearn-color-text);\n",
1904
+ " box-shadow: 2pt 2pt 4pt #999;\n",
1905
+ " /* unfitted */\n",
1906
+ " background: var(--sklearn-color-unfitted-level-0);\n",
1907
+ " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
1908
+ "}\n",
1909
+ "\n",
1910
+ ".sk-estimator-doc-link.fitted span {\n",
1911
+ " /* fitted */\n",
1912
+ " background: var(--sklearn-color-fitted-level-0);\n",
1913
+ " border: var(--sklearn-color-fitted-level-3);\n",
1914
+ "}\n",
1915
+ "\n",
1916
+ ".sk-estimator-doc-link:hover span {\n",
1917
+ " display: block;\n",
1918
+ "}\n",
1919
+ "\n",
1920
+ "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
1921
+ "\n",
1922
+ "#sk-container-id-3 a.estimator_doc_link {\n",
1923
+ " float: right;\n",
1924
+ " font-size: 1rem;\n",
1925
+ " line-height: 1em;\n",
1926
+ " font-family: monospace;\n",
1927
+ " background-color: var(--sklearn-color-background);\n",
1928
+ " border-radius: 1rem;\n",
1929
+ " height: 1rem;\n",
1930
+ " width: 1rem;\n",
1931
+ " text-decoration: none;\n",
1932
+ " /* unfitted */\n",
1933
+ " color: var(--sklearn-color-unfitted-level-1);\n",
1934
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
1935
+ "}\n",
1936
+ "\n",
1937
+ "#sk-container-id-3 a.estimator_doc_link.fitted {\n",
1938
+ " /* fitted */\n",
1939
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
1940
+ " color: var(--sklearn-color-fitted-level-1);\n",
1941
+ "}\n",
1942
+ "\n",
1943
+ "/* On hover */\n",
1944
+ "#sk-container-id-3 a.estimator_doc_link:hover {\n",
1945
+ " /* unfitted */\n",
1946
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
1947
+ " color: var(--sklearn-color-background);\n",
1948
+ " text-decoration: none;\n",
1949
+ "}\n",
1950
+ "\n",
1951
+ "#sk-container-id-3 a.estimator_doc_link.fitted:hover {\n",
1952
+ " /* fitted */\n",
1953
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
1954
+ "}\n",
1955
+ "</style><div id=\"sk-container-id-3\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>SVC(kernel=&#x27;linear&#x27;)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" checked><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;SVC<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.svm.SVC.html\">?<span>Documentation for SVC</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>SVC(kernel=&#x27;linear&#x27;)</pre></div> </div></div></div></div>"
1956
+ ],
1957
+ "text/plain": [
1958
+ "SVC(kernel='linear')"
1959
+ ]
1960
+ },
1961
+ "execution_count": 24,
1962
+ "metadata": {},
1963
+ "output_type": "execute_result"
1964
+ }
1965
+ ],
1966
+ "source": [
1967
+ "svm_model = SVC(kernel=\"linear\")\n",
1968
+ "svm_model.fit(X_train_tfidf, y_train)"
1969
+ ]
1970
+ },
1971
+ {
1972
+ "cell_type": "code",
1973
+ "execution_count": 25,
1974
+ "id": "2dc1b193",
1975
+ "metadata": {},
1976
+ "outputs": [],
1977
+ "source": [
1978
+ "y_pred = svm_model.predict(X_test_tfidf)"
1979
+ ]
1980
+ },
1981
+ {
1982
+ "cell_type": "code",
1983
+ "execution_count": 26,
1984
+ "id": "92801e61",
1985
+ "metadata": {},
1986
+ "outputs": [
1987
+ {
1988
+ "name": "stdout",
1989
+ "output_type": "stream",
1990
+ "text": [
1991
+ "Accuracy of SVM model: 0.9524632670700086\n",
1992
+ " precision recall f1-score support\n",
1993
+ "\n",
1994
+ " FALSE 0.94 1.00 0.97 847\n",
1995
+ " TRUE 1.00 0.83 0.90 310\n",
1996
+ "\n",
1997
+ " accuracy 0.95 1157\n",
1998
+ " macro avg 0.97 0.91 0.94 1157\n",
1999
+ "weighted avg 0.96 0.95 0.95 1157\n",
2000
+ "\n"
2001
+ ]
2002
+ }
2003
+ ],
2004
+ "source": [
2005
+ "accuracy = accuracy_score(y_test, y_pred)\n",
2006
+ "print(\"Accuracy of SVM model:\", accuracy)\n",
2007
+ "print(classification_report(y_test, y_pred))"
2008
+ ]
2009
+ },
2010
+ {
2011
+ "cell_type": "markdown",
2012
+ "id": "1d1f6ebd",
2013
+ "metadata": {},
2014
+ "source": [
2015
+ "## Random Forest Model"
2016
+ ]
2017
+ },
2018
+ {
2019
+ "cell_type": "code",
2020
+ "execution_count": 27,
2021
+ "id": "9170c174",
2022
+ "metadata": {},
2023
+ "outputs": [],
2024
+ "source": [
2025
+ "from sklearn.model_selection import train_test_split\n",
2026
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
2027
+ "from sklearn.ensemble import RandomForestClassifier\n",
2028
+ "from sklearn.metrics import accuracy_score"
2029
+ ]
2030
+ },
2031
+ {
2032
+ "cell_type": "code",
2033
+ "execution_count": 28,
2034
+ "id": "2092ca05",
2035
+ "metadata": {},
2036
+ "outputs": [],
2037
+ "source": [
2038
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
2039
+ " X, y, test_size=0.2, random_state=42\n",
2040
+ ")"
2041
+ ]
2042
+ },
2043
+ {
2044
+ "cell_type": "code",
2045
+ "execution_count": 29,
2046
+ "id": "206296ce",
2047
+ "metadata": {},
2048
+ "outputs": [],
2049
+ "source": [
2050
+ "tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
2051
+ "X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
2052
+ "X_test_tfidf = tfidf_vectorizer.transform(X_test)"
2053
+ ]
2054
+ },
2055
+ {
2056
+ "cell_type": "code",
2057
+ "execution_count": 30,
2058
+ "id": "258bd78f",
2059
+ "metadata": {},
2060
+ "outputs": [
2061
+ {
2062
+ "data": {
2063
+ "text/html": [
2064
+ "<style>#sk-container-id-4 {\n",
2065
+ " /* Definition of color scheme common for light and dark mode */\n",
2066
+ " --sklearn-color-text: black;\n",
2067
+ " --sklearn-color-line: gray;\n",
2068
+ " /* Definition of color scheme for unfitted estimators */\n",
2069
+ " --sklearn-color-unfitted-level-0: #fff5e6;\n",
2070
+ " --sklearn-color-unfitted-level-1: #f6e4d2;\n",
2071
+ " --sklearn-color-unfitted-level-2: #ffe0b3;\n",
2072
+ " --sklearn-color-unfitted-level-3: chocolate;\n",
2073
+ " /* Definition of color scheme for fitted estimators */\n",
2074
+ " --sklearn-color-fitted-level-0: #f0f8ff;\n",
2075
+ " --sklearn-color-fitted-level-1: #d4ebff;\n",
2076
+ " --sklearn-color-fitted-level-2: #b3dbfd;\n",
2077
+ " --sklearn-color-fitted-level-3: cornflowerblue;\n",
2078
+ "\n",
2079
+ " /* Specific color for light theme */\n",
2080
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
2081
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
2082
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
2083
+ " --sklearn-color-icon: #696969;\n",
2084
+ "\n",
2085
+ " @media (prefers-color-scheme: dark) {\n",
2086
+ " /* Redefinition of color scheme for dark theme */\n",
2087
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
2088
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
2089
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
2090
+ " --sklearn-color-icon: #878787;\n",
2091
+ " }\n",
2092
+ "}\n",
2093
+ "\n",
2094
+ "#sk-container-id-4 {\n",
2095
+ " color: var(--sklearn-color-text);\n",
2096
+ "}\n",
2097
+ "\n",
2098
+ "#sk-container-id-4 pre {\n",
2099
+ " padding: 0;\n",
2100
+ "}\n",
2101
+ "\n",
2102
+ "#sk-container-id-4 input.sk-hidden--visually {\n",
2103
+ " border: 0;\n",
2104
+ " clip: rect(1px 1px 1px 1px);\n",
2105
+ " clip: rect(1px, 1px, 1px, 1px);\n",
2106
+ " height: 1px;\n",
2107
+ " margin: -1px;\n",
2108
+ " overflow: hidden;\n",
2109
+ " padding: 0;\n",
2110
+ " position: absolute;\n",
2111
+ " width: 1px;\n",
2112
+ "}\n",
2113
+ "\n",
2114
+ "#sk-container-id-4 div.sk-dashed-wrapped {\n",
2115
+ " border: 1px dashed var(--sklearn-color-line);\n",
2116
+ " margin: 0 0.4em 0.5em 0.4em;\n",
2117
+ " box-sizing: border-box;\n",
2118
+ " padding-bottom: 0.4em;\n",
2119
+ " background-color: var(--sklearn-color-background);\n",
2120
+ "}\n",
2121
+ "\n",
2122
+ "#sk-container-id-4 div.sk-container {\n",
2123
+ " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
2124
+ " but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
2125
+ " so we also need the `!important` here to be able to override the\n",
2126
+ " default hidden behavior on the sphinx rendered scikit-learn.org.\n",
2127
+ " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
2128
+ " display: inline-block !important;\n",
2129
+ " position: relative;\n",
2130
+ "}\n",
2131
+ "\n",
2132
+ "#sk-container-id-4 div.sk-text-repr-fallback {\n",
2133
+ " display: none;\n",
2134
+ "}\n",
2135
+ "\n",
2136
+ "div.sk-parallel-item,\n",
2137
+ "div.sk-serial,\n",
2138
+ "div.sk-item {\n",
2139
+ " /* draw centered vertical line to link estimators */\n",
2140
+ " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
2141
+ " background-size: 2px 100%;\n",
2142
+ " background-repeat: no-repeat;\n",
2143
+ " background-position: center center;\n",
2144
+ "}\n",
2145
+ "\n",
2146
+ "/* Parallel-specific style estimator block */\n",
2147
+ "\n",
2148
+ "#sk-container-id-4 div.sk-parallel-item::after {\n",
2149
+ " content: \"\";\n",
2150
+ " width: 100%;\n",
2151
+ " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
2152
+ " flex-grow: 1;\n",
2153
+ "}\n",
2154
+ "\n",
2155
+ "#sk-container-id-4 div.sk-parallel {\n",
2156
+ " display: flex;\n",
2157
+ " align-items: stretch;\n",
2158
+ " justify-content: center;\n",
2159
+ " background-color: var(--sklearn-color-background);\n",
2160
+ " position: relative;\n",
2161
+ "}\n",
2162
+ "\n",
2163
+ "#sk-container-id-4 div.sk-parallel-item {\n",
2164
+ " display: flex;\n",
2165
+ " flex-direction: column;\n",
2166
+ "}\n",
2167
+ "\n",
2168
+ "#sk-container-id-4 div.sk-parallel-item:first-child::after {\n",
2169
+ " align-self: flex-end;\n",
2170
+ " width: 50%;\n",
2171
+ "}\n",
2172
+ "\n",
2173
+ "#sk-container-id-4 div.sk-parallel-item:last-child::after {\n",
2174
+ " align-self: flex-start;\n",
2175
+ " width: 50%;\n",
2176
+ "}\n",
2177
+ "\n",
2178
+ "#sk-container-id-4 div.sk-parallel-item:only-child::after {\n",
2179
+ " width: 0;\n",
2180
+ "}\n",
2181
+ "\n",
2182
+ "/* Serial-specific style estimator block */\n",
2183
+ "\n",
2184
+ "#sk-container-id-4 div.sk-serial {\n",
2185
+ " display: flex;\n",
2186
+ " flex-direction: column;\n",
2187
+ " align-items: center;\n",
2188
+ " background-color: var(--sklearn-color-background);\n",
2189
+ " padding-right: 1em;\n",
2190
+ " padding-left: 1em;\n",
2191
+ "}\n",
2192
+ "\n",
2193
+ "\n",
2194
+ "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
2195
+ "clickable and can be expanded/collapsed.\n",
2196
+ "- Pipeline and ColumnTransformer use this feature and define the default style\n",
2197
+ "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
2198
+ "*/\n",
2199
+ "\n",
2200
+ "/* Pipeline and ColumnTransformer style (default) */\n",
2201
+ "\n",
2202
+ "#sk-container-id-4 div.sk-toggleable {\n",
2203
+ " /* Default theme specific background. It is overwritten whether we have a\n",
2204
+ " specific estimator or a Pipeline/ColumnTransformer */\n",
2205
+ " background-color: var(--sklearn-color-background);\n",
2206
+ "}\n",
2207
+ "\n",
2208
+ "/* Toggleable label */\n",
2209
+ "#sk-container-id-4 label.sk-toggleable__label {\n",
2210
+ " cursor: pointer;\n",
2211
+ " display: block;\n",
2212
+ " width: 100%;\n",
2213
+ " margin-bottom: 0;\n",
2214
+ " padding: 0.5em;\n",
2215
+ " box-sizing: border-box;\n",
2216
+ " text-align: center;\n",
2217
+ "}\n",
2218
+ "\n",
2219
+ "#sk-container-id-4 label.sk-toggleable__label-arrow:before {\n",
2220
+ " /* Arrow on the left of the label */\n",
2221
+ " content: \"▸\";\n",
2222
+ " float: left;\n",
2223
+ " margin-right: 0.25em;\n",
2224
+ " color: var(--sklearn-color-icon);\n",
2225
+ "}\n",
2226
+ "\n",
2227
+ "#sk-container-id-4 label.sk-toggleable__label-arrow:hover:before {\n",
2228
+ " color: var(--sklearn-color-text);\n",
2229
+ "}\n",
2230
+ "\n",
2231
+ "/* Toggleable content - dropdown */\n",
2232
+ "\n",
2233
+ "#sk-container-id-4 div.sk-toggleable__content {\n",
2234
+ " max-height: 0;\n",
2235
+ " max-width: 0;\n",
2236
+ " overflow: hidden;\n",
2237
+ " text-align: left;\n",
2238
+ " /* unfitted */\n",
2239
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
2240
+ "}\n",
2241
+ "\n",
2242
+ "#sk-container-id-4 div.sk-toggleable__content.fitted {\n",
2243
+ " /* fitted */\n",
2244
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
2245
+ "}\n",
2246
+ "\n",
2247
+ "#sk-container-id-4 div.sk-toggleable__content pre {\n",
2248
+ " margin: 0.2em;\n",
2249
+ " border-radius: 0.25em;\n",
2250
+ " color: var(--sklearn-color-text);\n",
2251
+ " /* unfitted */\n",
2252
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
2253
+ "}\n",
2254
+ "\n",
2255
+ "#sk-container-id-4 div.sk-toggleable__content.fitted pre {\n",
2256
+ " /* unfitted */\n",
2257
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
2258
+ "}\n",
2259
+ "\n",
2260
+ "#sk-container-id-4 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
2261
+ " /* Expand drop-down */\n",
2262
+ " max-height: 200px;\n",
2263
+ " max-width: 100%;\n",
2264
+ " overflow: auto;\n",
2265
+ "}\n",
2266
+ "\n",
2267
+ "#sk-container-id-4 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
2268
+ " content: \"▾\";\n",
2269
+ "}\n",
2270
+ "\n",
2271
+ "/* Pipeline/ColumnTransformer-specific style */\n",
2272
+ "\n",
2273
+ "#sk-container-id-4 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2274
+ " color: var(--sklearn-color-text);\n",
2275
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
2276
+ "}\n",
2277
+ "\n",
2278
+ "#sk-container-id-4 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2279
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
2280
+ "}\n",
2281
+ "\n",
2282
+ "/* Estimator-specific style */\n",
2283
+ "\n",
2284
+ "/* Colorize estimator box */\n",
2285
+ "#sk-container-id-4 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2286
+ " /* unfitted */\n",
2287
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
2288
+ "}\n",
2289
+ "\n",
2290
+ "#sk-container-id-4 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2291
+ " /* fitted */\n",
2292
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
2293
+ "}\n",
2294
+ "\n",
2295
+ "#sk-container-id-4 div.sk-label label.sk-toggleable__label,\n",
2296
+ "#sk-container-id-4 div.sk-label label {\n",
2297
+ " /* The background is the default theme color */\n",
2298
+ " color: var(--sklearn-color-text-on-default-background);\n",
2299
+ "}\n",
2300
+ "\n",
2301
+ "/* On hover, darken the color of the background */\n",
2302
+ "#sk-container-id-4 div.sk-label:hover label.sk-toggleable__label {\n",
2303
+ " color: var(--sklearn-color-text);\n",
2304
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
2305
+ "}\n",
2306
+ "\n",
2307
+ "/* Label box, darken color on hover, fitted */\n",
2308
+ "#sk-container-id-4 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
2309
+ " color: var(--sklearn-color-text);\n",
2310
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
2311
+ "}\n",
2312
+ "\n",
2313
+ "/* Estimator label */\n",
2314
+ "\n",
2315
+ "#sk-container-id-4 div.sk-label label {\n",
2316
+ " font-family: monospace;\n",
2317
+ " font-weight: bold;\n",
2318
+ " display: inline-block;\n",
2319
+ " line-height: 1.2em;\n",
2320
+ "}\n",
2321
+ "\n",
2322
+ "#sk-container-id-4 div.sk-label-container {\n",
2323
+ " text-align: center;\n",
2324
+ "}\n",
2325
+ "\n",
2326
+ "/* Estimator-specific */\n",
2327
+ "#sk-container-id-4 div.sk-estimator {\n",
2328
+ " font-family: monospace;\n",
2329
+ " border: 1px dotted var(--sklearn-color-border-box);\n",
2330
+ " border-radius: 0.25em;\n",
2331
+ " box-sizing: border-box;\n",
2332
+ " margin-bottom: 0.5em;\n",
2333
+ " /* unfitted */\n",
2334
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
2335
+ "}\n",
2336
+ "\n",
2337
+ "#sk-container-id-4 div.sk-estimator.fitted {\n",
2338
+ " /* fitted */\n",
2339
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
2340
+ "}\n",
2341
+ "\n",
2342
+ "/* on hover */\n",
2343
+ "#sk-container-id-4 div.sk-estimator:hover {\n",
2344
+ " /* unfitted */\n",
2345
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
2346
+ "}\n",
2347
+ "\n",
2348
+ "#sk-container-id-4 div.sk-estimator.fitted:hover {\n",
2349
+ " /* fitted */\n",
2350
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
2351
+ "}\n",
2352
+ "\n",
2353
+ "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
2354
+ "\n",
2355
+ "/* Common style for \"i\" and \"?\" */\n",
2356
+ "\n",
2357
+ ".sk-estimator-doc-link,\n",
2358
+ "a:link.sk-estimator-doc-link,\n",
2359
+ "a:visited.sk-estimator-doc-link {\n",
2360
+ " float: right;\n",
2361
+ " font-size: smaller;\n",
2362
+ " line-height: 1em;\n",
2363
+ " font-family: monospace;\n",
2364
+ " background-color: var(--sklearn-color-background);\n",
2365
+ " border-radius: 1em;\n",
2366
+ " height: 1em;\n",
2367
+ " width: 1em;\n",
2368
+ " text-decoration: none !important;\n",
2369
+ " margin-left: 1ex;\n",
2370
+ " /* unfitted */\n",
2371
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
2372
+ " color: var(--sklearn-color-unfitted-level-1);\n",
2373
+ "}\n",
2374
+ "\n",
2375
+ ".sk-estimator-doc-link.fitted,\n",
2376
+ "a:link.sk-estimator-doc-link.fitted,\n",
2377
+ "a:visited.sk-estimator-doc-link.fitted {\n",
2378
+ " /* fitted */\n",
2379
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
2380
+ " color: var(--sklearn-color-fitted-level-1);\n",
2381
+ "}\n",
2382
+ "\n",
2383
+ "/* On hover */\n",
2384
+ "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
2385
+ ".sk-estimator-doc-link:hover,\n",
2386
+ "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
2387
+ ".sk-estimator-doc-link:hover {\n",
2388
+ " /* unfitted */\n",
2389
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
2390
+ " color: var(--sklearn-color-background);\n",
2391
+ " text-decoration: none;\n",
2392
+ "}\n",
2393
+ "\n",
2394
+ "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
2395
+ ".sk-estimator-doc-link.fitted:hover,\n",
2396
+ "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
2397
+ ".sk-estimator-doc-link.fitted:hover {\n",
2398
+ " /* fitted */\n",
2399
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
2400
+ " color: var(--sklearn-color-background);\n",
2401
+ " text-decoration: none;\n",
2402
+ "}\n",
2403
+ "\n",
2404
+ "/* Span, style for the box shown on hovering the info icon */\n",
2405
+ ".sk-estimator-doc-link span {\n",
2406
+ " display: none;\n",
2407
+ " z-index: 9999;\n",
2408
+ " position: relative;\n",
2409
+ " font-weight: normal;\n",
2410
+ " right: .2ex;\n",
2411
+ " padding: .5ex;\n",
2412
+ " margin: .5ex;\n",
2413
+ " width: min-content;\n",
2414
+ " min-width: 20ex;\n",
2415
+ " max-width: 50ex;\n",
2416
+ " color: var(--sklearn-color-text);\n",
2417
+ " box-shadow: 2pt 2pt 4pt #999;\n",
2418
+ " /* unfitted */\n",
2419
+ " background: var(--sklearn-color-unfitted-level-0);\n",
2420
+ " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
2421
+ "}\n",
2422
+ "\n",
2423
+ ".sk-estimator-doc-link.fitted span {\n",
2424
+ " /* fitted */\n",
2425
+ " background: var(--sklearn-color-fitted-level-0);\n",
2426
+ " border: var(--sklearn-color-fitted-level-3);\n",
2427
+ "}\n",
2428
+ "\n",
2429
+ ".sk-estimator-doc-link:hover span {\n",
2430
+ " display: block;\n",
2431
+ "}\n",
2432
+ "\n",
2433
+ "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
2434
+ "\n",
2435
+ "#sk-container-id-4 a.estimator_doc_link {\n",
2436
+ " float: right;\n",
2437
+ " font-size: 1rem;\n",
2438
+ " line-height: 1em;\n",
2439
+ " font-family: monospace;\n",
2440
+ " background-color: var(--sklearn-color-background);\n",
2441
+ " border-radius: 1rem;\n",
2442
+ " height: 1rem;\n",
2443
+ " width: 1rem;\n",
2444
+ " text-decoration: none;\n",
2445
+ " /* unfitted */\n",
2446
+ " color: var(--sklearn-color-unfitted-level-1);\n",
2447
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
2448
+ "}\n",
2449
+ "\n",
2450
+ "#sk-container-id-4 a.estimator_doc_link.fitted {\n",
2451
+ " /* fitted */\n",
2452
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
2453
+ " color: var(--sklearn-color-fitted-level-1);\n",
2454
+ "}\n",
2455
+ "\n",
2456
+ "/* On hover */\n",
2457
+ "#sk-container-id-4 a.estimator_doc_link:hover {\n",
2458
+ " /* unfitted */\n",
2459
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
2460
+ " color: var(--sklearn-color-background);\n",
2461
+ " text-decoration: none;\n",
2462
+ "}\n",
2463
+ "\n",
2464
+ "#sk-container-id-4 a.estimator_doc_link.fitted:hover {\n",
2465
+ " /* fitted */\n",
2466
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
2467
+ "}\n",
2468
+ "</style><div id=\"sk-container-id-4\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestClassifier(random_state=42)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" checked><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;RandomForestClassifier<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.RandomForestClassifier.html\">?<span>Documentation for RandomForestClassifier</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>RandomForestClassifier(random_state=42)</pre></div> </div></div></div></div>"
2469
+ ],
2470
+ "text/plain": [
2471
+ "RandomForestClassifier(random_state=42)"
2472
+ ]
2473
+ },
2474
+ "execution_count": 30,
2475
+ "metadata": {},
2476
+ "output_type": "execute_result"
2477
+ }
2478
+ ],
2479
+ "source": [
2480
+ "rf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
2481
+ "rf_model.fit(X_train_tfidf, y_train)"
2482
+ ]
2483
+ },
2484
+ {
2485
+ "cell_type": "code",
2486
+ "execution_count": 31,
2487
+ "id": "0e2910f6",
2488
+ "metadata": {},
2489
+ "outputs": [],
2490
+ "source": [
2491
+ "y_pred = rf_model.predict(X_test_tfidf)"
2492
+ ]
2493
+ },
2494
+ {
2495
+ "cell_type": "code",
2496
+ "execution_count": 32,
2497
+ "id": "f06900d3",
2498
+ "metadata": {},
2499
+ "outputs": [
2500
+ {
2501
+ "name": "stdout",
2502
+ "output_type": "stream",
2503
+ "text": [
2504
+ "Accuracy of Random Forest Model: 0.9628349178910977\n",
2505
+ " precision recall f1-score support\n",
2506
+ "\n",
2507
+ " FALSE 0.96 1.00 0.98 847\n",
2508
+ " TRUE 0.99 0.87 0.93 310\n",
2509
+ "\n",
2510
+ " accuracy 0.96 1157\n",
2511
+ " macro avg 0.97 0.93 0.95 1157\n",
2512
+ "weighted avg 0.96 0.96 0.96 1157\n",
2513
+ "\n"
2514
+ ]
2515
+ }
2516
+ ],
2517
+ "source": [
2518
+ "accuracy = accuracy_score(y_test, y_pred)\n",
2519
+ "print(\"Accuracy of Random Forest Model:\", accuracy)\n",
2520
+ "print(classification_report(y_test, y_pred))"
2521
+ ]
2522
+ }
2523
+ ],
2524
+ "metadata": {
2525
+ "kernelspec": {
2526
+ "display_name": "Python 3 (ipykernel)",
2527
+ "language": "python",
2528
+ "name": "python3"
2529
+ },
2530
+ "language_info": {
2531
+ "codemirror_mode": {
2532
+ "name": "ipython",
2533
+ "version": 3
2534
+ },
2535
+ "file_extension": ".py",
2536
+ "mimetype": "text/x-python",
2537
+ "name": "python",
2538
+ "nbconvert_exporter": "python",
2539
+ "pygments_lexer": "ipython3",
2540
+ "version": "3.12.4"
2541
+ }
2542
+ },
2543
+ "nbformat": 4,
2544
+ "nbformat_minor": 5
2545
+ }
notebooks/03_basic_text_preprocessing.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/04_topic_modelling_benchmark_using_headline.ipynb ADDED
@@ -0,0 +1,913 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "46322fb5-5918-4b70-9689-9e0781439ac4",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents\n",
14
+ "loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env\n"
15
+ ]
16
+ },
17
+ {
18
+ "data": {
19
+ "text/plain": [
20
+ "True"
21
+ ]
22
+ },
23
+ "execution_count": 1,
24
+ "metadata": {},
25
+ "output_type": "execute_result"
26
+ }
27
+ ],
28
+ "source": [
29
+ "%load_ext autoreload\n",
30
+ "%autoreload 2\n",
31
+ "\n",
32
+ "import os\n",
33
+ "import sys\n",
34
+ "from pathlib import Path\n",
35
+ "\n",
36
+ "workding_dir = str(Path.cwd().parent)\n",
37
+ "os.chdir(workding_dir)\n",
38
+ "sys.path.append(workding_dir)\n",
39
+ "print(\"workding dir:\", workding_dir)\n",
40
+ "\n",
41
+ "from dotenv import find_dotenv, load_dotenv\n",
42
+ "\n",
43
+ "found_dotenv = find_dotenv(\".env\")\n",
44
+ "\n",
45
+ "if len(found_dotenv) == 0:\n",
46
+ " found_dotenv = find_dotenv(\".env.example\")\n",
47
+ "print(f\"loading env vars from: {found_dotenv}\")\n",
48
+ "load_dotenv(found_dotenv, override=True)"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 2,
54
+ "id": "daf1e3d1-75ac-4299-8bed-2f413a49f9a6",
55
+ "metadata": {
56
+ "tags": []
57
+ },
58
+ "outputs": [],
59
+ "source": [
60
+ "import nltk\n",
61
+ "from nltk.tokenize import sent_tokenize\n",
62
+ "from nltk.tokenize import word_tokenize\n",
63
+ "\n",
64
+ "import gensim\n",
65
+ "from gensim import corpora\n",
66
+ "from gensim import similarities\n",
67
+ "from gensim import models\n",
68
+ "from gensim.models import CoherenceModel\n",
69
+ "\n",
70
+ "# from wordcloud import WordCloud, ImageColorGenerator\n",
71
+ "import matplotlib.pyplot as plt\n",
72
+ "import seaborn as sns\n",
73
+ "import pandas as pd\n",
74
+ "import re\n",
75
+ "import os\n",
76
+ "import datetime\n",
77
+ "\n",
78
+ "import warnings\n",
79
+ "\n",
80
+ "warnings.filterwarnings(\"ignore\")\n",
81
+ "\n",
82
+ "from pprint import pprint\n",
83
+ "import pyLDAvis\n",
84
+ "import pyLDAvis.gensim_models as gensimvis"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "markdown",
89
+ "id": "49e6de6b-71bd-4948-8827-52601406058f",
90
+ "metadata": {},
91
+ "source": [
92
+ "# Import Data with only the news headline and details"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 6,
98
+ "id": "49222182-7811-4fa6-8c0a-21d3a546863e",
99
+ "metadata": {},
100
+ "outputs": [],
101
+ "source": [
102
+ "df = pd.read_parquet(\"data/processed_data.parquet\")"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": 7,
108
+ "id": "3fb59a30",
109
+ "metadata": {},
110
+ "outputs": [
111
+ {
112
+ "data": {
113
+ "text/html": [
114
+ "<div>\n",
115
+ "<style scoped>\n",
116
+ " .dataframe tbody tr th:only-of-type {\n",
117
+ " vertical-align: middle;\n",
118
+ " }\n",
119
+ "\n",
120
+ " .dataframe tbody tr th {\n",
121
+ " vertical-align: top;\n",
122
+ " }\n",
123
+ "\n",
124
+ " .dataframe thead th {\n",
125
+ " text-align: right;\n",
126
+ " }\n",
127
+ "</style>\n",
128
+ "<table border=\"1\" class=\"dataframe\">\n",
129
+ " <thead>\n",
130
+ " <tr style=\"text-align: right;\">\n",
131
+ " <th></th>\n",
132
+ " <th>id</th>\n",
133
+ " <th>Headline</th>\n",
134
+ " <th>Details</th>\n",
135
+ " <th>Severity</th>\n",
136
+ " <th>Category</th>\n",
137
+ " <th>Region</th>\n",
138
+ " <th>Datetime</th>\n",
139
+ " <th>Year</th>\n",
140
+ " <th>lat</th>\n",
141
+ " <th>lon</th>\n",
142
+ " <th>maritime_label</th>\n",
143
+ " <th>found_ports</th>\n",
144
+ " <th>contains_port_info</th>\n",
145
+ " <th>if_labeled</th>\n",
146
+ " <th>Month</th>\n",
147
+ " <th>Week</th>\n",
148
+ " <th>Headline_Details</th>\n",
149
+ " <th>cleaned_Headline_Details</th>\n",
150
+ " <th>binary_Headline_Details</th>\n",
151
+ " <th>word_count</th>\n",
152
+ " </tr>\n",
153
+ " </thead>\n",
154
+ " <tbody>\n",
155
+ " <tr>\n",
156
+ " <th>0</th>\n",
157
+ " <td>1.0</td>\n",
158
+ " <td>Grasberg Mine- Grasberg mine workers extend st...</td>\n",
159
+ " <td>Media sources indicate that workers at the Gra...</td>\n",
160
+ " <td>Moderate</td>\n",
161
+ " <td>Mine Workers Strike</td>\n",
162
+ " <td>Indonesia</td>\n",
163
+ " <td>28/5/17 17:08</td>\n",
164
+ " <td>2017.0</td>\n",
165
+ " <td>-4.05608</td>\n",
166
+ " <td>137.11302</td>\n",
167
+ " <td>False</td>\n",
168
+ " <td>['freeport']</td>\n",
169
+ " <td>1.0</td>\n",
170
+ " <td>False</td>\n",
171
+ " <td>5.0</td>\n",
172
+ " <td>21.0</td>\n",
173
+ " <td>Grasberg Mine- Grasberg mine workers extend st...</td>\n",
174
+ " <td>[grasberg, grasberg, mine, worker, extend, str...</td>\n",
175
+ " <td>[worker_grasberg_mine]</td>\n",
176
+ " <td>1</td>\n",
177
+ " </tr>\n",
178
+ " <tr>\n",
179
+ " <th>1</th>\n",
180
+ " <td>2.0</td>\n",
181
+ " <td>Indonesia: Undersea internet cables damaged by...</td>\n",
182
+ " <td>News sources are stating that recent typhoons ...</td>\n",
183
+ " <td>Minor</td>\n",
184
+ " <td>Travel Warning</td>\n",
185
+ " <td>Indonesia</td>\n",
186
+ " <td>4/9/17 14:30</td>\n",
187
+ " <td>2017.0</td>\n",
188
+ " <td>NaN</td>\n",
189
+ " <td>NaN</td>\n",
190
+ " <td>False</td>\n",
191
+ " <td>['hong kong']</td>\n",
192
+ " <td>1.0</td>\n",
193
+ " <td>False</td>\n",
194
+ " <td>4.0</td>\n",
195
+ " <td>14.0</td>\n",
196
+ " <td>Indonesia: Undersea internet cables damaged by...</td>\n",
197
+ " <td>[indonesia, undersea, internet, cable, damage,...</td>\n",
198
+ " <td>[undersea_internet_cable]</td>\n",
199
+ " <td>1</td>\n",
200
+ " </tr>\n",
201
+ " <tr>\n",
202
+ " <th>2</th>\n",
203
+ " <td>3.0</td>\n",
204
+ " <td>Shanghai port congestion impacts terminals in ...</td>\n",
205
+ " <td>The persisting port congestion at Shanghai’s Y...</td>\n",
206
+ " <td>Minor</td>\n",
207
+ " <td>Port Congestion</td>\n",
208
+ " <td>China</td>\n",
209
+ " <td>27/4/17 9:16</td>\n",
210
+ " <td>2017.0</td>\n",
211
+ " <td>29.52000</td>\n",
212
+ " <td>121.33190</td>\n",
213
+ " <td>True</td>\n",
214
+ " <td>['ningbo', 'qingdao', 'shanghai']</td>\n",
215
+ " <td>1.0</td>\n",
216
+ " <td>False</td>\n",
217
+ " <td>4.0</td>\n",
218
+ " <td>17.0</td>\n",
219
+ " <td>Shanghai port congestion impacts terminals in ...</td>\n",
220
+ " <td>[shanghai, port, congestion, impact, terminal,...</td>\n",
221
+ " <td>[]</td>\n",
222
+ " <td>0</td>\n",
223
+ " </tr>\n",
224
+ " <tr>\n",
225
+ " <th>3</th>\n",
226
+ " <td>4.0</td>\n",
227
+ " <td>UPDATE - Indonesia: Explosion at KP Terminal i...</td>\n",
228
+ " <td>Updated local media sources from Jakarta indic...</td>\n",
229
+ " <td>Extreme</td>\n",
230
+ " <td>Bombing, Police Operations</td>\n",
231
+ " <td>Indonesia</td>\n",
232
+ " <td>24/5/17 15:15</td>\n",
233
+ " <td>2017.0</td>\n",
234
+ " <td>-6.22465</td>\n",
235
+ " <td>106.86700</td>\n",
236
+ " <td>True</td>\n",
237
+ " <td>['jakarta']</td>\n",
238
+ " <td>1.0</td>\n",
239
+ " <td>False</td>\n",
240
+ " <td>5.0</td>\n",
241
+ " <td>21.0</td>\n",
242
+ " <td>UPDATE - Indonesia: Explosion at KP Terminal i...</td>\n",
243
+ " <td>[update, indonesia, explosion, at, kp, termina...</td>\n",
244
+ " <td>[]</td>\n",
245
+ " <td>0</td>\n",
246
+ " </tr>\n",
247
+ " <tr>\n",
248
+ " <th>4</th>\n",
249
+ " <td>5.0</td>\n",
250
+ " <td>UPDATE - Indonesia: Police confirm two explosi...</td>\n",
251
+ " <td>According to local police in Jakarta, two expl...</td>\n",
252
+ " <td>Extreme</td>\n",
253
+ " <td>Bombing, Police Operations</td>\n",
254
+ " <td>Indonesia</td>\n",
255
+ " <td>24/5/17 16:20</td>\n",
256
+ " <td>2017.0</td>\n",
257
+ " <td>NaN</td>\n",
258
+ " <td>NaN</td>\n",
259
+ " <td>True</td>\n",
260
+ " <td>['jakarta']</td>\n",
261
+ " <td>1.0</td>\n",
262
+ " <td>True</td>\n",
263
+ " <td>5.0</td>\n",
264
+ " <td>21.0</td>\n",
265
+ " <td>UPDATE - Indonesia: Police confirm two explosi...</td>\n",
266
+ " <td>[update, indonesia, police, confirm, two, expl...</td>\n",
267
+ " <td>[]</td>\n",
268
+ " <td>0</td>\n",
269
+ " </tr>\n",
270
+ " </tbody>\n",
271
+ "</table>\n",
272
+ "</div>"
273
+ ],
274
+ "text/plain": [
275
+ " id Headline \\\n",
276
+ "0 1.0 Grasberg Mine- Grasberg mine workers extend st... \n",
277
+ "1 2.0 Indonesia: Undersea internet cables damaged by... \n",
278
+ "2 3.0 Shanghai port congestion impacts terminals in ... \n",
279
+ "3 4.0 UPDATE - Indonesia: Explosion at KP Terminal i... \n",
280
+ "4 5.0 UPDATE - Indonesia: Police confirm two explosi... \n",
281
+ "\n",
282
+ " Details Severity \\\n",
283
+ "0 Media sources indicate that workers at the Gra... Moderate \n",
284
+ "1 News sources are stating that recent typhoons ... Minor \n",
285
+ "2 The persisting port congestion at Shanghai’s Y... Minor \n",
286
+ "3 Updated local media sources from Jakarta indic... Extreme \n",
287
+ "4 According to local police in Jakarta, two expl... Extreme \n",
288
+ "\n",
289
+ " Category Region Datetime Year lat \\\n",
290
+ "0 Mine Workers Strike Indonesia 28/5/17 17:08 2017.0 -4.05608 \n",
291
+ "1 Travel Warning Indonesia 4/9/17 14:30 2017.0 NaN \n",
292
+ "2 Port Congestion China 27/4/17 9:16 2017.0 29.52000 \n",
293
+ "3 Bombing, Police Operations Indonesia 24/5/17 15:15 2017.0 -6.22465 \n",
294
+ "4 Bombing, Police Operations Indonesia 24/5/17 16:20 2017.0 NaN \n",
295
+ "\n",
296
+ " lon maritime_label found_ports \\\n",
297
+ "0 137.11302 False ['freeport'] \n",
298
+ "1 NaN False ['hong kong'] \n",
299
+ "2 121.33190 True ['ningbo', 'qingdao', 'shanghai'] \n",
300
+ "3 106.86700 True ['jakarta'] \n",
301
+ "4 NaN True ['jakarta'] \n",
302
+ "\n",
303
+ " contains_port_info if_labeled Month Week \\\n",
304
+ "0 1.0 False 5.0 21.0 \n",
305
+ "1 1.0 False 4.0 14.0 \n",
306
+ "2 1.0 False 4.0 17.0 \n",
307
+ "3 1.0 False 5.0 21.0 \n",
308
+ "4 1.0 True 5.0 21.0 \n",
309
+ "\n",
310
+ " Headline_Details \\\n",
311
+ "0 Grasberg Mine- Grasberg mine workers extend st... \n",
312
+ "1 Indonesia: Undersea internet cables damaged by... \n",
313
+ "2 Shanghai port congestion impacts terminals in ... \n",
314
+ "3 UPDATE - Indonesia: Explosion at KP Terminal i... \n",
315
+ "4 UPDATE - Indonesia: Police confirm two explosi... \n",
316
+ "\n",
317
+ " cleaned_Headline_Details \\\n",
318
+ "0 [grasberg, grasberg, mine, worker, extend, str... \n",
319
+ "1 [indonesia, undersea, internet, cable, damage,... \n",
320
+ "2 [shanghai, port, congestion, impact, terminal,... \n",
321
+ "3 [update, indonesia, explosion, at, kp, termina... \n",
322
+ "4 [update, indonesia, police, confirm, two, expl... \n",
323
+ "\n",
324
+ " binary_Headline_Details word_count \n",
325
+ "0 [worker_grasberg_mine] 1 \n",
326
+ "1 [undersea_internet_cable] 1 \n",
327
+ "2 [] 0 \n",
328
+ "3 [] 0 \n",
329
+ "4 [] 0 "
330
+ ]
331
+ },
332
+ "execution_count": 7,
333
+ "metadata": {},
334
+ "output_type": "execute_result"
335
+ }
336
+ ],
337
+ "source": [
338
+ "df.head()"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": 8,
344
+ "id": "09113e88-66cc-414c-a953-da04db83c4ae",
345
+ "metadata": {},
346
+ "outputs": [
347
+ {
348
+ "data": {
349
+ "text/plain": [
350
+ "(5778, 20)"
351
+ ]
352
+ },
353
+ "execution_count": 8,
354
+ "metadata": {},
355
+ "output_type": "execute_result"
356
+ }
357
+ ],
358
+ "source": [
359
+ "df.shape"
360
+ ]
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": 26,
365
+ "id": "be6fc947",
366
+ "metadata": {},
367
+ "outputs": [
368
+ {
369
+ "name": "stdout",
370
+ "output_type": "stream",
371
+ "text": [
372
+ "DataFrames are identical: True\n"
373
+ ]
374
+ }
375
+ ],
376
+ "source": [
377
+ "df2 = pd.read_parquet(\"IS424_Data_Mining/code/LDA/processed_data.parquet\")\n",
378
+ "# Check if two DataFrames are exactly the same\n",
379
+ "are_identical = df.equals(df2)\n",
380
+ "print(f\"DataFrames are identical: {are_identical}\")"
381
+ ]
382
+ },
383
+ {
384
+ "cell_type": "markdown",
385
+ "id": "037e74fc-bbcd-43e3-8346-799920cca8d8",
386
+ "metadata": {},
387
+ "source": [
388
+ "# Vectorisation"
389
+ ]
390
+ },
391
+ {
392
+ "cell_type": "markdown",
393
+ "id": "d67cef3a-59fb-4dd8-adc8-2cf288b90728",
394
+ "metadata": {},
395
+ "source": [
396
+ "NLP vectorization refers to the process of converting text data into numerical vectors that machine learning algorithms can understand and process. \n",
397
+ "\n",
398
+ "Bag-of-Words (BoW) is used here: it represents text as a collection of unique words along with their frequencies. Each word is assigned an index, and the vector contains the count of each word present in the document."
399
+ ]
400
+ },
401
+ {
402
+ "cell_type": "code",
403
+ "execution_count": 14,
404
+ "id": "c95b7b8a-9767-469d-812d-c9a9d9fee0e9",
405
+ "metadata": {},
406
+ "outputs": [],
407
+ "source": [
408
+ "cleaned = df.copy()"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": 15,
414
+ "id": "de71c523-a59e-44b2-aa96-5f17d872c9c6",
415
+ "metadata": {},
416
+ "outputs": [],
417
+ "source": [
418
+ "headline = cleaned.binary_Headline_Details"
419
+ ]
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "execution_count": 16,
424
+ "id": "5b1e34e1",
425
+ "metadata": {},
426
+ "outputs": [
427
+ {
428
+ "data": {
429
+ "text/plain": [
430
+ "array(['worker_grasberg_mine'], dtype=object)"
431
+ ]
432
+ },
433
+ "execution_count": 16,
434
+ "metadata": {},
435
+ "output_type": "execute_result"
436
+ }
437
+ ],
438
+ "source": [
439
+ "headline[0]"
440
+ ]
441
+ },
442
+ {
443
+ "cell_type": "code",
444
+ "execution_count": 17,
445
+ "id": "677055b4-978e-4253-90f4-3f903662e225",
446
+ "metadata": {
447
+ "tags": []
448
+ },
449
+ "outputs": [],
450
+ "source": [
451
+ "# vectorise the words\n",
452
+ "doc_dict = gensim.corpora.Dictionary(headline)\n",
453
+ "docs_vecs = [doc_dict.doc2bow(doc) for doc in headline]"
454
+ ]
455
+ },
456
+ {
457
+ "cell_type": "code",
458
+ "execution_count": 18,
459
+ "id": "a54d1768-b069-4936-a156-deaf0b506d93",
460
+ "metadata": {
461
+ "tags": []
462
+ },
463
+ "outputs": [
464
+ {
465
+ "name": "stdout",
466
+ "output_type": "stream",
467
+ "text": [
468
+ "Number of unique tokens: 5319\n",
469
+ "Number of articles: 5778\n"
470
+ ]
471
+ }
472
+ ],
473
+ "source": [
474
+ "print(\"Number of unique tokens: %d\" % len(doc_dict))\n",
475
+ "print(\"Number of articles: %d\" % len(docs_vecs))"
476
+ ]
477
+ },
478
+ {
479
+ "cell_type": "code",
480
+ "execution_count": 19,
481
+ "id": "9147fa86-1503-4252-bd9b-92fea1e6a926",
482
+ "metadata": {
483
+ "scrolled": true,
484
+ "tags": []
485
+ },
486
+ "outputs": [
487
+ {
488
+ "name": "stdout",
489
+ "output_type": "stream",
490
+ "text": [
491
+ "[('due_dense_fog', 20),\n",
492
+ " ('strong_wind', 19),\n",
493
+ " ('indicate_average', 19),\n",
494
+ " ('day_port', 17),\n",
495
+ " ('san_antonio', 17),\n",
496
+ " ('vessel_port', 16),\n",
497
+ " ('low_visibility', 15),\n",
498
+ " ('average_wait', 13),\n",
499
+ " ('port_qingdao', 12),\n",
500
+ " ('east_coast_parkway', 12),\n",
501
+ " ('port_shanghai', 11),\n",
502
+ " ('port_hong_kong', 11),\n",
503
+ " ('severe_wind', 11),\n",
504
+ " ('blank_week_service', 11),\n",
505
+ " ('congestion_port_manila', 10),\n",
506
+ " ('wait_hour', 10),\n",
507
+ " ('day_situation', 9),\n",
508
+ " ('port_ningbo', 9),\n",
509
+ " ('coastal_area', 9),\n",
510
+ " ('strong_wind_forecast', 9),\n",
511
+ " ('vessel_arrival', 8),\n",
512
+ " ('high_wind', 8),\n",
513
+ " ('pomeranian_voivodeship', 8),\n",
514
+ " ('wait_day', 8),\n",
515
+ " ('previous_week', 8),\n",
516
+ " ('vessel_wait', 8),\n",
517
+ " ('average_wait_port', 8),\n",
518
+ " ('affect_operation_port', 7),\n",
519
+ " ('berth_manila_south', 7),\n",
520
+ " ('day_port_saigon', 7),\n",
521
+ " ('vessel_port_hong', 7),\n",
522
+ " ('vessel_port_shanghai', 7),\n",
523
+ " ('high_risk_port', 7),\n",
524
+ " ('wait_hour_port', 7),\n",
525
+ " ('qianwan_container', 7),\n",
526
+ " ('day_port_shanghai', 6),\n",
527
+ " ('international_container', 6),\n",
528
+ " ('tropical_storm', 6),\n",
529
+ " ('arrive_window', 6),\n",
530
+ " ('moderate_risk_port', 6),\n",
531
+ " ('vessel_port_qingdao', 6),\n",
532
+ " ('landside_rail_operation', 6),\n",
533
+ " ('dp_world_southampton', 6),\n",
534
+ " ('high_yard_density', 6),\n",
535
+ " ('vessel_port_ningbo', 6),\n",
536
+ " ('port_busan', 6),\n",
537
+ " ('strong_wind_area', 6),\n",
538
+ " ('new_york_city', 6),\n",
539
+ " ('duration_closure', 6),\n",
540
+ " ('vessel_berth_port', 5),\n",
541
+ " ('wait_decrease_day', 5),\n",
542
+ " ('disrupt_operation_port', 5),\n",
543
+ " ('waterside_landside_operation', 5),\n",
544
+ " ('congest_vessel', 5),\n",
545
+ " ('port_charleston', 5),\n",
546
+ " ('san_antonio_puerto', 5),\n",
547
+ " ('yantian_shipping', 5),\n",
548
+ " ('port_saigon', 5),\n",
549
+ " ('port_ho_chi', 5),\n",
550
+ " ('engine_failure', 5),\n",
551
+ " ('port_hong', 5),\n",
552
+ " ('dense_fog', 5),\n",
553
+ " ('disrupt_port_operation', 5),\n",
554
+ " ('high_wind_port', 5),\n",
555
+ " ('vessel_wait_day', 5),\n",
556
+ " ('vessel_port_tanjung', 5),\n",
557
+ " ('landside_waterside_operation', 5),\n",
558
+ " ('port_durban', 5),\n",
559
+ " ('port_manila', 5),\n",
560
+ " ('next_day_morning', 5),\n",
561
+ " ('strong_wind_halt', 5),\n",
562
+ " ('houston_ship_channel', 5),\n",
563
+ " ('low_visibility_forecast', 5),\n",
564
+ " ('full_port', 5),\n",
565
+ " ('tropical_storm_hanna', 5),\n",
566
+ " ('likely_port_closure', 5),\n",
567
+ " ('strong_wind_industry', 5),\n",
568
+ " ('hour_qingdao_qianwan', 5),\n",
569
+ " ('antonio_halt_operation', 5),\n",
570
+ " ('high_chance_port', 4),\n",
571
+ " ('pilot_service', 4),\n",
572
+ " ('low_productivity', 4),\n",
573
+ " ('heavy_rain', 4),\n",
574
+ " ('resume_operation', 4),\n",
575
+ " ('terminal_indicate', 4),\n",
576
+ " ('port_fuzhou', 4),\n",
577
+ " ('disrupt_port', 4),\n",
578
+ " ('nhava_sheva', 4),\n",
579
+ " ('berth_day', 4),\n",
580
+ " ('congestion_port', 4),\n",
581
+ " ('indicate_waterside_operation', 4),\n",
582
+ " ('injury_damage_incident', 4),\n",
583
+ " ('due_fog', 4),\n",
584
+ " ('wait_vessel_port', 4),\n",
585
+ " ('kong_industry', 4),\n",
586
+ " ('hong_kong', 4),\n",
587
+ " ('wait_vessel_berth', 4),\n",
588
+ " ('terminal_port', 4),\n",
589
+ " ('wait_vessel', 4),\n",
590
+ " ('affect_port_operation', 4)]\n"
591
+ ]
592
+ }
593
+ ],
594
+ "source": [
595
+ "# Calculate word frequencies\n",
596
+ "word_frequencies = {doc_dict[word_id]: freq for word_id, freq in doc_dict.cfs.items()}\n",
597
+ "sorted_words = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)\n",
598
+ "\n",
599
+ "pprint(sorted_words[:100])"
600
+ ]
601
+ },
602
+ {
603
+ "cell_type": "markdown",
604
+ "id": "5ed78239-2ce1-4784-a8f4-4c7438c8627b",
605
+ "metadata": {},
606
+ "source": [
607
+ "# LDA Modelling"
608
+ ]
609
+ },
610
+ {
611
+ "cell_type": "markdown",
612
+ "id": "9db83273-461d-4f70-b23f-ec967579d94f",
613
+ "metadata": {},
614
+ "source": [
615
+ "## Benchmark Model"
616
+ ]
617
+ },
618
+ {
619
+ "cell_type": "code",
620
+ "execution_count": 20,
621
+ "id": "e6d577bd-9936-4d45-be90-345af2eb4827",
622
+ "metadata": {
623
+ "tags": []
624
+ },
625
+ "outputs": [],
626
+ "source": [
627
+ "# Build LDA benchmark model\n",
628
+ "lda_model = gensim.models.LdaMulticore(\n",
629
+ " corpus=docs_vecs,\n",
630
+ " id2word=doc_dict,\n",
631
+ " num_topics=4,\n",
632
+ " random_state=42,\n",
633
+ " chunksize=100,\n",
634
+ " passes=10,\n",
635
+ " per_word_topics=True,\n",
636
+ ")"
637
+ ]
638
+ },
639
+ {
640
+ "cell_type": "code",
641
+ "execution_count": 21,
642
+ "id": "c4f1521f-5f43-40d2-a3a3-a8ac2ca6fec2",
643
+ "metadata": {
644
+ "tags": []
645
+ },
646
+ "outputs": [
647
+ {
648
+ "name": "stdout",
649
+ "output_type": "stream",
650
+ "text": [
651
+ "[(0,\n",
652
+ " '0.003*\"indicate_average\" + 0.002*\"coastal_area\" + 0.002*\"vessel_port_hong\" '\n",
653
+ " '+ 0.002*\"east_coast_parkway\" + 0.002*\"port_charleston\" + '\n",
654
+ " '0.002*\"average_wait_port\" + 0.002*\"severe_wind\" + '\n",
655
+ " '0.002*\"pomeranian_voivodeship\" + 0.002*\"engine_failure\" + '\n",
656
+ " '0.002*\"due_dense_fog\"'),\n",
657
+ " (1,\n",
658
+ " '0.005*\"san_antonio\" + 0.003*\"blank_week_service\" + 0.003*\"wait_hour\" + '\n",
659
+ " '0.003*\"day_port_saigon\" + 0.002*\"low_visibility\" + 0.002*\"high_wind\" + '\n",
660
+ " '0.002*\"waterside_landside_operation\" + 0.002*\"tropical_storm\" + '\n",
661
+ " '0.002*\"qianwan_container\" + 0.002*\"port_ningbo\"'),\n",
662
+ " (2,\n",
663
+ " '0.004*\"strong_wind\" + 0.002*\"port_shanghai\" + 0.002*\"port_qingdao\" + '\n",
664
+ " '0.002*\"vessel_port_qingdao\" + 0.002*\"day_port_shanghai\" + '\n",
665
+ " '0.002*\"indicate_average\" + 0.002*\"disrupt_operation_port\" + '\n",
666
+ " '0.002*\"strong_wind_area\" + 0.002*\"port_ho_chi\" + '\n",
667
+ " '0.001*\"operation_pier_port\"'),\n",
668
+ " (3,\n",
669
+ " '0.004*\"port_hong_kong\" + 0.003*\"vessel_port\" + 0.003*\"day_port\" + '\n",
670
+ " '0.003*\"congestion_port_manila\" + 0.002*\"berth_manila_south\" + '\n",
671
+ " '0.002*\"arrive_window\" + 0.002*\"landside_rail_operation\" + '\n",
672
+ " '0.002*\"international_container\" + 0.002*\"day_situation\" + '\n",
673
+ " '0.002*\"congestion_port\"')]\n"
674
+ ]
675
+ }
676
+ ],
677
+ "source": [
678
+ "from pprint import pprint\n",
679
+ "\n",
680
+ "# Print the top 10 keywords of each topic\n",
681
+ "pprint(lda_model.print_topics())\n",
682
+ "doc_lda = lda_model[docs_vecs]"
683
+ ]
684
+ },
685
+ {
686
+ "cell_type": "code",
687
+ "execution_count": 22,
688
+ "id": "fd57b1f4-a6cd-41e8-964f-d8a1d30aa3c9",
689
+ "metadata": {
690
+ "tags": []
691
+ },
692
+ "outputs": [
693
+ {
694
+ "name": "stdout",
695
+ "output_type": "stream",
696
+ "text": [
697
+ "\n",
698
+ "Coherence Score LDAModel: 0.7011993291597081\n"
699
+ ]
700
+ }
701
+ ],
702
+ "source": [
703
+ "# Compute Benchmark Coherence Score\n",
704
+ "coherence_model_lda = CoherenceModel(\n",
705
+ " model=lda_model, texts=headline, dictionary=doc_dict, coherence=\"c_v\"\n",
706
+ ")\n",
707
+ "coherence_lda = coherence_model_lda.get_coherence()\n",
708
+ "print(\"\\nCoherence Score LDAModel: \", coherence_lda)"
709
+ ]
710
+ },
711
+ {
712
+ "cell_type": "code",
713
+ "execution_count": 23,
714
+ "id": "152e5a3a-7afe-4fb8-a02f-d7492ad80936",
715
+ "metadata": {
716
+ "tags": []
717
+ },
718
+ "outputs": [
719
+ {
720
+ "name": "stdout",
721
+ "output_type": "stream",
722
+ "text": [
723
+ "\n",
724
+ "Perplexity for LDAModel: -9.594271136114548\n"
725
+ ]
726
+ }
727
+ ],
728
+ "source": [
729
+ "# Compute Benchmark Perplexity\n",
730
+ "perplex = lda_model.log_perplexity(docs_vecs, total_docs=None) # For LDAModel\n",
731
+ "# log_perplexity returns the per-word likelihood bound (log scale), not the perplexity itself; perplexity = 2 ** (-bound), and lower perplexity is better.\n",
732
+ "\n",
733
+ "print(\"\\nPerplexity for LDAModel: \", perplex)"
734
+ ]
735
+ },
736
+ {
737
+ "cell_type": "code",
738
+ "execution_count": 24,
739
+ "id": "7dd3a60a-5c6f-4249-9868-30528a5b0ac8",
740
+ "metadata": {},
741
+ "outputs": [
742
+ {
743
+ "name": "stderr",
744
+ "output_type": "stream",
745
+ "text": [
746
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
747
+ " pid = os.fork()\n",
748
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
749
+ " pid = os.fork()\n",
750
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
751
+ " pid = os.fork()\n",
752
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
753
+ " pid = os.fork()\n",
754
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
755
+ " pid = os.fork()\n",
756
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
757
+ " pid = os.fork()\n",
758
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
759
+ " pid = os.fork()\n",
760
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
761
+ " pid = os.fork()\n",
762
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
763
+ " pid = os.fork()\n",
764
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
765
+ " pid = os.fork()\n",
766
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
767
+ " pid = os.fork()\n",
768
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
769
+ " pid = os.fork()\n",
770
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
771
+ " pid = os.fork()\n",
772
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
773
+ " pid = os.fork()\n",
774
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=30092) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
775
+ " pid = os.fork()\n"
776
+ ]
777
+ }
778
+ ],
779
+ "source": [
780
+ "from pprint import pprint\n",
781
+ "import pyLDAvis\n",
782
+ "import pyLDAvis.gensim_models as gensimvis\n",
783
+ "\n",
784
+ "# feed the LDA model into the pyLDAvis instance\n",
785
+ "pyLDAvis.enable_notebook()\n",
786
+ "visual = gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
787
+ "\n",
788
+ "# Save the output to the html file\n",
789
+ "pyLDAvis.save_html(visual, \"topic_viz_benchmark.html\")"
790
+ ]
791
+ },
792
+ {
793
+ "cell_type": "code",
794
+ "execution_count": 25,
795
+ "id": "48bd84a0-ce9b-4117-bf6e-d9afb9936147",
796
+ "metadata": {},
797
+ "outputs": [
798
+ {
799
+ "data": {
800
+ "text/html": [
801
+ "<div>\n",
802
+ "<style scoped>\n",
803
+ " .dataframe tbody tr th:only-of-type {\n",
804
+ " vertical-align: middle;\n",
805
+ " }\n",
806
+ "\n",
807
+ " .dataframe tbody tr th {\n",
808
+ " vertical-align: top;\n",
809
+ " }\n",
810
+ "\n",
811
+ " .dataframe thead th {\n",
812
+ " text-align: right;\n",
813
+ " }\n",
814
+ "</style>\n",
815
+ "<table border=\"1\" class=\"dataframe\">\n",
816
+ " <thead>\n",
817
+ " <tr style=\"text-align: right;\">\n",
818
+ " <th></th>\n",
819
+ " <th>Topic Keywords</th>\n",
820
+ " <th>Topic ID</th>\n",
821
+ " </tr>\n",
822
+ " </thead>\n",
823
+ " <tbody>\n",
824
+ " <tr>\n",
825
+ " <th>0</th>\n",
826
+ " <td>0.003*\"indicate_average\" + 0.002*\"coastal_area\" + 0.002*\"vessel_port_hong\" + 0.002*\"east_coast_parkway\" + 0.002*\"port_charleston\" + 0.002*\"average_wait_port\"</td>\n",
827
+ " <td>0</td>\n",
828
+ " </tr>\n",
829
+ " <tr>\n",
830
+ " <th>1</th>\n",
831
+ " <td>0.005*\"san_antonio\" + 0.003*\"blank_week_service\" + 0.003*\"wait_hour\" + 0.003*\"day_port_saigon\" + 0.002*\"low_visibility\" + 0.002*\"high_wind\"</td>\n",
832
+ " <td>1</td>\n",
833
+ " </tr>\n",
834
+ " <tr>\n",
835
+ " <th>2</th>\n",
836
+ " <td>0.004*\"strong_wind\" + 0.002*\"port_shanghai\" + 0.002*\"port_qingdao\" + 0.002*\"vessel_port_qingdao\" + 0.002*\"day_port_shanghai\" + 0.002*\"indicate_average\"</td>\n",
837
+ " <td>2</td>\n",
838
+ " </tr>\n",
839
+ " <tr>\n",
840
+ " <th>3</th>\n",
841
+ " <td>0.004*\"port_hong_kong\" + 0.003*\"vessel_port\" + 0.003*\"day_port\" + 0.003*\"congestion_port_manila\" + 0.002*\"berth_manila_south\" + 0.002*\"arrive_window\"</td>\n",
842
+ " <td>3</td>\n",
843
+ " </tr>\n",
844
+ " </tbody>\n",
845
+ "</table>\n",
846
+ "</div>"
847
+ ],
848
+ "text/plain": [
849
+ " Topic Keywords \\\n",
850
+ "0 0.003*\"indicate_average\" + 0.002*\"coastal_area\" + 0.002*\"vessel_port_hong\" + 0.002*\"east_coast_parkway\" + 0.002*\"port_charleston\" + 0.002*\"average_wait_port\" \n",
851
+ "1 0.005*\"san_antonio\" + 0.003*\"blank_week_service\" + 0.003*\"wait_hour\" + 0.003*\"day_port_saigon\" + 0.002*\"low_visibility\" + 0.002*\"high_wind\" \n",
852
+ "2 0.004*\"strong_wind\" + 0.002*\"port_shanghai\" + 0.002*\"port_qingdao\" + 0.002*\"vessel_port_qingdao\" + 0.002*\"day_port_shanghai\" + 0.002*\"indicate_average\" \n",
853
+ "3 0.004*\"port_hong_kong\" + 0.003*\"vessel_port\" + 0.003*\"day_port\" + 0.003*\"congestion_port_manila\" + 0.002*\"berth_manila_south\" + 0.002*\"arrive_window\" \n",
854
+ "\n",
855
+ " Topic ID \n",
856
+ "0 0 \n",
857
+ "1 1 \n",
858
+ "2 2 \n",
859
+ "3 3 "
860
+ ]
861
+ },
862
+ "execution_count": 25,
863
+ "metadata": {},
864
+ "output_type": "execute_result"
865
+ }
866
+ ],
867
+ "source": [
868
+ "pd.set_option(\"max_colwidth\", 200)\n",
869
+ "# Get the topics and their top keywords into a dataframe\n",
870
+ "topics = lda_model.show_topics(num_words=6)\n",
871
+ "\n",
872
+ "topic_keywords = pd.DataFrame()\n",
873
+ "for topic_id, topic in topics:\n",
874
+ " topic_keywords.at[topic_id, \"Topic Keywords\"] = topic\n",
875
+ "\n",
876
+ "topic_keywords[\"Topic ID\"] = topic_keywords.index\n",
877
+ "# topic_keywords['Topic Name'] = topic_mapping\n",
878
+ "topic_keywords"
879
+ ]
880
+ },
881
+ {
882
+ "cell_type": "markdown",
883
+ "id": "3247fe12",
884
+ "metadata": {},
885
+ "source": [
886
+ "## Conclusion\n",
887
+ "\n",
888
+ "No significant insights were gained from the model results, as we could not identify any distinct topics. More data may be needed; we will deploy web scraping to collect the full news content after the mid-term."
889
+ ]
890
+ }
891
+ ],
892
+ "metadata": {
893
+ "kernelspec": {
894
+ "display_name": "Python 3",
895
+ "language": "python",
896
+ "name": "python3"
897
+ },
898
+ "language_info": {
899
+ "codemirror_mode": {
900
+ "name": "ipython",
901
+ "version": 3
902
+ },
903
+ "file_extension": ".py",
904
+ "mimetype": "text/x-python",
905
+ "name": "python",
906
+ "nbconvert_exporter": "python",
907
+ "pygments_lexer": "ipython3",
908
+ "version": "3.12.4"
909
+ }
910
+ },
911
+ "nbformat": 4,
912
+ "nbformat_minor": 5
913
+ }
notebooks/05a_newsScraper_run_1.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/05b_newsScraper_run_2.ipynb ADDED
@@ -0,0 +1,1074 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "42801c6a",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents\n",
14
+ "loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env\n"
15
+ ]
16
+ },
17
+ {
18
+ "data": {
19
+ "text/plain": [
20
+ "True"
21
+ ]
22
+ },
23
+ "execution_count": 1,
24
+ "metadata": {},
25
+ "output_type": "execute_result"
26
+ }
27
+ ],
28
+ "source": [
29
+ "%load_ext autoreload\n",
30
+ "%autoreload 2\n",
31
+ "\n",
32
+ "import os\n",
33
+ "import sys\n",
34
+ "from pathlib import Path\n",
35
+ "\n",
36
+ "workding_dir = str(Path.cwd().parent)\n",
37
+ "os.chdir(workding_dir)\n",
38
+ "sys.path.append(workding_dir)\n",
39
+ "print(\"workding dir:\", workding_dir)\n",
40
+ "\n",
41
+ "from dotenv import find_dotenv, load_dotenv\n",
42
+ "\n",
43
+ "found_dotenv = find_dotenv(\".env\")\n",
44
+ "\n",
45
+ "if len(found_dotenv) == 0:\n",
46
+ " found_dotenv = find_dotenv(\".env.example\")\n",
47
+ "print(f\"loading env vars from: {found_dotenv}\")\n",
48
+ "load_dotenv(found_dotenv, override=True)"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 2,
54
+ "id": "ddf1e32e-7751-43db-9b5a-22cb08e35c6c",
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "import pandas as pd\n",
59
+ "import nltk\n",
60
+ "from nltk.corpus import PlaintextCorpusReader\n",
61
+ "from nltk.corpus import stopwords\n",
62
+ "from nltk.stem.porter import *\n",
63
+ "from nltk import pos_tag, word_tokenize\n",
64
+ "from nltk.stem import WordNetLemmatizer\n",
65
+ "from nltk.probability import FreqDist\n",
66
+ "from nltk.tokenize import sent_tokenize\n",
67
+ "from nltk.tokenize import word_tokenize\n",
68
+ "import contractions\n",
69
+ "\n",
70
+ "import gensim\n",
71
+ "from gensim import corpora\n",
72
+ "from gensim import similarities\n",
73
+ "from gensim import models\n",
74
+ "from gensim.models import CoherenceModel\n",
75
+ "\n",
76
+ "# from wordcloud import WordCloud, ImageColorGenerator\n",
77
+ "import matplotlib.pyplot as plt\n",
78
+ "import seaborn as sns\n",
79
+ "import pandas as pd\n",
80
+ "import re\n",
81
+ "import os\n",
82
+ "import glob\n",
83
+ "import json\n",
84
+ "\n",
85
+ "import pickle\n",
86
+ "from datetime import datetime\n",
87
+ "import datetime\n",
88
+ "from newspaper import Article\n",
89
+ "from gnews import GNews"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": 3,
95
+ "id": "a9c6ed07-a7d0-4aaa-b571-038919c75e05",
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "# Function to get full news article by headline\n",
100
+ "def get_news_article(headline):\n",
101
+ " gnews = GNews()\n",
102
+ " articles = gnews.get_news(headline)\n",
103
+ "\n",
104
+ " if articles:\n",
105
+ " for article in articles:\n",
106
+ " # Each search result is expected to carry the article link under its \"url\" key\n",
107
+ " try:\n",
108
+ " article = Article(article[\"url\"])\n",
109
+ " article.download()\n",
110
+ " article.parse()\n",
111
+ " return article.url, article.title, article.text\n",
112
+ " except Exception as e:\n",
113
+ " # link cannot be scraped\n",
114
+ " # print(\"get_news_article - exception:\", e)\n",
115
+ " return [\n",
116
+ " f\"cannot scrape the url: {e}\",\n",
117
+ " \"cannot scrape the title\",\n",
118
+ " \"cannot scrape the content\",\n",
119
+ " ]\n",
120
+ " # No articles found for the given title.\n",
121
+ " else:\n",
122
+ " return [\"no url found\", \"no title found\", \"no content found\"]"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": 4,
128
+ "id": "0acaf899-cd96-4424-b825-c71bf042355a",
129
+ "metadata": {
130
+ "scrolled": true
131
+ },
132
+ "outputs": [],
133
+ "source": [
134
+ "# Read the CSV file with news titles\n",
135
+ "csv_file_path = \"data/cleaned_data.csv\"\n",
136
+ "df = pd.read_csv(csv_file_path)"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 5,
142
+ "id": "465acb47-5f57-4d90-a272-7972d56e002b",
143
+ "metadata": {},
144
+ "outputs": [
145
+ {
146
+ "data": {
147
+ "text/plain": [
148
+ "(5782, 17)"
149
+ ]
150
+ },
151
+ "execution_count": 5,
152
+ "metadata": {},
153
+ "output_type": "execute_result"
154
+ }
155
+ ],
156
+ "source": [
157
+ "df.shape"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 6,
163
+ "id": "801d892d-3b87-4ecf-805e-3441815d00f1",
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "# drop empty news\n",
168
+ "df.dropna(subset=[\"Headline\"], inplace=True)"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": 7,
174
+ "id": "a65a8eaf",
175
+ "metadata": {},
176
+ "outputs": [
177
+ {
178
+ "data": {
179
+ "text/plain": [
180
+ "(5780, 17)"
181
+ ]
182
+ },
183
+ "execution_count": 7,
184
+ "metadata": {},
185
+ "output_type": "execute_result"
186
+ }
187
+ ],
188
+ "source": [
189
+ "df.shape"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 8,
195
+ "id": "1267bf68-4726-49bf-ae21-9fed09682945",
196
+ "metadata": {},
197
+ "outputs": [
198
+ {
199
+ "data": {
200
+ "text/plain": [
201
+ "70"
202
+ ]
203
+ },
204
+ "execution_count": 8,
205
+ "metadata": {},
206
+ "output_type": "execute_result"
207
+ }
208
+ ],
209
+ "source": [
210
+ "df[[\"Year\", \"Headline\", \"Region\"]].duplicated().sum()"
211
+ ]
212
+ },
213
+ {
214
+ "cell_type": "code",
215
+ "execution_count": 9,
216
+ "id": "51e0f672",
217
+ "metadata": {},
218
+ "outputs": [
219
+ {
220
+ "data": {
221
+ "text/plain": [
222
+ "(5710, 17)"
223
+ ]
224
+ },
225
+ "execution_count": 9,
226
+ "metadata": {},
227
+ "output_type": "execute_result"
228
+ }
229
+ ],
230
+ "source": [
231
+ "df_uni = df.drop_duplicates(subset=[\"Year\", \"Headline\", \"Region\"], keep=\"first\")\n",
232
+ "df_uni.shape"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 10,
238
+ "id": "f3c35a3c-d852-4285-86bf-3f7db7df78e7",
239
+ "metadata": {
240
+ "scrolled": true
241
+ },
242
+ "outputs": [
243
+ {
244
+ "name": "stderr",
245
+ "output_type": "stream",
246
+ "text": [
247
+ " 13%|█▎ | 757/5710 [27:31<3:29:09, 2.53s/it] /Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/dateutil/parser/_parser.py:1207: UnknownTimezoneWarning: tzname PST identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n",
248
+ " warnings.warn(\"tzname {tzname} identified but not understood. \"\n",
249
+ " 28%|██▊ | 1588/5710 [57:29<3:22:58, 2.95s/it] /Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/dateutil/parser/_parser.py:1207: UnknownTimezoneWarning: tzname PDT identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n",
250
+ " warnings.warn(\"tzname {tzname} identified but not understood. \"\n",
251
+ " 68%|██████▊ | 3862/5710 [2:24:11<1:04:55, 2.11s/it] 06/29/2024 07:38:31 PM - newspaper ignoring lxml node error: Input object is not an XML element: HtmlProcessingInstruction\n",
252
+ " 70%|██████▉ | 3978/5710 [2:27:39<1:44:23, 3.62s/it]Building prefix dict from /Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/jieba/dict.txt ...\n",
253
+ "06/29/2024 07:41:56 PM - Building prefix dict from /Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/jieba/dict.txt ...\n",
254
+ "Loading model from cache /var/folders/7x/56svhln929zdh2xhr3mwqg4r0000gn/T/jieba.cache\n",
255
+ "06/29/2024 07:41:56 PM - Loading model from cache /var/folders/7x/56svhln929zdh2xhr3mwqg4r0000gn/T/jieba.cache\n",
256
+ "Loading model cost 0.28935670852661133 seconds.\n",
257
+ "06/29/2024 07:41:56 PM - Loading model cost 0.28935670852661133 seconds.\n",
258
+ "Prefix dict has been built succesfully.\n",
259
+ "06/29/2024 07:41:56 PM - Prefix dict has been built succesfully.\n",
260
+ "100%|██████████| 5710/5710 [3:30:27<00:00, 2.21s/it] "
261
+ ]
262
+ },
263
+ {
264
+ "name": "stdout",
265
+ "output_type": "stream",
266
+ "text": [
267
+ "CPU times: user 16min 31s, sys: 3min 20s, total: 19min 51s\n",
268
+ "Wall time: 3h 30min 27s\n"
269
+ ]
270
+ },
271
+ {
272
+ "name": "stderr",
273
+ "output_type": "stream",
274
+ "text": [
275
+ "\n",
276
+ "<timed exec>:20: SettingWithCopyWarning: \n",
277
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
278
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
279
+ "\n",
280
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
281
+ "<timed exec>:21: SettingWithCopyWarning: \n",
282
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
283
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
284
+ "\n",
285
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
286
+ "<timed exec>:22: SettingWithCopyWarning: \n",
287
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
288
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
289
+ "\n",
290
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
291
+ ]
292
+ }
293
+ ],
294
+ "source": [
295
+ "%%time\n",
296
+ "\n",
297
+ "from tqdm import tqdm\n",
298
+ "\n",
299
+ "# Iterate through each row and get the full news article\n",
300
+ "tes = df_uni\n",
301
+ "\n",
302
+ "urls = []\n",
303
+ "titles = []\n",
304
+ "contents = []\n",
305
+ "\n",
306
+ "for index in tqdm(range(len(tes))):\n",
307
+ " row = tes.iloc[index]\n",
308
+ " headline = row[\"Headline\"]\n",
309
+ "\n",
310
+ " results = get_news_article(headline)\n",
311
+ " urls.append(results[0])\n",
312
+ " titles.append(results[1])\n",
313
+ " contents.append(results[2])\n",
314
+ "\n",
315
+ "# Update the DataFrame with the fetched data\n",
316
+ "tes[\"url\"] = urls\n",
317
+ "tes[\"title\"] = titles\n",
318
+ "tes[\"content\"] = contents"
319
+ ]
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "execution_count": 11,
324
+ "id": "66d98c3b-c1f2-42e6-ae5f-8192c5a91f44",
325
+ "metadata": {},
326
+ "outputs": [
327
+ {
328
+ "data": {
329
+ "text/html": [
330
+ "<div>\n",
331
+ "<style scoped>\n",
332
+ " .dataframe tbody tr th:only-of-type {\n",
333
+ " vertical-align: middle;\n",
334
+ " }\n",
335
+ "\n",
336
+ " .dataframe tbody tr th {\n",
337
+ " vertical-align: top;\n",
338
+ " }\n",
339
+ "\n",
340
+ " .dataframe thead th {\n",
341
+ " text-align: right;\n",
342
+ " }\n",
343
+ "</style>\n",
344
+ "<table border=\"1\" class=\"dataframe\">\n",
345
+ " <thead>\n",
346
+ " <tr style=\"text-align: right;\">\n",
347
+ " <th></th>\n",
348
+ " <th>id</th>\n",
349
+ " <th>Headline</th>\n",
350
+ " <th>Details</th>\n",
351
+ " <th>Severity</th>\n",
352
+ " <th>Category</th>\n",
353
+ " <th>Region</th>\n",
354
+ " <th>Datetime</th>\n",
355
+ " <th>Year</th>\n",
356
+ " <th>lat</th>\n",
357
+ " <th>lon</th>\n",
358
+ " <th>maritime_label</th>\n",
359
+ " <th>found_ports</th>\n",
360
+ " <th>contains_port_info</th>\n",
361
+ " <th>if_labeled</th>\n",
362
+ " <th>Month</th>\n",
363
+ " <th>Week</th>\n",
364
+ " <th>Headline_Details</th>\n",
365
+ " <th>url</th>\n",
366
+ " <th>title</th>\n",
367
+ " <th>content</th>\n",
368
+ " </tr>\n",
369
+ " </thead>\n",
370
+ " <tbody>\n",
371
+ " <tr>\n",
372
+ " <th>0</th>\n",
373
+ " <td>1.0</td>\n",
374
+ " <td>Grasberg Mine- Grasberg mine workers extend st...</td>\n",
375
+ " <td>Media sources indicate that workers at the Gra...</td>\n",
376
+ " <td>Moderate</td>\n",
377
+ " <td>Mine Workers Strike</td>\n",
378
+ " <td>Indonesia</td>\n",
379
+ " <td>28/5/17 17:08</td>\n",
380
+ " <td>2017.0</td>\n",
381
+ " <td>-4.05608</td>\n",
382
+ " <td>137.11302</td>\n",
383
+ " <td>False</td>\n",
384
+ " <td>['freeport']</td>\n",
385
+ " <td>1.0</td>\n",
386
+ " <td>False</td>\n",
387
+ " <td>5.0</td>\n",
388
+ " <td>21.0</td>\n",
389
+ " <td>Grasberg Mine- Grasberg mine workers extend st...</td>\n",
390
+ " <td>https://news.google.com/rss/articles/CBMiZ2h0d...</td>\n",
391
+ " <td>Freeport Indonesia mine workers extend strike ...</td>\n",
392
+ " <td>By Wilda Asmarini and Susan Taylor\\n\\nJAKARTA/...</td>\n",
393
+ " </tr>\n",
394
+ " <tr>\n",
395
+ " <th>1</th>\n",
396
+ " <td>2.0</td>\n",
397
+ " <td>Indonesia: Undersea internet cables damaged by...</td>\n",
398
+ " <td>News sources are stating that recent typhoons ...</td>\n",
399
+ " <td>Minor</td>\n",
400
+ " <td>Travel Warning</td>\n",
401
+ " <td>Indonesia</td>\n",
402
+ " <td>4/9/17 14:30</td>\n",
403
+ " <td>2017.0</td>\n",
404
+ " <td>NaN</td>\n",
405
+ " <td>NaN</td>\n",
406
+ " <td>False</td>\n",
407
+ " <td>['hong kong']</td>\n",
408
+ " <td>1.0</td>\n",
409
+ " <td>False</td>\n",
410
+ " <td>4.0</td>\n",
411
+ " <td>14.0</td>\n",
412
+ " <td>Indonesia: Undersea internet cables damaged by...</td>\n",
413
+ " <td>no url found</td>\n",
414
+ " <td>no title found</td>\n",
415
+ " <td>no content found</td>\n",
416
+ " </tr>\n",
417
+ " <tr>\n",
418
+ " <th>2</th>\n",
419
+ " <td>3.0</td>\n",
420
+ " <td>Shanghai port congestion impacts terminals in ...</td>\n",
421
+ " <td>The persisting port congestion at Shanghai’s Y...</td>\n",
422
+ " <td>Minor</td>\n",
423
+ " <td>Port Congestion</td>\n",
424
+ " <td>China</td>\n",
425
+ " <td>27/4/17 9:16</td>\n",
426
+ " <td>2017.0</td>\n",
427
+ " <td>29.52000</td>\n",
428
+ " <td>121.33190</td>\n",
429
+ " <td>True</td>\n",
430
+ " <td>['ningbo', 'qingdao', 'shanghai']</td>\n",
431
+ " <td>1.0</td>\n",
432
+ " <td>False</td>\n",
433
+ " <td>4.0</td>\n",
434
+ " <td>17.0</td>\n",
435
+ " <td>Shanghai port congestion impacts terminals in ...</td>\n",
436
+ " <td>https://news.google.com/rss/articles/CBMiZmh0d...</td>\n",
437
+ " <td>Bad weather causing delays, congestion at majo...</td>\n",
438
+ " <td>LA-LB ports investing $25 million in charging ...</td>\n",
439
+ " </tr>\n",
440
+ " <tr>\n",
441
+ " <th>3</th>\n",
442
+ " <td>4.0</td>\n",
443
+ " <td>UPDATE - Indonesia: Explosion at KP Terminal i...</td>\n",
444
+ " <td>Updated local media sources from Jakarta indic...</td>\n",
445
+ " <td>Extreme</td>\n",
446
+ " <td>Bombing, Police Operations</td>\n",
447
+ " <td>Indonesia</td>\n",
448
+ " <td>24/5/17 15:15</td>\n",
449
+ " <td>2017.0</td>\n",
450
+ " <td>-6.22465</td>\n",
451
+ " <td>106.86700</td>\n",
452
+ " <td>True</td>\n",
453
+ " <td>['jakarta']</td>\n",
454
+ " <td>1.0</td>\n",
455
+ " <td>False</td>\n",
456
+ " <td>5.0</td>\n",
457
+ " <td>21.0</td>\n",
458
+ " <td>UPDATE - Indonesia: Explosion at KP Terminal i...</td>\n",
459
+ " <td>no url found</td>\n",
460
+ " <td>no title found</td>\n",
461
+ " <td>no content found</td>\n",
462
+ " </tr>\n",
463
+ " <tr>\n",
464
+ " <th>4</th>\n",
465
+ " <td>5.0</td>\n",
466
+ " <td>UPDATE - Indonesia: Police confirm two explosi...</td>\n",
467
+ " <td>According to local police in Jakarta, two expl...</td>\n",
468
+ " <td>Extreme</td>\n",
469
+ " <td>Bombing, Police Operations</td>\n",
470
+ " <td>Indonesia</td>\n",
471
+ " <td>24/5/17 16:20</td>\n",
472
+ " <td>2017.0</td>\n",
473
+ " <td>NaN</td>\n",
474
+ " <td>NaN</td>\n",
475
+ " <td>True</td>\n",
476
+ " <td>['jakarta']</td>\n",
477
+ " <td>1.0</td>\n",
478
+ " <td>True</td>\n",
479
+ " <td>5.0</td>\n",
480
+ " <td>21.0</td>\n",
481
+ " <td>UPDATE - Indonesia: Police confirm two explosi...</td>\n",
482
+ " <td>https://news.google.com/rss/articles/CBMiV2h0d...</td>\n",
483
+ " <td>Three dead confirmed in Jakarta double bombing</td>\n",
484
+ " <td>Two explosions strike bus station in Indonesia...</td>\n",
485
+ " </tr>\n",
486
+ " <tr>\n",
487
+ " <th>...</th>\n",
488
+ " <td>...</td>\n",
489
+ " <td>...</td>\n",
490
+ " <td>...</td>\n",
491
+ " <td>...</td>\n",
492
+ " <td>...</td>\n",
493
+ " <td>...</td>\n",
494
+ " <td>...</td>\n",
495
+ " <td>...</td>\n",
496
+ " <td>...</td>\n",
497
+ " <td>...</td>\n",
498
+ " <td>...</td>\n",
499
+ " <td>...</td>\n",
500
+ " <td>...</td>\n",
501
+ " <td>...</td>\n",
502
+ " <td>...</td>\n",
503
+ " <td>...</td>\n",
504
+ " <td>...</td>\n",
505
+ " <td>...</td>\n",
506
+ " <td>...</td>\n",
507
+ " <td>...</td>\n",
508
+ " </tr>\n",
509
+ " <tr>\n",
510
+ " <th>5777</th>\n",
511
+ " <td>5776.0</td>\n",
512
+ " <td>Winter storm may bring inclement weather condi...</td>\n",
513
+ " <td>Intelligence received by Everstream Analytics ...</td>\n",
514
+ " <td>Moderate</td>\n",
515
+ " <td>Ice Storm</td>\n",
516
+ " <td>United States</td>\n",
517
+ " <td>16/12/20 2:47</td>\n",
518
+ " <td>2020.0</td>\n",
519
+ " <td>38.90072</td>\n",
520
+ " <td>-77.05440</td>\n",
521
+ " <td>False</td>\n",
522
+ " <td>['new york']</td>\n",
523
+ " <td>1.0</td>\n",
524
+ " <td>False</td>\n",
525
+ " <td>12.0</td>\n",
526
+ " <td>51.0</td>\n",
527
+ " <td>Winter storm may bring inclement weather condi...</td>\n",
528
+ " <td>https://news.google.com/rss/articles/CBMiWWh0d...</td>\n",
529
+ " <td>Winter Storm Warnings posted for parts of Nort...</td>\n",
530
+ " <td>A coastal storm is set to deliver another slop...</td>\n",
531
+ " </tr>\n",
532
+ " <tr>\n",
533
+ " <th>5778</th>\n",
534
+ " <td>5777.0</td>\n",
535
+ " <td>Winter weather expected to continue to impact ...</td>\n",
536
+ " <td>Meteorological sources indicate that a series ...</td>\n",
537
+ " <td>Minor</td>\n",
538
+ " <td>Roadway Closure / Disruption, Ground Transport...</td>\n",
539
+ " <td>United States</td>\n",
540
+ " <td>31/12/20 18:15</td>\n",
541
+ " <td>2020.0</td>\n",
542
+ " <td>41.30357</td>\n",
543
+ " <td>-72.90561</td>\n",
544
+ " <td>False</td>\n",
545
+ " <td>['new york', 'virginia']</td>\n",
546
+ " <td>1.0</td>\n",
547
+ " <td>True</td>\n",
548
+ " <td>12.0</td>\n",
549
+ " <td>53.0</td>\n",
550
+ " <td>Winter weather expected to continue to impact ...</td>\n",
551
+ " <td>https://news.google.com/rss/articles/CBMiT2h0d...</td>\n",
552
+ " <td>Deadly nor'easter wreaks havoc in New England ...</td>\n",
553
+ " <td>BOSTON – A deadly late-season nor’easter that ...</td>\n",
554
+ " </tr>\n",
555
+ " <tr>\n",
556
+ " <th>5779</th>\n",
557
+ " <td>5778.0</td>\n",
558
+ " <td>Workers of Svitzer Australia plan to strike on...</td>\n",
559
+ " <td>Industry sources report on December 7 that Svi...</td>\n",
560
+ " <td>Moderate</td>\n",
561
+ " <td>Industrial Action</td>\n",
562
+ " <td>Australia</td>\n",
563
+ " <td>7/12/20 6:16</td>\n",
564
+ " <td>2020.0</td>\n",
565
+ " <td>-38.35169</td>\n",
566
+ " <td>145.25050</td>\n",
567
+ " <td>False</td>\n",
568
+ " <td>['geelong', 'melbourne']</td>\n",
569
+ " <td>1.0</td>\n",
570
+ " <td>False</td>\n",
571
+ " <td>7.0</td>\n",
572
+ " <td>28.0</td>\n",
573
+ " <td>Workers of Svitzer Australia plan to strike on...</td>\n",
574
+ " <td>no url found</td>\n",
575
+ " <td>no title found</td>\n",
576
+ " <td>no content found</td>\n",
577
+ " </tr>\n",
578
+ " <tr>\n",
579
+ " <th>5780</th>\n",
580
+ " <td>5779.0</td>\n",
581
+ " <td>Workers stage 24-hour stoppage at DP World Ter...</td>\n",
582
+ " <td>Industry sources indicate on December 14 that ...</td>\n",
583
+ " <td>Moderate</td>\n",
584
+ " <td>Port Strike</td>\n",
585
+ " <td>Australia</td>\n",
586
+ " <td>14/12/20 16:52</td>\n",
587
+ " <td>2020.0</td>\n",
588
+ " <td>NaN</td>\n",
589
+ " <td>NaN</td>\n",
590
+ " <td>True</td>\n",
591
+ " <td>['fremantle']</td>\n",
592
+ " <td>1.0</td>\n",
593
+ " <td>True</td>\n",
594
+ " <td>12.0</td>\n",
595
+ " <td>51.0</td>\n",
596
+ " <td>Workers stage 24-hour stoppage at DP World Ter...</td>\n",
597
+ " <td>https://news.google.com/rss/articles/CBMiOWh0d...</td>\n",
598
+ " <td>Union shuts down strikes as DP World threatens...</td>\n",
599
+ " <td>The Maritime Union of Australia (MUA) has curt...</td>\n",
600
+ " </tr>\n",
601
+ " <tr>\n",
602
+ " <th>5781</th>\n",
603
+ " <td>5780.0</td>\n",
604
+ " <td>Workers with Ertsoverslagbedrijf Europoort C.V...</td>\n",
605
+ " <td>On November 17, Dutch media sources reported t...</td>\n",
606
+ " <td>Minor</td>\n",
607
+ " <td>Port Strike</td>\n",
608
+ " <td>Netherlands</td>\n",
609
+ " <td>18/11/20 17:55</td>\n",
610
+ " <td>2020.0</td>\n",
611
+ " <td>51.96121</td>\n",
612
+ " <td>4.10550</td>\n",
613
+ " <td>False</td>\n",
614
+ " <td>['rotterdam']</td>\n",
615
+ " <td>1.0</td>\n",
616
+ " <td>True</td>\n",
617
+ " <td>11.0</td>\n",
618
+ " <td>47.0</td>\n",
619
+ " <td>Workers with Ertsoverslagbedrijf Europoort C.V...</td>\n",
620
+ " <td>no url found</td>\n",
621
+ " <td>no title found</td>\n",
622
+ " <td>no content found</td>\n",
623
+ " </tr>\n",
624
+ " </tbody>\n",
625
+ "</table>\n",
626
+ "<p>5710 rows × 20 columns</p>\n",
627
+ "</div>"
628
+ ],
629
+ "text/plain": [
630
+ " id Headline \\\n",
631
+ "0 1.0 Grasberg Mine- Grasberg mine workers extend st... \n",
632
+ "1 2.0 Indonesia: Undersea internet cables damaged by... \n",
633
+ "2 3.0 Shanghai port congestion impacts terminals in ... \n",
634
+ "3 4.0 UPDATE - Indonesia: Explosion at KP Terminal i... \n",
635
+ "4 5.0 UPDATE - Indonesia: Police confirm two explosi... \n",
636
+ "... ... ... \n",
637
+ "5777 5776.0 Winter storm may bring inclement weather condi... \n",
638
+ "5778 5777.0 Winter weather expected to continue to impact ... \n",
639
+ "5779 5778.0 Workers of Svitzer Australia plan to strike on... \n",
640
+ "5780 5779.0 Workers stage 24-hour stoppage at DP World Ter... \n",
641
+ "5781 5780.0 Workers with Ertsoverslagbedrijf Europoort C.V... \n",
642
+ "\n",
643
+ " Details Severity \\\n",
644
+ "0 Media sources indicate that workers at the Gra... Moderate \n",
645
+ "1 News sources are stating that recent typhoons ... Minor \n",
646
+ "2 The persisting port congestion at Shanghai’s Y... Minor \n",
647
+ "3 Updated local media sources from Jakarta indic... Extreme \n",
648
+ "4 According to local police in Jakarta, two expl... Extreme \n",
649
+ "... ... ... \n",
650
+ "5777 Intelligence received by Everstream Analytics ... Moderate \n",
651
+ "5778 Meteorological sources indicate that a series ... Minor \n",
652
+ "5779 Industry sources report on December 7 that Svi... Moderate \n",
653
+ "5780 Industry sources indicate on December 14 that ... Moderate \n",
654
+ "5781 On November 17, Dutch media sources reported t... Minor \n",
655
+ "\n",
656
+ " Category Region \\\n",
657
+ "0 Mine Workers Strike Indonesia \n",
658
+ "1 Travel Warning Indonesia \n",
659
+ "2 Port Congestion China \n",
660
+ "3 Bombing, Police Operations Indonesia \n",
661
+ "4 Bombing, Police Operations Indonesia \n",
662
+ "... ... ... \n",
663
+ "5777 Ice Storm United States \n",
664
+ "5778 Roadway Closure / Disruption, Ground Transport... United States \n",
665
+ "5779 Industrial Action Australia \n",
666
+ "5780 Port Strike Australia \n",
667
+ "5781 Port Strike Netherlands \n",
668
+ "\n",
669
+ " Datetime Year lat lon maritime_label \\\n",
670
+ "0 28/5/17 17:08 2017.0 -4.05608 137.11302 False \n",
671
+ "1 4/9/17 14:30 2017.0 NaN NaN False \n",
672
+ "2 27/4/17 9:16 2017.0 29.52000 121.33190 True \n",
673
+ "3 24/5/17 15:15 2017.0 -6.22465 106.86700 True \n",
674
+ "4 24/5/17 16:20 2017.0 NaN NaN True \n",
675
+ "... ... ... ... ... ... \n",
676
+ "5777 16/12/20 2:47 2020.0 38.90072 -77.05440 False \n",
677
+ "5778 31/12/20 18:15 2020.0 41.30357 -72.90561 False \n",
678
+ "5779 7/12/20 6:16 2020.0 -38.35169 145.25050 False \n",
679
+ "5780 14/12/20 16:52 2020.0 NaN NaN True \n",
680
+ "5781 18/11/20 17:55 2020.0 51.96121 4.10550 False \n",
681
+ "\n",
682
+ " found_ports contains_port_info if_labeled Month \\\n",
683
+ "0 ['freeport'] 1.0 False 5.0 \n",
684
+ "1 ['hong kong'] 1.0 False 4.0 \n",
685
+ "2 ['ningbo', 'qingdao', 'shanghai'] 1.0 False 4.0 \n",
686
+ "3 ['jakarta'] 1.0 False 5.0 \n",
687
+ "4 ['jakarta'] 1.0 True 5.0 \n",
688
+ "... ... ... ... ... \n",
689
+ "5777 ['new york'] 1.0 False 12.0 \n",
690
+ "5778 ['new york', 'virginia'] 1.0 True 12.0 \n",
691
+ "5779 ['geelong', 'melbourne'] 1.0 False 7.0 \n",
692
+ "5780 ['fremantle'] 1.0 True 12.0 \n",
693
+ "5781 ['rotterdam'] 1.0 True 11.0 \n",
694
+ "\n",
695
+ " Week Headline_Details \\\n",
696
+ "0 21.0 Grasberg Mine- Grasberg mine workers extend st... \n",
697
+ "1 14.0 Indonesia: Undersea internet cables damaged by... \n",
698
+ "2 17.0 Shanghai port congestion impacts terminals in ... \n",
699
+ "3 21.0 UPDATE - Indonesia: Explosion at KP Terminal i... \n",
700
+ "4 21.0 UPDATE - Indonesia: Police confirm two explosi... \n",
701
+ "... ... ... \n",
702
+ "5777 51.0 Winter storm may bring inclement weather condi... \n",
703
+ "5778 53.0 Winter weather expected to continue to impact ... \n",
704
+ "5779 28.0 Workers of Svitzer Australia plan to strike on... \n",
705
+ "5780 51.0 Workers stage 24-hour stoppage at DP World Ter... \n",
706
+ "5781 47.0 Workers with Ertsoverslagbedrijf Europoort C.V... \n",
707
+ "\n",
708
+ " url \\\n",
709
+ "0 https://news.google.com/rss/articles/CBMiZ2h0d... \n",
710
+ "1 no url found \n",
711
+ "2 https://news.google.com/rss/articles/CBMiZmh0d... \n",
712
+ "3 no url found \n",
713
+ "4 https://news.google.com/rss/articles/CBMiV2h0d... \n",
714
+ "... ... \n",
715
+ "5777 https://news.google.com/rss/articles/CBMiWWh0d... \n",
716
+ "5778 https://news.google.com/rss/articles/CBMiT2h0d... \n",
717
+ "5779 no url found \n",
718
+ "5780 https://news.google.com/rss/articles/CBMiOWh0d... \n",
719
+ "5781 no url found \n",
720
+ "\n",
721
+ " title \\\n",
722
+ "0 Freeport Indonesia mine workers extend strike ... \n",
723
+ "1 no title found \n",
724
+ "2 Bad weather causing delays, congestion at majo... \n",
725
+ "3 no title found \n",
726
+ "4 Three dead confirmed in Jakarta double bombing \n",
727
+ "... ... \n",
728
+ "5777 Winter Storm Warnings posted for parts of Nort... \n",
729
+ "5778 Deadly nor'easter wreaks havoc in New England ... \n",
730
+ "5779 no title found \n",
731
+ "5780 Union shuts down strikes as DP World threatens... \n",
732
+ "5781 no title found \n",
733
+ "\n",
734
+ " content \n",
735
+ "0 By Wilda Asmarini and Susan Taylor\\n\\nJAKARTA/... \n",
736
+ "1 no content found \n",
737
+ "2 LA-LB ports investing $25 million in charging ... \n",
738
+ "3 no content found \n",
739
+ "4 Two explosions strike bus station in Indonesia... \n",
740
+ "... ... \n",
741
+ "5777 A coastal storm is set to deliver another slop... \n",
742
+ "5778 BOSTON – A deadly late-season nor’easter that ... \n",
743
+ "5779 no content found \n",
744
+ "5780 The Maritime Union of Australia (MUA) has curt... \n",
745
+ "5781 no content found \n",
746
+ "\n",
747
+ "[5710 rows x 20 columns]"
748
+ ]
749
+ },
750
+ "execution_count": 11,
751
+ "metadata": {},
752
+ "output_type": "execute_result"
753
+ }
754
+ ],
755
+ "source": [
756
+ "tes"
757
+ ]
758
+ },
759
+ {
760
+ "cell_type": "code",
761
+ "execution_count": 12,
762
+ "id": "d6fe260c-1bbb-4556-b9ac-2b9b62c5ba5f",
763
+ "metadata": {},
764
+ "outputs": [
765
+ {
766
+ "data": {
767
+ "text/plain": [
768
+ "3218"
769
+ ]
770
+ },
771
+ "execution_count": 12,
772
+ "metadata": {},
773
+ "output_type": "execute_result"
774
+ }
775
+ ],
776
+ "source": [
777
+ "count_rows = tes[\n",
778
+ " ~tes[\"content\"].isin([\"cannot scrape the content\", \"no content found\"])\n",
779
+ "].shape[0]\n",
780
+ "count_rows"
781
+ ]
782
+ },
783
+ {
784
+ "cell_type": "code",
785
+ "execution_count": 13,
786
+ "id": "6818fe7b-6e7d-4d30-8ee8-866b29fd40c8",
787
+ "metadata": {},
788
+ "outputs": [
789
+ {
790
+ "data": {
791
+ "text/plain": [
792
+ "2132"
793
+ ]
794
+ },
795
+ "execution_count": 13,
796
+ "metadata": {},
797
+ "output_type": "execute_result"
798
+ }
799
+ ],
800
+ "source": [
801
+ "count_rows = tes[tes[\"content\"].isin([\"no content found\"])].shape[0]\n",
802
+ "count_rows"
803
+ ]
804
+ },
805
+ {
806
+ "cell_type": "code",
807
+ "execution_count": 14,
808
+ "id": "f59e2ce4-072a-43e2-a247-fd399c388c2d",
809
+ "metadata": {},
810
+ "outputs": [
811
+ {
812
+ "data": {
813
+ "text/plain": [
814
+ "360"
815
+ ]
816
+ },
817
+ "execution_count": 14,
818
+ "metadata": {},
819
+ "output_type": "execute_result"
820
+ }
821
+ ],
822
+ "source": [
823
+ "count_rows = tes[tes[\"content\"].isin([\"cannot scrape the content\"])].shape[0]\n",
824
+ "count_rows"
825
+ ]
826
+ },
827
+ {
828
+ "cell_type": "code",
829
+ "execution_count": 15,
830
+ "id": "8063b964",
831
+ "metadata": {},
832
+ "outputs": [],
833
+ "source": [
834
+ "scrapped_df1 = pd.read_parquet(\"data/scrapped_data1.parquet\")"
835
+ ]
836
+ },
837
+ {
838
+ "cell_type": "code",
839
+ "execution_count": 16,
840
+ "id": "78896dd3",
841
+ "metadata": {},
842
+ "outputs": [
843
+ {
844
+ "data": {
845
+ "text/plain": [
846
+ "(5710, 20)"
847
+ ]
848
+ },
849
+ "execution_count": 16,
850
+ "metadata": {},
851
+ "output_type": "execute_result"
852
+ }
853
+ ],
854
+ "source": [
855
+ "scrapped_df1.shape"
856
+ ]
857
+ },
858
+ {
859
+ "cell_type": "code",
860
+ "execution_count": 17,
861
+ "id": "316d6be5",
862
+ "metadata": {},
863
+ "outputs": [
864
+ {
865
+ "name": "stderr",
866
+ "output_type": "stream",
867
+ "text": [
868
+ "100%|██████████| 5710/5710 [00:00<00:00, 35312.10it/s]"
869
+ ]
870
+ },
871
+ {
872
+ "name": "stdout",
873
+ "output_type": "stream",
874
+ "text": [
875
+ "CPU times: user 162 ms, sys: 3.81 ms, total: 166 ms\n",
876
+ "Wall time: 165 ms\n"
877
+ ]
878
+ },
879
+ {
880
+ "name": "stderr",
881
+ "output_type": "stream",
882
+ "text": [
883
+ "\n",
884
+ "<timed exec>:23: SettingWithCopyWarning: \n",
885
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
886
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
887
+ "\n",
888
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
889
+ "<timed exec>:24: SettingWithCopyWarning: \n",
890
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
891
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
892
+ "\n",
893
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
894
+ "<timed exec>:25: SettingWithCopyWarning: \n",
895
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
896
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
897
+ "\n",
898
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
899
+ ]
900
+ }
901
+ ],
902
+ "source": [
903
+ "%%time\n",
904
+ "\n",
905
+ "from tqdm import tqdm\n",
906
+ "\n",
907
+ "urls = []\n",
908
+ "titles = []\n",
909
+ "contents = []\n",
910
+ "\n",
911
+ "not_found = [\"cannot scrape the content\", \"no content found\", None]\n",
912
+ "\n",
913
+ "for index in tqdm(range(len(tes))):\n",
914
+ " row1 = scrapped_df1.iloc[index]\n",
915
+ " row2 = tes.iloc[index]\n",
916
+ " \n",
917
+ " if row1[\"content\"] in not_found and not row2[\"content\"] in not_found and row2[\"title\"] is not None:\n",
918
+ " row = row2\n",
919
+ " else:\n",
920
+ " row = row1\n",
921
+ "\n",
922
+ " urls.append(row[\"url\"])\n",
923
+ " titles.append(row[\"title\"])\n",
924
+ " contents.append(row[\"content\"])\n",
925
+ "\n",
926
+ "# Update the DataFrame with the merged url/title/content values selected above\n",
927
+ "tes[\"url\"] = urls\n",
928
+ "tes[\"title\"] = titles\n",
929
+ "tes[\"content\"] = contents"
930
+ ]
931
+ },
932
+ {
933
+ "cell_type": "code",
934
+ "execution_count": 18,
935
+ "id": "4926f410",
936
+ "metadata": {},
937
+ "outputs": [
938
+ {
939
+ "data": {
940
+ "text/plain": [
941
+ "3694"
942
+ ]
943
+ },
944
+ "execution_count": 18,
945
+ "metadata": {},
946
+ "output_type": "execute_result"
947
+ }
948
+ ],
949
+ "source": [
950
+ "count_rows = tes[~tes[\"content\"].isin(not_found)].shape[0]\n",
951
+ "count_rows"
952
+ ]
953
+ },
954
+ {
955
+ "cell_type": "code",
956
+ "execution_count": 19,
957
+ "id": "44bff271",
958
+ "metadata": {},
959
+ "outputs": [
960
+ {
961
+ "data": {
962
+ "text/plain": [
963
+ "0.6469352014010508"
964
+ ]
965
+ },
966
+ "execution_count": 19,
967
+ "metadata": {},
968
+ "output_type": "execute_result"
969
+ }
970
+ ],
971
+ "source": [
972
+ "count_rows / len(tes)"
973
+ ]
974
+ },
975
+ {
976
+ "cell_type": "code",
977
+ "execution_count": 20,
978
+ "id": "e573b48a",
979
+ "metadata": {},
980
+ "outputs": [
981
+ {
982
+ "data": {
983
+ "text/plain": [
984
+ "1945"
985
+ ]
986
+ },
987
+ "execution_count": 20,
988
+ "metadata": {},
989
+ "output_type": "execute_result"
990
+ }
991
+ ],
992
+ "source": [
993
+ "count_rows = tes[tes[\"content\"].isin([\"no content found\"])].shape[0]\n",
994
+ "count_rows"
995
+ ]
996
+ },
997
+ {
998
+ "cell_type": "code",
999
+ "execution_count": 21,
1000
+ "id": "d4a527b3",
1001
+ "metadata": {},
1002
+ "outputs": [
1003
+ {
1004
+ "data": {
1005
+ "text/plain": [
1006
+ "71"
1007
+ ]
1008
+ },
1009
+ "execution_count": 21,
1010
+ "metadata": {},
1011
+ "output_type": "execute_result"
1012
+ }
1013
+ ],
1014
+ "source": [
1015
+ "count_rows = tes[tes[\"content\"].isin([\"cannot scrape the content\"])].shape[0]\n",
1016
+ "count_rows"
1017
+ ]
1018
+ },
1019
+ {
1020
+ "cell_type": "code",
1021
+ "execution_count": 22,
1022
+ "id": "158acaec",
1023
+ "metadata": {},
1024
+ "outputs": [
1025
+ {
1026
+ "name": "stderr",
1027
+ "output_type": "stream",
1028
+ "text": [
1029
+ "/var/folders/7x/56svhln929zdh2xhr3mwqg4r0000gn/T/ipykernel_64033/1183224327.py:1: SettingWithCopyWarning: \n",
1030
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
1031
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
1032
+ "\n",
1033
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
1034
+ " tes[\"id\"] = tes[\"id\"].astype(int)\n"
1035
+ ]
1036
+ }
1037
+ ],
1038
+ "source": [
1039
+ "tes[\"id\"] = tes[\"id\"].astype(int)"
1040
+ ]
1041
+ },
1042
+ {
1043
+ "cell_type": "code",
1044
+ "execution_count": 23,
1045
+ "id": "154aada9-8e4a-479a-8288-a63b72ae86ad",
1046
+ "metadata": {},
1047
+ "outputs": [],
1048
+ "source": [
1049
+ "tes.to_csv(\"data/scrapped_data2.csv\", index=False)"
1050
+ ]
1051
+ }
1052
+ ],
1053
+ "metadata": {
1054
+ "kernelspec": {
1055
+ "display_name": "Python 3",
1056
+ "language": "python",
1057
+ "name": "python3"
1058
+ },
1059
+ "language_info": {
1060
+ "codemirror_mode": {
1061
+ "name": "ipython",
1062
+ "version": 3
1063
+ },
1064
+ "file_extension": ".py",
1065
+ "mimetype": "text/x-python",
1066
+ "name": "python",
1067
+ "nbconvert_exporter": "python",
1068
+ "pygments_lexer": "ipython3",
1069
+ "version": "3.12.4"
1070
+ }
1071
+ },
1072
+ "nbformat": 4,
1073
+ "nbformat_minor": 5
1074
+ }
notebooks/06_basic_text_preprocessing_on_scraped_data.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/07_topic_modelling_minor.ipynb ADDED
@@ -0,0 +1,1564 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "46322fb5-5918-4b70-9689-9e0781439ac4",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents\n",
14
+ "loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env\n"
15
+ ]
16
+ },
17
+ {
18
+ "data": {
19
+ "text/plain": [
20
+ "True"
21
+ ]
22
+ },
23
+ "execution_count": 1,
24
+ "metadata": {},
25
+ "output_type": "execute_result"
26
+ }
27
+ ],
28
+ "source": [
29
+ "%load_ext autoreload\n",
30
+ "%autoreload 2\n",
31
+ "\n",
32
+ "import os\n",
33
+ "import sys\n",
34
+ "from pathlib import Path\n",
35
+ "\n",
36
+ "workding_dir = str(Path.cwd().parent)\n",
37
+ "os.chdir(workding_dir)\n",
38
+ "sys.path.append(workding_dir)\n",
39
+ "print(\"workding dir:\", workding_dir)\n",
40
+ "\n",
41
+ "from dotenv import find_dotenv, load_dotenv\n",
42
+ "\n",
43
+ "found_dotenv = find_dotenv(\".env\")\n",
44
+ "\n",
45
+ "if len(found_dotenv) == 0:\n",
46
+ " found_dotenv = find_dotenv(\".env.example\")\n",
47
+ "print(f\"loading env vars from: {found_dotenv}\")\n",
48
+ "load_dotenv(found_dotenv, override=True)"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 2,
54
+ "id": "daf1e3d1-75ac-4299-8bed-2f413a49f9a6",
55
+ "metadata": {
56
+ "tags": []
57
+ },
58
+ "outputs": [],
59
+ "source": [
60
+ "import nltk\n",
61
+ "from nltk.tokenize import sent_tokenize\n",
62
+ "from nltk.tokenize import word_tokenize\n",
63
+ "\n",
64
+ "import gensim\n",
65
+ "from gensim import corpora\n",
66
+ "from gensim import similarities\n",
67
+ "from gensim import models\n",
68
+ "from gensim.models import CoherenceModel\n",
69
+ "\n",
70
+ "# from wordcloud import WordCloud, ImageColorGenerator\n",
71
+ "import matplotlib.pyplot as plt\n",
72
+ "import seaborn as sns\n",
73
+ "import pandas as pd\n",
74
+ "import re\n",
75
+ "import os\n",
76
+ "import datetime\n",
77
+ "\n",
78
+ "import warnings\n",
79
+ "warnings.filterwarnings('ignore')\n",
80
+ "\n",
81
+ "from pprint import pprint\n",
82
+ "import pyLDAvis\n",
83
+ "import pyLDAvis.gensim_models as gensimvis"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "markdown",
88
+ "id": "49e6de6b-71bd-4948-8827-52601406058f",
89
+ "metadata": {},
90
+ "source": [
91
+ "# Import Data"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": 3,
97
+ "id": "49222182-7811-4fa6-8c0a-21d3a546863e",
98
+ "metadata": {},
99
+ "outputs": [],
100
+ "source": [
101
+ "df = pd.read_parquet('data/processed_data2.parquet')"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 4,
107
+ "id": "3fb59a30",
108
+ "metadata": {},
109
+ "outputs": [
110
+ {
111
+ "data": {
112
+ "text/html": [
113
+ "<div>\n",
114
+ "<style scoped>\n",
115
+ " .dataframe tbody tr th:only-of-type {\n",
116
+ " vertical-align: middle;\n",
117
+ " }\n",
118
+ "\n",
119
+ " .dataframe tbody tr th {\n",
120
+ " vertical-align: top;\n",
121
+ " }\n",
122
+ "\n",
123
+ " .dataframe thead th {\n",
124
+ " text-align: right;\n",
125
+ " }\n",
126
+ "</style>\n",
127
+ "<table border=\"1\" class=\"dataframe\">\n",
128
+ " <thead>\n",
129
+ " <tr style=\"text-align: right;\">\n",
130
+ " <th></th>\n",
131
+ " <th>id</th>\n",
132
+ " <th>Headline</th>\n",
133
+ " <th>Details</th>\n",
134
+ " <th>Severity</th>\n",
135
+ " <th>Category</th>\n",
136
+ " <th>Region</th>\n",
137
+ " <th>Datetime</th>\n",
138
+ " <th>Year</th>\n",
139
+ " <th>lat</th>\n",
140
+ " <th>lon</th>\n",
141
+ " <th>...</th>\n",
142
+ " <th>if_labeled</th>\n",
143
+ " <th>Month</th>\n",
144
+ " <th>Week</th>\n",
145
+ " <th>Headline_Details</th>\n",
146
+ " <th>url</th>\n",
147
+ " <th>title</th>\n",
148
+ " <th>content</th>\n",
149
+ " <th>cleaned_content</th>\n",
150
+ " <th>binary_content</th>\n",
151
+ " <th>word_count</th>\n",
152
+ " </tr>\n",
153
+ " </thead>\n",
154
+ " <tbody>\n",
155
+ " <tr>\n",
156
+ " <th>0</th>\n",
157
+ " <td>1</td>\n",
158
+ " <td>Grasberg Mine- Grasberg mine workers extend st...</td>\n",
159
+ " <td>Media sources indicate that workers at the Gra...</td>\n",
160
+ " <td>Moderate</td>\n",
161
+ " <td>Mine Workers Strike</td>\n",
162
+ " <td>Indonesia</td>\n",
163
+ " <td>28/5/17 17:08</td>\n",
164
+ " <td>2017.0</td>\n",
165
+ " <td>-4.05608</td>\n",
166
+ " <td>137.11302</td>\n",
167
+ " <td>...</td>\n",
168
+ " <td>False</td>\n",
169
+ " <td>5.0</td>\n",
170
+ " <td>21.0</td>\n",
171
+ " <td>Grasberg Mine- Grasberg mine workers extend st...</td>\n",
172
+ " <td>https://news.google.com/rss/articles/CBMiZ2h0d...</td>\n",
173
+ " <td>Freeport Indonesia mine workers extend strike ...</td>\n",
174
+ " <td>Trucks are seen on a road in the Grasberg copp...</td>\n",
175
+ " <td>[truck, be, see, on, road, in, grasberg, coppe...</td>\n",
176
+ " <td>[adkerson_jakarta_try, agreement_freeport_indo...</td>\n",
177
+ " <td>53</td>\n",
178
+ " </tr>\n",
179
+ " <tr>\n",
180
+ " <th>1</th>\n",
181
+ " <td>3</td>\n",
182
+ " <td>Shanghai port congestion impacts terminals in ...</td>\n",
183
+ " <td>The persisting port congestion at Shanghai’s Y...</td>\n",
184
+ " <td>Minor</td>\n",
185
+ " <td>Port Congestion</td>\n",
186
+ " <td>China</td>\n",
187
+ " <td>27/4/17 9:16</td>\n",
188
+ " <td>2017.0</td>\n",
189
+ " <td>29.52000</td>\n",
190
+ " <td>121.33190</td>\n",
191
+ " <td>...</td>\n",
192
+ " <td>False</td>\n",
193
+ " <td>4.0</td>\n",
194
+ " <td>17.0</td>\n",
195
+ " <td>Shanghai port congestion impacts terminals in ...</td>\n",
196
+ " <td>https://news.google.com/rss/articles/CBMiVWh0d...</td>\n",
197
+ " <td>Typhoon Muifa to shut China ports for second t...</td>\n",
198
+ " <td>By Sam Whelan 13/09/2022\\n\\nAnother typhoon ha...</td>\n",
199
+ " <td>[by, sam, whelan, typhoon, have, prompt, port,...</td>\n",
200
+ " <td>[additional_ripple_effect, avoid_path_typhoon,...</td>\n",
201
+ " <td>44</td>\n",
202
+ " </tr>\n",
203
+ " <tr>\n",
204
+ " <th>2</th>\n",
205
+ " <td>5</td>\n",
206
+ " <td>UPDATE - Indonesia: Police confirm two explosi...</td>\n",
207
+ " <td>According to local police in Jakarta, two expl...</td>\n",
208
+ " <td>Extreme</td>\n",
209
+ " <td>Bombing, Police Operations</td>\n",
210
+ " <td>Indonesia</td>\n",
211
+ " <td>24/5/17 16:20</td>\n",
212
+ " <td>2017.0</td>\n",
213
+ " <td>NaN</td>\n",
214
+ " <td>NaN</td>\n",
215
+ " <td>...</td>\n",
216
+ " <td>True</td>\n",
217
+ " <td>5.0</td>\n",
218
+ " <td>21.0</td>\n",
219
+ " <td>UPDATE - Indonesia: Police confirm two explosi...</td>\n",
220
+ " <td>https://news.google.com/rss/articles/CBMiZWh0d...</td>\n",
221
+ " <td>Jakarta Police Receive 2 More Reports on Coldp...</td>\n",
222
+ " <td>TEMPO.CO, Jakarta - South Jakarta Metro Police...</td>\n",
223
+ " <td>[jakarta, south, jakarta, metro, police, recei...</td>\n",
224
+ " <td>[actress_accord, available_day_concert, click_...</td>\n",
225
+ " <td>24</td>\n",
226
+ " </tr>\n",
227
+ " <tr>\n",
228
+ " <th>3</th>\n",
229
+ " <td>6</td>\n",
230
+ " <td>UPDATE - Indonesia: Severe winds damage infras...</td>\n",
231
+ " <td>Severe winds have downed billboards and trees ...</td>\n",
232
+ " <td>Moderate</td>\n",
233
+ " <td>Roadway Closure / Disruption, Flooding, Severe...</td>\n",
234
+ " <td>Indonesia</td>\n",
235
+ " <td>19/4/17 9:10</td>\n",
236
+ " <td>2017.0</td>\n",
237
+ " <td>-6.91264</td>\n",
238
+ " <td>107.65700</td>\n",
239
+ " <td>...</td>\n",
240
+ " <td>True</td>\n",
241
+ " <td>4.0</td>\n",
242
+ " <td>16.0</td>\n",
243
+ " <td>UPDATE - Indonesia: Severe winds damage infras...</td>\n",
244
+ " <td>https://news.google.com/rss/articles/CBMiSWh0d...</td>\n",
245
+ " <td>Indonesia hit by some of strongest winds recorded</td>\n",
246
+ " <td>A man stands near damaged houses following a t...</td>\n",
247
+ " <td>[man, stand, near, damage, house, follow, torn...</td>\n",
248
+ " <td>[bbc_indonesia, climatologist_government_resea...</td>\n",
249
+ " <td>28</td>\n",
250
+ " </tr>\n",
251
+ " <tr>\n",
252
+ " <th>4</th>\n",
253
+ " <td>14</td>\n",
254
+ " <td>2 miles E of Chesterfield - A tornado has touc...</td>\n",
255
+ " <td>Government sources are reporting a tornado has...</td>\n",
256
+ " <td>Minor</td>\n",
257
+ " <td>Tornado</td>\n",
258
+ " <td>United States</td>\n",
259
+ " <td>17/9/18 19:55</td>\n",
260
+ " <td>2018.0</td>\n",
261
+ " <td>37.51000</td>\n",
262
+ " <td>-77.61000</td>\n",
263
+ " <td>...</td>\n",
264
+ " <td>True</td>\n",
265
+ " <td>9.0</td>\n",
266
+ " <td>38.0</td>\n",
267
+ " <td>2 miles E of Chesterfield - A tornado has touc...</td>\n",
268
+ " <td>https://news.google.com/rss/articles/CBMigAFod...</td>\n",
269
+ " <td>UPDATE: Number of homes without power down to ...</td>\n",
270
+ " <td>More than 90,000 homes and businesses across t...</td>\n",
271
+ " <td>[more, than, home, business, across, richmond,...</td>\n",
272
+ " <td>[advise_seek_alternate, affect_richmond, alter...</td>\n",
273
+ " <td>134</td>\n",
274
+ " </tr>\n",
275
+ " </tbody>\n",
276
+ "</table>\n",
277
+ "<p>5 rows × 23 columns</p>\n",
278
+ "</div>"
279
+ ],
280
+ "text/plain": [
281
+ " id Headline \\\n",
282
+ "0 1 Grasberg Mine- Grasberg mine workers extend st... \n",
283
+ "1 3 Shanghai port congestion impacts terminals in ... \n",
284
+ "2 5 UPDATE - Indonesia: Police confirm two explosi... \n",
285
+ "3 6 UPDATE - Indonesia: Severe winds damage infras... \n",
286
+ "4 14 2 miles E of Chesterfield - A tornado has touc... \n",
287
+ "\n",
288
+ " Details Severity \\\n",
289
+ "0 Media sources indicate that workers at the Gra... Moderate \n",
290
+ "1 The persisting port congestion at Shanghai’s Y... Minor \n",
291
+ "2 According to local police in Jakarta, two expl... Extreme \n",
292
+ "3 Severe winds have downed billboards and trees ... Moderate \n",
293
+ "4 Government sources are reporting a tornado has... Minor \n",
294
+ "\n",
295
+ " Category Region \\\n",
296
+ "0 Mine Workers Strike Indonesia \n",
297
+ "1 Port Congestion China \n",
298
+ "2 Bombing, Police Operations Indonesia \n",
299
+ "3 Roadway Closure / Disruption, Flooding, Severe... Indonesia \n",
300
+ "4 Tornado United States \n",
301
+ "\n",
302
+ " Datetime Year lat lon ... if_labeled Month Week \\\n",
303
+ "0 28/5/17 17:08 2017.0 -4.05608 137.11302 ... False 5.0 21.0 \n",
304
+ "1 27/4/17 9:16 2017.0 29.52000 121.33190 ... False 4.0 17.0 \n",
305
+ "2 24/5/17 16:20 2017.0 NaN NaN ... True 5.0 21.0 \n",
306
+ "3 19/4/17 9:10 2017.0 -6.91264 107.65700 ... True 4.0 16.0 \n",
307
+ "4 17/9/18 19:55 2018.0 37.51000 -77.61000 ... True 9.0 38.0 \n",
308
+ "\n",
309
+ " Headline_Details \\\n",
310
+ "0 Grasberg Mine- Grasberg mine workers extend st... \n",
311
+ "1 Shanghai port congestion impacts terminals in ... \n",
312
+ "2 UPDATE - Indonesia: Police confirm two explosi... \n",
313
+ "3 UPDATE - Indonesia: Severe winds damage infras... \n",
314
+ "4 2 miles E of Chesterfield - A tornado has touc... \n",
315
+ "\n",
316
+ " url \\\n",
317
+ "0 https://news.google.com/rss/articles/CBMiZ2h0d... \n",
318
+ "1 https://news.google.com/rss/articles/CBMiVWh0d... \n",
319
+ "2 https://news.google.com/rss/articles/CBMiZWh0d... \n",
320
+ "3 https://news.google.com/rss/articles/CBMiSWh0d... \n",
321
+ "4 https://news.google.com/rss/articles/CBMigAFod... \n",
322
+ "\n",
323
+ " title \\\n",
324
+ "0 Freeport Indonesia mine workers extend strike ... \n",
325
+ "1 Typhoon Muifa to shut China ports for second t... \n",
326
+ "2 Jakarta Police Receive 2 More Reports on Coldp... \n",
327
+ "3 Indonesia hit by some of strongest winds recorded \n",
328
+ "4 UPDATE: Number of homes without power down to ... \n",
329
+ "\n",
330
+ " content \\\n",
331
+ "0 Trucks are seen on a road in the Grasberg copp... \n",
332
+ "1 By Sam Whelan 13/09/2022\\n\\nAnother typhoon ha... \n",
333
+ "2 TEMPO.CO, Jakarta - South Jakarta Metro Police... \n",
334
+ "3 A man stands near damaged houses following a t... \n",
335
+ "4 More than 90,000 homes and businesses across t... \n",
336
+ "\n",
337
+ " cleaned_content \\\n",
338
+ "0 [truck, be, see, on, road, in, grasberg, coppe... \n",
339
+ "1 [by, sam, whelan, typhoon, have, prompt, port,... \n",
340
+ "2 [jakarta, south, jakarta, metro, police, recei... \n",
341
+ "3 [man, stand, near, damage, house, follow, torn... \n",
342
+ "4 [more, than, home, business, across, richmond,... \n",
343
+ "\n",
344
+ " binary_content word_count \n",
345
+ "0 [adkerson_jakarta_try, agreement_freeport_indo... 53 \n",
346
+ "1 [additional_ripple_effect, avoid_path_typhoon,... 44 \n",
347
+ "2 [actress_accord, available_day_concert, click_... 24 \n",
348
+ "3 [bbc_indonesia, climatologist_government_resea... 28 \n",
349
+ "4 [advise_seek_alternate, affect_richmond, alter... 134 \n",
350
+ "\n",
351
+ "[5 rows x 23 columns]"
352
+ ]
353
+ },
354
+ "execution_count": 4,
355
+ "metadata": {},
356
+ "output_type": "execute_result"
357
+ }
358
+ ],
359
+ "source": [
360
+ "df.head()"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": 5,
366
+ "id": "09113e88-66cc-414c-a953-da04db83c4ae",
367
+ "metadata": {},
368
+ "outputs": [
369
+ {
370
+ "data": {
371
+ "text/plain": [
372
+ "(3681, 23)"
373
+ ]
374
+ },
375
+ "execution_count": 5,
376
+ "metadata": {},
377
+ "output_type": "execute_result"
378
+ }
379
+ ],
380
+ "source": [
381
+ "df.shape"
382
+ ]
383
+ },
384
+ {
385
+ "cell_type": "markdown",
386
+ "id": "037e74fc-bbcd-43e3-8346-799920cca8d8",
387
+ "metadata": {},
388
+ "source": [
389
+ "# Vectorisation"
390
+ ]
391
+ },
392
+ {
393
+ "cell_type": "markdown",
394
+ "id": "d67cef3a-59fb-4dd8-adc8-2cf288b90728",
395
+ "metadata": {},
396
+ "source": [
397
+ "NLP vectorization refers to the process of converting text data into numerical vectors that machine learning algorithms can understand and process. \n",
398
+ "\n",
399
+ "Bag-of-Words (BoW) is used here; it represents text as a collection of unique words along with their frequencies. Each word is assigned an index, and the vector contains the count of each word present in the document."
400
+ ]
401
+ },
402
+ {
403
+ "cell_type": "code",
404
+ "execution_count": 6,
405
+ "id": "c95b7b8a-9767-469d-812d-c9a9d9fee0e9",
406
+ "metadata": {},
407
+ "outputs": [],
408
+ "source": [
409
+ "df_copy = df.copy()"
410
+ ]
411
+ },
412
+ {
413
+ "cell_type": "code",
414
+ "execution_count": 7,
415
+ "id": "dfb2001e-04c1-49dc-b423-a64ea47af5a9",
416
+ "metadata": {},
417
+ "outputs": [],
418
+ "source": [
419
+ "# choose only the minor-severity cases for modelling\n",
420
+ "cleaned = df_copy[df_copy['Severity'].isin(['Minor'])]\n",
421
+ "cleaned.reset_index(drop=True, inplace=True)"
422
+ ]
423
+ },
424
+ {
425
+ "cell_type": "code",
426
+ "execution_count": 8,
427
+ "id": "3da09b6a-65c6-4f40-9a21-e0b798318ca5",
428
+ "metadata": {},
429
+ "outputs": [
430
+ {
431
+ "data": {
432
+ "text/plain": [
433
+ "(1620, 23)"
434
+ ]
435
+ },
436
+ "execution_count": 8,
437
+ "metadata": {},
438
+ "output_type": "execute_result"
439
+ }
440
+ ],
441
+ "source": [
442
+ "cleaned.shape"
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "execution_count": 9,
448
+ "id": "de71c523-a59e-44b2-aa96-5f17d872c9c6",
449
+ "metadata": {},
450
+ "outputs": [],
451
+ "source": [
452
+ "headline = cleaned.cleaned_content"
453
+ ]
454
+ },
455
+ {
456
+ "cell_type": "code",
457
+ "execution_count": 10,
458
+ "id": "5b1e34e1",
459
+ "metadata": {},
460
+ "outputs": [
461
+ {
462
+ "data": {
463
+ "text/plain": [
464
+ "array(['man', 'be', 'seriously', 'injure', 'after', 'boat', 'catch',\n",
465
+ " 'fire', 'in', 'sydney', 'marina', 'today', 'emergency', 'service',\n",
466
+ " 'be', 'call', 'birkenhead', 'point', 'marina', 'in', 'drummoyne',\n",
467
+ " 'shortly', 'after', 'follow', 'report', 'of', 'explosion', 'on',\n",
468
+ " 'boat', 'careflight', 's', 'rapid', 'response', 'helicopter', 'be',\n",
469
+ " 'task', 'at', 'crew', 'include', 'doctor', 'intensive', 'care',\n",
470
+ " 'paramedic', 'fly', 'drummoyne', 'land', 'in', 'nearby', 'brett',\n",
471
+ " 'park', 'just', 'minute', 'late', 'careflight', 'boat', 'catch',\n",
472
+ " 'fire', 'near', 'birkenhead', 'point', 'shopping', 'outlet',\n",
473
+ " 'simon', 'r', 'supply', 'reader', 'image', 'of', 'boat', 'on',\n",
474
+ " 'fire', 'at', 'marina', 'in', 'drummoyne', 'simon', 'r', 'supply',\n",
475
+ " 'three', 'nsw', 'ambulance', 'crew', 'careflight', 'chopper',\n",
476
+ " 'attend', 'scene', 'find', 'man', 'suffer', 'serious', 'burn',\n",
477
+ " 'legs', 'arm', 'shoulder', 'man', 'suffer', 'burn', 'percent',\n",
478
+ " 'of', 'body', 'boat', 'reportedly', 'explode', 'into', 'flame',\n",
479
+ " 'shortly', 'before', 'morning', 'paramedic', 'treat', 'man', 'at',\n",
480
+ " 'scene', 'with', 'burn', 'percent', 'of', 'body', 'firefighting',\n",
481
+ " 'boat', 'be', 'deploy', 'help', 'extinguish', 'blaze', 'man', 'be',\n",
482
+ " 'treat', 'on', 'scene', 'be', 'surround', 'by', 'family', 'be',\n",
483
+ " 'not', 'onboard', 'boat', 'at', 'time'], dtype=object)"
484
+ ]
485
+ },
486
+ "execution_count": 10,
487
+ "metadata": {},
488
+ "output_type": "execute_result"
489
+ }
490
+ ],
491
+ "source": [
492
+ "headline[5]"
493
+ ]
494
+ },
495
+ {
496
+ "cell_type": "code",
497
+ "execution_count": 11,
498
+ "id": "677055b4-978e-4253-90f4-3f903662e225",
499
+ "metadata": {
500
+ "tags": []
501
+ },
502
+ "outputs": [],
503
+ "source": [
504
+ "# vectorise the words\n",
505
+ "doc_dict = gensim.corpora.Dictionary(headline)\n",
506
+ "docs_vecs = [doc_dict.doc2bow(doc) for doc in headline]"
507
+ ]
508
+ },
509
+ {
510
+ "cell_type": "code",
511
+ "execution_count": 12,
512
+ "id": "a54d1768-b069-4936-a156-deaf0b506d93",
513
+ "metadata": {
514
+ "tags": []
515
+ },
516
+ "outputs": [
517
+ {
518
+ "name": "stdout",
519
+ "output_type": "stream",
520
+ "text": [
521
+ "Number of unique tokens: 35192\n",
522
+ "Number of articles: 1620\n"
523
+ ]
524
+ }
525
+ ],
526
+ "source": [
527
+ "print('Number of unique tokens: %d' % len(doc_dict)) \n",
528
+ "print('Number of articles: %d' % len(docs_vecs)) "
529
+ ]
530
+ },
531
+ {
532
+ "cell_type": "code",
533
+ "execution_count": 13,
534
+ "id": "9147fa86-1503-4252-bd9b-92fea1e6a926",
535
+ "metadata": {
536
+ "scrolled": true,
537
+ "tags": []
538
+ },
539
+ "outputs": [
540
+ {
541
+ "name": "stdout",
542
+ "output_type": "stream",
543
+ "text": [
544
+ "[('be', 58147),\n",
545
+ " ('of', 53324),\n",
546
+ " ('in', 42710),\n",
547
+ " ('for', 19846),\n",
548
+ " ('on', 18899),\n",
549
+ " ('have', 18769),\n",
550
+ " ('with', 13029),\n",
551
+ " ('as', 12079),\n",
552
+ " ('port', 11548),\n",
553
+ " ('from', 11033),\n",
554
+ " ('at', 10589),\n",
555
+ " ('by', 10527),\n",
556
+ " ('s', 8527),\n",
557
+ " ('will', 8465),\n",
558
+ " ('that', 7825),\n",
559
+ " ('say', 7752),\n",
560
+ " ('not', 5664),\n",
561
+ " ('china', 5654),\n",
562
+ " ('day', 5304),\n",
563
+ " ('more', 5266),\n",
564
+ " ('new', 5198),\n",
565
+ " ('service', 5001),\n",
566
+ " ('also', 4707),\n",
567
+ " ('ship', 4693),\n",
568
+ " ('time', 4651),\n",
569
+ " ('than', 3673),\n",
570
+ " ('year', 3669),\n",
571
+ " ('trade', 3518),\n",
572
+ " ('can', 3504),\n",
573
+ " ('state', 3378),\n",
574
+ " ('between', 3226),\n",
575
+ " ('include', 3220),\n",
576
+ " ('over', 3173),\n",
577
+ " ('supply', 3094),\n",
578
+ " ('do', 3083),\n",
579
+ " ('vessel', 3079),\n",
580
+ " ('report', 2988),\n",
581
+ " ('country', 2979),\n",
582
+ " ('after', 2962),\n",
583
+ " ('area', 2913),\n",
584
+ " ('cargo', 2908),\n",
585
+ " ('one', 2905),\n",
586
+ " ('people', 2868),\n",
587
+ " ('other', 2866),\n",
588
+ " ('increase', 2852),\n",
589
+ " ('continue', 2787),\n",
590
+ " ('market', 2766),\n",
591
+ " ('container', 2765),\n",
592
+ " ('strike', 2713),\n",
593
+ " ('chain', 2708),\n",
594
+ " ('expect', 2656),\n",
595
+ " ('work', 2624),\n",
596
+ " ('while', 2590),\n",
597
+ " ('high', 2579),\n",
598
+ " ('about', 2514),\n",
599
+ " ('http', 2489),\n",
600
+ " ('into', 2470),\n",
601
+ " ('would', 2461),\n",
602
+ " ('remain', 2448),\n",
603
+ " ('pm', 2448),\n",
604
+ " ('government', 2437),\n",
605
+ " ('take', 2425),\n",
606
+ " ('two', 2349),\n",
607
+ " ('terminal', 2348),\n",
608
+ " ('may', 2332),\n",
609
+ " ('due', 2325),\n",
610
+ " ('use', 2313),\n",
611
+ " ('update', 2310),\n",
612
+ " ('city', 2295),\n",
613
+ " ('train', 2290),\n",
614
+ " ('make', 2278),\n",
615
+ " ('info', 2253),\n",
616
+ " ('demand', 2240),\n",
617
+ " ('world', 2219),\n",
618
+ " ('asia', 2194),\n",
619
+ " ('south', 2160),\n",
620
+ " ('company', 2148),\n",
621
+ " ('march', 2125),\n",
622
+ " ('see', 2101),\n",
623
+ " ('large', 2097),\n",
624
+ " ('customer', 2081),\n",
625
+ " ('through', 2042),\n",
626
+ " ('most', 2041),\n",
627
+ " ('plan', 2038),\n",
628
+ " ('global', 2031),\n",
629
+ " ('first', 2011),\n",
630
+ " ('line', 1962),\n",
631
+ " ('north', 1961),\n",
632
+ " ('could', 1948),\n",
633
+ " ('operation', 1931),\n",
634
+ " ('good', 1928),\n",
635
+ " ('if', 1906),\n",
636
+ " ('across', 1893),\n",
637
+ " ('coast', 1887),\n",
638
+ " ('business', 1863),\n",
639
+ " ('when', 1825),\n",
640
+ " ('week', 1813),\n",
641
+ " ('during', 1805),\n",
642
+ " ('last', 1771),\n",
643
+ " ('million', 1770)]\n"
644
+ ]
645
+ }
646
+ ],
647
+ "source": [
648
+ "# Calculate word frequencies\n",
649
+ "word_frequencies = {doc_dict[word_id]: freq for word_id, freq in doc_dict.cfs.items()}\n",
650
+ "sorted_words = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)\n",
651
+ "\n",
652
+ "pprint(sorted_words[:100])"
653
+ ]
654
+ },
655
+ {
656
+ "cell_type": "markdown",
657
+ "id": "5ed78239-2ce1-4784-a8f4-4c7438c8627b",
658
+ "metadata": {},
659
+ "source": [
660
+ "# LDA Modelling"
661
+ ]
662
+ },
663
+ {
664
+ "cell_type": "markdown",
665
+ "id": "9db83273-461d-4f70-b23f-ec967579d94f",
666
+ "metadata": {},
667
+ "source": [
668
+ "## Benchmark Model"
669
+ ]
670
+ },
671
+ {
672
+ "cell_type": "code",
673
+ "execution_count": 14,
674
+ "id": "e6d577bd-9936-4d45-be90-345af2eb4827",
675
+ "metadata": {
676
+ "tags": []
677
+ },
678
+ "outputs": [],
679
+ "source": [
680
+ "# Build LDA benchmark model\n",
681
+ "lda_model = gensim.models.LdaMulticore(corpus=docs_vecs,\n",
682
+ " id2word=doc_dict,\n",
683
+ " num_topics=4, \n",
684
+ " random_state=42,\n",
685
+ " chunksize=100,\n",
686
+ " passes=10,\n",
687
+ " per_word_topics=True)"
688
+ ]
689
+ },
690
+ {
691
+ "cell_type": "code",
692
+ "execution_count": 15,
693
+ "id": "c4f1521f-5f43-40d2-a3a3-a8ac2ca6fec2",
694
+ "metadata": {
695
+ "tags": []
696
+ },
697
+ "outputs": [
698
+ {
699
+ "name": "stdout",
700
+ "output_type": "stream",
701
+ "text": [
702
+ "[(0,\n",
703
+ " '0.042*\"of\" + 0.030*\"in\" + 0.026*\"be\" + 0.012*\"china\" + 0.012*\"on\" + '\n",
704
+ " '0.011*\"s\" + 0.011*\"for\" + 0.011*\"have\" + 0.010*\"that\" + 0.009*\"as\"'),\n",
705
+ " (1,\n",
706
+ " '0.028*\"be\" + 0.028*\"of\" + 0.018*\"in\" + 0.011*\"de\" + 0.009*\"y\" + 0.009*\"for\" '\n",
707
+ " '+ 0.008*\"by\" + 0.007*\"http\" + 0.007*\"on\" + 0.007*\"have\"'),\n",
708
+ " (2,\n",
709
+ " '0.041*\"be\" + 0.033*\"of\" + 0.028*\"in\" + 0.015*\"on\" + 0.013*\"have\" + '\n",
710
+ " '0.013*\"for\" + 0.009*\"at\" + 0.009*\"say\" + 0.009*\"as\" + 0.008*\"with\"'),\n",
711
+ " (3,\n",
712
+ " '0.039*\"be\" + 0.029*\"of\" + 0.026*\"in\" + 0.018*\"port\" + 0.013*\"for\" + '\n",
713
+ " '0.011*\"have\" + 0.009*\"on\" + 0.009*\"with\" + 0.009*\"day\" + 0.008*\"from\"')]\n"
714
+ ]
715
+ }
716
+ ],
717
+ "source": [
718
+ "from pprint import pprint\n",
719
+ "\n",
720
+ "# Print the top 10 keywords of each topic\n",
721
+ "pprint(lda_model.print_topics())\n",
722
+ "doc_lda = lda_model[docs_vecs]"
723
+ ]
724
+ },
725
+ {
726
+ "cell_type": "code",
727
+ "execution_count": 16,
728
+ "id": "fd57b1f4-a6cd-41e8-964f-d8a1d30aa3c9",
729
+ "metadata": {
730
+ "tags": []
731
+ },
732
+ "outputs": [
733
+ {
734
+ "name": "stdout",
735
+ "output_type": "stream",
736
+ "text": [
737
+ "\n",
738
+ "Coherence Score LDAModel: 0.2757800922361482\n"
739
+ ]
740
+ }
741
+ ],
742
+ "source": [
743
+ "# Compute Benchmark Coherence Score\n",
744
+ "coherence_model_lda = CoherenceModel(model=lda_model, texts=headline, dictionary=doc_dict, coherence='c_v')\n",
745
+ "coherence_lda = coherence_model_lda.get_coherence()\n",
746
+ "print('\\nCoherence Score LDAModel: ', coherence_lda)"
747
+ ]
748
+ },
749
+ {
750
+ "cell_type": "code",
751
+ "execution_count": 17,
752
+ "id": "152e5a3a-7afe-4fb8-a02f-d7492ad80936",
753
+ "metadata": {
754
+ "tags": []
755
+ },
756
+ "outputs": [
757
+ {
758
+ "name": "stdout",
759
+ "output_type": "stream",
760
+ "text": [
761
+ "\n",
762
+ "Perplexity for LDAModel: -7.407722316633226\n"
763
+ ]
764
+ }
765
+ ],
766
+ "source": [
767
+ "# Compute Benchmark Perplexity\n",
768
+ "perplex= lda_model.log_perplexity(docs_vecs, total_docs=None) #For LDAModel\n",
769
+ " # log_perplexity returns the per-word likelihood bound (higher is better); perplexity = 2**(-bound), where lower is better.\n",
770
+ "\n",
771
+ "print('\\nPerplexity for LDAModel: ', perplex)"
772
+ ]
773
+ },
774
+ {
775
+ "cell_type": "code",
776
+ "execution_count": 18,
777
+ "id": "7dd3a60a-5c6f-4249-9868-30528a5b0ac8",
778
+ "metadata": {},
779
+ "outputs": [
780
+ {
781
+ "name": "stderr",
782
+ "output_type": "stream",
783
+ "text": [
784
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
785
+ " pid = os.fork()\n",
786
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
787
+ " pid = os.fork()\n",
788
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
789
+ " pid = os.fork()\n",
790
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
791
+ " pid = os.fork()\n",
792
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
793
+ " pid = os.fork()\n",
794
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
795
+ " pid = os.fork()\n",
796
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
797
+ " pid = os.fork()\n",
798
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
799
+ " pid = os.fork()\n",
800
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
801
+ " pid = os.fork()\n",
802
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
803
+ " pid = os.fork()\n",
804
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
805
+ " pid = os.fork()\n",
806
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
807
+ " pid = os.fork()\n",
808
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
809
+ " pid = os.fork()\n",
810
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
811
+ " pid = os.fork()\n",
812
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=39505) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
813
+ " pid = os.fork()\n"
814
+ ]
815
+ }
816
+ ],
817
+ "source": [
818
+ "from pprint import pprint\n",
819
+ "import pyLDAvis\n",
820
+ "import pyLDAvis.gensim_models as gensimvis\n",
821
+ "\n",
822
+ "# feed the LDA model into the pyLDAvis instance\n",
823
+ "pyLDAvis.enable_notebook()\n",
824
+ "visual= gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
825
+ "\n",
826
+ "# Save the output to the html file\n",
827
+ "pyLDAvis.save_html(visual, \"data/topic_viz_benchmark_minor.html\")"
828
+ ]
829
+ },
830
+ {
831
+ "cell_type": "markdown",
832
+ "id": "1895598f-3e5f-4acd-83a6-4491cc90f695",
833
+ "metadata": {},
834
+ "source": [
835
+ "# Hyperparameter Tuning and Evaluation"
836
+ ]
837
+ },
838
+ {
839
+ "cell_type": "markdown",
840
+ "id": "47136c89-ff7b-4ac9-840f-04122fe62160",
841
+ "metadata": {},
842
+ "source": [
843
+ "Run the cells below only when re-modelling with new datasets; the whole tuning and evaluation process may take hours to run."
844
+ ]
845
+ },
846
+ {
847
+ "cell_type": "code",
848
+ "execution_count": 19,
849
+ "id": "c79ca5c4-e078-43ce-a430-8c1ed93dcd64",
850
+ "metadata": {},
851
+ "outputs": [],
852
+ "source": [
853
+ "# hyperparameter tuning (alpha and beta)\n",
854
+ "def compute_coherence_values(corpus, dictionary, k, a, b):\n",
855
+ " \n",
856
+ " lda_model = gensim.models.LdaMulticore(corpus=corpus,\n",
857
+ " id2word=dictionary,\n",
858
+ " num_topics=k, \n",
859
+ " random_state=42,\n",
860
+ " chunksize=100,\n",
861
+ " passes=10,\n",
862
+ " alpha=a,\n",
863
+ " eta=b)\n",
864
+ " \n",
865
+ " coherence_model_lda = CoherenceModel(model=lda_model, texts=headline, dictionary=doc_dict, coherence='c_v')\n",
866
+ " coherence = coherence_model_lda.get_coherence()\n",
867
+ " perplex = lda_model.log_perplexity(docs_vecs, total_docs=None) \n",
868
+ " \n",
869
+ " return coherence, perplex"
870
+ ]
871
+ },
872
+ {
873
+ "cell_type": "code",
874
+ "execution_count": 20,
875
+ "id": "1c3c8478-9336-40f2-bb30-a37db4243b67",
876
+ "metadata": {},
877
+ "outputs": [],
878
+ "source": [
879
+ "# setup\n",
880
+ "import numpy as np\n",
881
+ "\n",
882
+ "from gensim.models import CoherenceModel\n",
883
+ "\n",
884
+ "model_list = []\n",
885
+ "coherence_values = []\n",
886
+ "perplexity_values = []\n",
887
+ "model_topics = []\n",
888
+ "alpha_result = []\n",
889
+ "beta_result = []\n",
890
+ "\n",
891
+ "# topic ranges\n",
892
+ "num_topics = range(4, 13)\n",
893
+ "\n",
894
+ "# Alpha parameter\n",
895
+ "alpha = list(np.arange(0.31, 1, 0.3))\n",
896
+ "alpha.append('symmetric')\n",
897
+ "alpha.append('asymmetric')\n",
898
+ "\n",
899
+ "# Beta parameter\n",
900
+ "beta = list(np.arange(0.31, 1, 0.3))\n",
901
+ "beta.append('symmetric')"
902
+ ]
903
+ },
904
+ {
905
+ "cell_type": "markdown",
906
+ "id": "c7e6bc53-0b57-4858-879a-644eca54ddbc",
907
+ "metadata": {},
908
+ "source": [
909
+ "Rationale behind the alpha and eta hyperparameters: https://stats.stackexchange.com/questions/37405/natural-interpretation-for-lda-hyperparameters"
910
+ ]
911
+ },
912
+ {
913
+ "cell_type": "code",
914
+ "execution_count": 21,
915
+ "id": "02877b81-32df-4168-8e62-4cbca2be100b",
916
+ "metadata": {
917
+ "tags": []
918
+ },
919
+ "outputs": [
920
+ {
921
+ "name": "stdout",
922
+ "output_type": "stream",
923
+ "text": [
924
+ "Topic range: range(4, 13)\n",
925
+ "Alpha: [0.31, 0.61, 0.9099999999999999, 'symmetric', 'asymmetric']\n",
926
+ "Beta: [0.31, 0.61, 0.9099999999999999, 'symmetric']\n"
927
+ ]
928
+ }
929
+ ],
930
+ "source": [
931
+ "print(\"Topic range: \",num_topics)\n",
932
+ "print(\"Alpha: \",alpha)\n",
933
+ "print(\"Beta: \", beta)"
934
+ ]
935
+ },
936
+ {
937
+ "cell_type": "code",
938
+ "execution_count": 23,
939
+ "id": "3c1f703c-4778-467f-a12e-0c18eeb274c5",
940
+ "metadata": {},
941
+ "outputs": [
942
+ {
943
+ "name": "stdout",
944
+ "output_type": "stream",
945
+ "text": [
946
+ "2024-06-30 11:52:46.026823\n",
947
+ "#Topics: 4, CV Score: 0.26143167370104753, PV Score: -7.394241913115894, Alpha: 0.31, Beta: 0.31\n",
948
+ "#Topics: 5, CV Score: 0.29008917581081944, PV Score: -7.35821379289522, Alpha: 0.31, Beta: 0.31\n",
949
+ "#Topics: 6, CV Score: 0.3075550457683199, PV Score: -7.348746753666222, Alpha: 0.31, Beta: 0.31\n",
950
+ "#Topics: 7, CV Score: 0.27068500671158163, PV Score: -7.332448354134598, Alpha: 0.31, Beta: 0.31\n",
951
+ "#Topics: 8, CV Score: 0.2949752069963174, PV Score: -7.355618761173559, Alpha: 0.31, Beta: 0.31\n",
952
+ "#Topics: 9, CV Score: 0.2883770109840239, PV Score: -7.345756037675643, Alpha: 0.31, Beta: 0.31\n",
953
+ "#Topics: 10, CV Score: 0.30169838729877146, PV Score: -7.334658077515814, Alpha: 0.31, Beta: 0.31\n",
954
+ "#Topics: 11, CV Score: 0.28399502160009293, PV Score: -7.337863652882048, Alpha: 0.31, Beta: 0.31\n",
955
+ "#Topics: 12, CV Score: 0.3098908373731854, PV Score: -7.317278302545894, Alpha: 0.31, Beta: 0.31\n",
956
+ "#Topics: 4, CV Score: 0.26324058496974073, PV Score: -7.421960949456933, Alpha: 0.31, Beta: 0.61\n",
957
+ "#Topics: 5, CV Score: 0.28653242913009425, PV Score: -7.406328200019341, Alpha: 0.31, Beta: 0.61\n",
958
+ "#Topics: 6, CV Score: 0.29614945599419024, PV Score: -7.405829958529572, Alpha: 0.31, Beta: 0.61\n",
959
+ "#Topics: 7, CV Score: 0.26469167967575336, PV Score: -7.409695513631366, Alpha: 0.31, Beta: 0.61\n",
960
+ "#Topics: 8, CV Score: 0.29743885397540426, PV Score: -7.416190442593165, Alpha: 0.31, Beta: 0.61\n",
961
+ "#Topics: 9, CV Score: 0.29464381439032383, PV Score: -7.402408965099985, Alpha: 0.31, Beta: 0.61\n",
962
+ "#Topics: 10, CV Score: 0.34823725193043276, PV Score: -7.398638060374257, Alpha: 0.31, Beta: 0.61\n",
963
+ "#Topics: 11, CV Score: 0.31843503124612416, PV Score: -7.4064374372258275, Alpha: 0.31, Beta: 0.61\n",
964
+ "#Topics: 12, CV Score: 0.31444697036706293, PV Score: -7.405945850874236, Alpha: 0.31, Beta: 0.61\n",
965
+ "#Topics: 4, CV Score: 0.2712735526187385, PV Score: -7.4600931582453756, Alpha: 0.31, Beta: 0.9099999999999999\n",
966
+ "#Topics: 5, CV Score: 0.30311968074865997, PV Score: -7.460908353343469, Alpha: 0.31, Beta: 0.9099999999999999\n",
967
+ "#Topics: 6, CV Score: 0.29829568606376594, PV Score: -7.453422852489877, Alpha: 0.31, Beta: 0.9099999999999999\n",
968
+ "#Topics: 7, CV Score: 0.29621387204862615, PV Score: -7.46166337203672, Alpha: 0.31, Beta: 0.9099999999999999\n",
969
+ "#Topics: 8, CV Score: 0.32127327516432214, PV Score: -7.467013507843407, Alpha: 0.31, Beta: 0.9099999999999999\n",
970
+ "#Topics: 9, CV Score: 0.31985991254557, PV Score: -7.431396910423797, Alpha: 0.31, Beta: 0.9099999999999999\n",
971
+ "#Topics: 10, CV Score: 0.3224915541388905, PV Score: -7.448438398378084, Alpha: 0.31, Beta: 0.9099999999999999\n",
972
+ "#Topics: 11, CV Score: 0.3381018616555288, PV Score: -7.457876704009056, Alpha: 0.31, Beta: 0.9099999999999999\n",
973
+ "#Topics: 12, CV Score: 0.30836100224079815, PV Score: -7.45460656674408, Alpha: 0.31, Beta: 0.9099999999999999\n",
974
+ "#Topics: 4, CV Score: 0.2643367431201753, PV Score: -7.390794744679982, Alpha: 0.31, Beta: symmetric\n",
975
+ "#Topics: 5, CV Score: 0.29008917581081944, PV Score: -7.341357499253523, Alpha: 0.31, Beta: symmetric\n",
976
+ "#Topics: 6, CV Score: 0.27735837350398757, PV Score: -7.323864546372206, Alpha: 0.31, Beta: symmetric\n",
977
+ "#Topics: 7, CV Score: 0.27207166533136756, PV Score: -7.298831876196072, Alpha: 0.31, Beta: symmetric\n",
978
+ "#Topics: 8, CV Score: 0.29681559070931296, PV Score: -7.286549304258785, Alpha: 0.31, Beta: symmetric\n",
979
+ "#Topics: 9, CV Score: 0.29778017634331727, PV Score: -7.271904110257489, Alpha: 0.31, Beta: symmetric\n",
980
+ "#Topics: 10, CV Score: 0.3012188333925358, PV Score: -7.263682235965553, Alpha: 0.31, Beta: symmetric\n",
981
+ "#Topics: 11, CV Score: 0.30281626874462, PV Score: -7.267991091418852, Alpha: 0.31, Beta: symmetric\n",
982
+ "#Topics: 12, CV Score: 0.31307471307490264, PV Score: -7.263340871172593, Alpha: 0.31, Beta: symmetric\n",
983
+ "#Topics: 4, CV Score: 0.2631472484692373, PV Score: -7.391225160526899, Alpha: 0.61, Beta: 0.31\n",
984
+ "#Topics: 5, CV Score: 0.28117696184010554, PV Score: -7.358283237848617, Alpha: 0.61, Beta: 0.31\n",
985
+ "#Topics: 6, CV Score: 0.2946692074353743, PV Score: -7.350030009796271, Alpha: 0.61, Beta: 0.31\n",
986
+ "#Topics: 7, CV Score: 0.2695848086251494, PV Score: -7.332704133455395, Alpha: 0.61, Beta: 0.31\n",
987
+ "#Topics: 8, CV Score: 0.2873386785526998, PV Score: -7.365743826339281, Alpha: 0.61, Beta: 0.31\n",
988
+ "#Topics: 9, CV Score: 0.28209952470043215, PV Score: -7.352132999419965, Alpha: 0.61, Beta: 0.31\n",
989
+ "#Topics: 10, CV Score: 0.29783175477393864, PV Score: -7.343942952645343, Alpha: 0.61, Beta: 0.31\n",
990
+ "#Topics: 11, CV Score: 0.2786007151015459, PV Score: -7.345863425524115, Alpha: 0.61, Beta: 0.31\n",
991
+ "#Topics: 12, CV Score: 0.31534621656869705, PV Score: -7.331805220457104, Alpha: 0.61, Beta: 0.31\n",
992
+ "#Topics: 4, CV Score: 0.2505923687565112, PV Score: -7.416873289619847, Alpha: 0.61, Beta: 0.61\n",
993
+ "#Topics: 5, CV Score: 0.28706700596105156, PV Score: -7.403540595451488, Alpha: 0.61, Beta: 0.61\n",
994
+ "#Topics: 6, CV Score: 0.29473065024977163, PV Score: -7.4106494490711015, Alpha: 0.61, Beta: 0.61\n",
995
+ "#Topics: 7, CV Score: 0.26820275442412866, PV Score: -7.412576639292029, Alpha: 0.61, Beta: 0.61\n",
996
+ "#Topics: 8, CV Score: 0.29025456691114637, PV Score: -7.41531726726418, Alpha: 0.61, Beta: 0.61\n",
997
+ "#Topics: 9, CV Score: 0.29873861091584003, PV Score: -7.410097381301386, Alpha: 0.61, Beta: 0.61\n",
998
+ "#Topics: 10, CV Score: 0.33489681733700966, PV Score: -7.414464176932596, Alpha: 0.61, Beta: 0.61\n",
999
+ "#Topics: 11, CV Score: 0.3136515342313888, PV Score: -7.414824058449246, Alpha: 0.61, Beta: 0.61\n",
1000
+ "#Topics: 12, CV Score: 0.29901180359094387, PV Score: -7.425251302616095, Alpha: 0.61, Beta: 0.61\n",
1001
+ "#Topics: 4, CV Score: 0.25606628747612015, PV Score: -7.461432644283255, Alpha: 0.61, Beta: 0.9099999999999999\n",
1002
+ "#Topics: 5, CV Score: 0.2981919886915416, PV Score: -7.461173440540555, Alpha: 0.61, Beta: 0.9099999999999999\n",
1003
+ "#Topics: 6, CV Score: 0.3155490657279846, PV Score: -7.456975825811057, Alpha: 0.61, Beta: 0.9099999999999999\n",
1004
+ "#Topics: 7, CV Score: 0.3083085761113492, PV Score: -7.463721193947859, Alpha: 0.61, Beta: 0.9099999999999999\n",
1005
+ "#Topics: 8, CV Score: 0.2987637794108595, PV Score: -7.463974937352514, Alpha: 0.61, Beta: 0.9099999999999999\n",
1006
+ "#Topics: 9, CV Score: 0.30637199754319, PV Score: -7.447224084336919, Alpha: 0.61, Beta: 0.9099999999999999\n",
1007
+ "#Topics: 10, CV Score: 0.34064164600584956, PV Score: -7.4608295203390345, Alpha: 0.61, Beta: 0.9099999999999999\n",
1008
+ "#Topics: 11, CV Score: 0.33431924954293984, PV Score: -7.468074450247606, Alpha: 0.61, Beta: 0.9099999999999999\n",
1009
+ "#Topics: 12, CV Score: 0.3021823876546624, PV Score: -7.474199589973487, Alpha: 0.61, Beta: 0.9099999999999999\n",
1010
+ "#Topics: 4, CV Score: 0.2592664104701984, PV Score: -7.380958290626252, Alpha: 0.61, Beta: symmetric\n",
1011
+ "#Topics: 5, CV Score: 0.29008917581081944, PV Score: -7.34406277883799, Alpha: 0.61, Beta: symmetric\n",
1012
+ "#Topics: 6, CV Score: 0.2856139513419668, PV Score: -7.332074766991939, Alpha: 0.61, Beta: symmetric\n",
1013
+ "#Topics: 7, CV Score: 0.2721558660968534, PV Score: -7.297199516455353, Alpha: 0.61, Beta: symmetric\n",
1014
+ "#Topics: 8, CV Score: 0.28791066408428134, PV Score: -7.314898967963879, Alpha: 0.61, Beta: symmetric\n",
1015
+ "#Topics: 9, CV Score: 0.2972722790111482, PV Score: -7.283806173931616, Alpha: 0.61, Beta: symmetric\n",
1016
+ "#Topics: 10, CV Score: 0.3082121035138333, PV Score: -7.29151183400522, Alpha: 0.61, Beta: symmetric\n",
1017
+ "#Topics: 11, CV Score: 0.30375150084375463, PV Score: -7.280134176808763, Alpha: 0.61, Beta: symmetric\n",
1018
+ "#Topics: 12, CV Score: 0.3026143223444013, PV Score: -7.277233833271221, Alpha: 0.61, Beta: symmetric\n",
1019
+ "#Topics: 4, CV Score: 0.2587849246611012, PV Score: -7.391745121978334, Alpha: 0.9099999999999999, Beta: 0.31\n",
1020
+ "#Topics: 5, CV Score: 0.29232718870337143, PV Score: -7.359033621013408, Alpha: 0.9099999999999999, Beta: 0.31\n",
1021
+ "#Topics: 6, CV Score: 0.298114743775785, PV Score: -7.351153460394994, Alpha: 0.9099999999999999, Beta: 0.31\n",
1022
+ "#Topics: 7, CV Score: 0.26903864999105614, PV Score: -7.334371538653122, Alpha: 0.9099999999999999, Beta: 0.31\n",
1023
+ "#Topics: 8, CV Score: 0.28622468428206693, PV Score: -7.372271837301099, Alpha: 0.9099999999999999, Beta: 0.31\n",
1024
+ "#Topics: 9, CV Score: 0.29758804437495606, PV Score: -7.358497856224319, Alpha: 0.9099999999999999, Beta: 0.31\n",
1025
+ "#Topics: 10, CV Score: 0.29746253535912703, PV Score: -7.343247363065808, Alpha: 0.9099999999999999, Beta: 0.31\n",
1026
+ "#Topics: 11, CV Score: 0.2842073825386278, PV Score: -7.3531033741821386, Alpha: 0.9099999999999999, Beta: 0.31\n",
1027
+ "#Topics: 12, CV Score: 0.32301209366405703, PV Score: -7.344357219935557, Alpha: 0.9099999999999999, Beta: 0.31\n",
1028
+ "#Topics: 4, CV Score: 0.2620085857568131, PV Score: -7.431829725624846, Alpha: 0.9099999999999999, Beta: 0.61\n",
1029
+ "#Topics: 5, CV Score: 0.3148122601459173, PV Score: -7.4118133900365955, Alpha: 0.9099999999999999, Beta: 0.61\n",
1030
+ "#Topics: 6, CV Score: 0.29369633574952747, PV Score: -7.411592442393478, Alpha: 0.9099999999999999, Beta: 0.61\n",
1031
+ "#Topics: 7, CV Score: 0.2702663182798373, PV Score: -7.408869634701454, Alpha: 0.9099999999999999, Beta: 0.61\n",
1032
+ "#Topics: 8, CV Score: 0.29626015190055904, PV Score: -7.4218524805240165, Alpha: 0.9099999999999999, Beta: 0.61\n",
1033
+ "#Topics: 9, CV Score: 0.3029984127461481, PV Score: -7.413846845692629, Alpha: 0.9099999999999999, Beta: 0.61\n",
1034
+ "#Topics: 10, CV Score: 0.3328224542610766, PV Score: -7.42022182252732, Alpha: 0.9099999999999999, Beta: 0.61\n",
1035
+ "#Topics: 11, CV Score: 0.30475127610761077, PV Score: -7.422669034180523, Alpha: 0.9099999999999999, Beta: 0.61\n",
1036
+ "#Topics: 12, CV Score: 0.30231311065405014, PV Score: -7.434846448528838, Alpha: 0.9099999999999999, Beta: 0.61\n",
1037
+ "#Topics: 4, CV Score: 0.258672617830039, PV Score: -7.469954125650798, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1038
+ "#Topics: 5, CV Score: 0.3141300753700526, PV Score: -7.465439773550294, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1039
+ "#Topics: 6, CV Score: 0.33646277127762847, PV Score: -7.459430389322742, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1040
+ "#Topics: 7, CV Score: 0.2989500059387701, PV Score: -7.464860494675141, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1041
+ "#Topics: 8, CV Score: 0.3067664143272131, PV Score: -7.460823334723379, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1042
+ "#Topics: 9, CV Score: 0.32230471519058956, PV Score: -7.462203609695669, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1043
+ "#Topics: 10, CV Score: 0.355651395118116, PV Score: -7.466906313924779, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1044
+ "#Topics: 11, CV Score: 0.3074563046801128, PV Score: -7.4762246058984125, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1045
+ "#Topics: 12, CV Score: 0.292184655691141, PV Score: -7.48462689762355, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1046
+ "#Topics: 4, CV Score: 0.27425502082773556, PV Score: -7.381959277922089, Alpha: 0.9099999999999999, Beta: symmetric\n",
1047
+ "#Topics: 5, CV Score: 0.2792475123330335, PV Score: -7.340967173529193, Alpha: 0.9099999999999999, Beta: symmetric\n",
1048
+ "#Topics: 6, CV Score: 0.2748288743412546, PV Score: -7.338412973991607, Alpha: 0.9099999999999999, Beta: symmetric\n",
1049
+ "#Topics: 7, CV Score: 0.27272711338157735, PV Score: -7.304992530741422, Alpha: 0.9099999999999999, Beta: symmetric\n",
1050
+ "#Topics: 8, CV Score: 0.27875956141406333, PV Score: -7.310583890409706, Alpha: 0.9099999999999999, Beta: symmetric\n",
1051
+ "#Topics: 9, CV Score: 0.2958113176625343, PV Score: -7.290099223904081, Alpha: 0.9099999999999999, Beta: symmetric\n",
1052
+ "#Topics: 10, CV Score: 0.30039391830183415, PV Score: -7.29361844332684, Alpha: 0.9099999999999999, Beta: symmetric\n",
1053
+ "#Topics: 11, CV Score: 0.30238837300731736, PV Score: -7.293498894544746, Alpha: 0.9099999999999999, Beta: symmetric\n",
1054
+ "#Topics: 12, CV Score: 0.30057296940845457, PV Score: -7.286728893856696, Alpha: 0.9099999999999999, Beta: symmetric\n",
1055
+ "#Topics: 4, CV Score: 0.28045576981660336, PV Score: -7.3810882171558, Alpha: symmetric, Beta: 0.31\n",
1056
+ "#Topics: 5, CV Score: 0.27516707115009786, PV Score: -7.348322538492822, Alpha: symmetric, Beta: 0.31\n",
1057
+ "#Topics: 6, CV Score: 0.30174281503830513, PV Score: -7.357661750753335, Alpha: symmetric, Beta: 0.31\n",
1058
+ "#Topics: 7, CV Score: 0.26452009940043947, PV Score: -7.332995538212589, Alpha: symmetric, Beta: 0.31\n",
1059
+ "#Topics: 8, CV Score: 0.27957941981373957, PV Score: -7.34520072112036, Alpha: symmetric, Beta: 0.31\n",
1060
+ "#Topics: 9, CV Score: 0.2921761447063516, PV Score: -7.3174465018975985, Alpha: symmetric, Beta: 0.31\n",
1061
+ "#Topics: 10, CV Score: 0.2819456892021096, PV Score: -7.3271012408060425, Alpha: symmetric, Beta: 0.31\n",
1062
+ "#Topics: 11, CV Score: 0.2858145693331318, PV Score: -7.324243303702109, Alpha: symmetric, Beta: 0.31\n",
1063
+ "#Topics: 12, CV Score: 0.3162268304900056, PV Score: -7.301149412893058, Alpha: symmetric, Beta: 0.31\n",
1064
+ "#Topics: 4, CV Score: 0.26200858575681313, PV Score: -7.435025163734385, Alpha: symmetric, Beta: 0.61\n",
1065
+ "#Topics: 5, CV Score: 0.28653242913009425, PV Score: -7.402808127824116, Alpha: symmetric, Beta: 0.61\n",
1066
+ "#Topics: 6, CV Score: 0.30034116357398466, PV Score: -7.403041768564758, Alpha: symmetric, Beta: 0.61\n",
1067
+ "#Topics: 7, CV Score: 0.27713631118988463, PV Score: -7.401296493610201, Alpha: symmetric, Beta: 0.61\n",
1068
+ "#Topics: 8, CV Score: 0.305005574283436, PV Score: -7.403063894303076, Alpha: symmetric, Beta: 0.61\n",
1069
+ "#Topics: 9, CV Score: 0.319284842602941, PV Score: -7.381097776990529, Alpha: symmetric, Beta: 0.61\n",
1070
+ "#Topics: 10, CV Score: 0.3187835569771548, PV Score: -7.392442067244151, Alpha: symmetric, Beta: 0.61\n",
1071
+ "#Topics: 11, CV Score: 0.31619116554114374, PV Score: -7.398110003036577, Alpha: symmetric, Beta: 0.61\n",
1072
+ "#Topics: 12, CV Score: 0.30620936485014044, PV Score: -7.389060738256993, Alpha: symmetric, Beta: 0.61\n",
1073
+ "#Topics: 4, CV Score: 0.25861742292536233, PV Score: -7.463031931556786, Alpha: symmetric, Beta: 0.9099999999999999\n",
1074
+ "#Topics: 5, CV Score: 0.30612131816590366, PV Score: -7.459187288641012, Alpha: symmetric, Beta: 0.9099999999999999\n",
1075
+ "#Topics: 6, CV Score: 0.29856036681931464, PV Score: -7.451419283754892, Alpha: symmetric, Beta: 0.9099999999999999\n",
1076
+ "#Topics: 7, CV Score: 0.2963932433750025, PV Score: -7.451642157595998, Alpha: symmetric, Beta: 0.9099999999999999\n",
1077
+ "#Topics: 8, CV Score: 0.3175863739616889, PV Score: -7.460112275446344, Alpha: symmetric, Beta: 0.9099999999999999\n",
1078
+ "#Topics: 9, CV Score: 0.30091910989824133, PV Score: -7.423042776887486, Alpha: symmetric, Beta: 0.9099999999999999\n",
1079
+ "#Topics: 10, CV Score: 0.3221986755554972, PV Score: -7.4413791780149126, Alpha: symmetric, Beta: 0.9099999999999999\n",
1080
+ "#Topics: 11, CV Score: 0.3400624797510634, PV Score: -7.4475561490332405, Alpha: symmetric, Beta: 0.9099999999999999\n",
1081
+ "#Topics: 12, CV Score: 0.29577010557935146, PV Score: -7.438896552351397, Alpha: symmetric, Beta: 0.9099999999999999\n",
1082
+ "#Topics: 4, CV Score: 0.2875150010691004, PV Score: -7.398657443181138, Alpha: symmetric, Beta: symmetric\n",
1083
+ "#Topics: 5, CV Score: 0.2900891758108194, PV Score: -7.337054653335996, Alpha: symmetric, Beta: symmetric\n",
1084
+ "#Topics: 6, CV Score: 0.2872501529046281, PV Score: -7.319444085859869, Alpha: symmetric, Beta: symmetric\n",
1085
+ "#Topics: 7, CV Score: 0.27501057403010526, PV Score: -7.290723834382987, Alpha: symmetric, Beta: symmetric\n",
1086
+ "#Topics: 8, CV Score: 0.2923043477830799, PV Score: -7.2829244272999825, Alpha: symmetric, Beta: symmetric\n",
1087
+ "#Topics: 9, CV Score: 0.2978356724583504, PV Score: -7.263979371448642, Alpha: symmetric, Beta: symmetric\n",
1088
+ "#Topics: 10, CV Score: 0.3098726690096235, PV Score: -7.249251508569862, Alpha: symmetric, Beta: symmetric\n",
1089
+ "#Topics: 11, CV Score: 0.31266100984189077, PV Score: -7.252673302755199, Alpha: symmetric, Beta: symmetric\n",
1090
+ "#Topics: 12, CV Score: 0.31275264383294515, PV Score: -7.254777083051176, Alpha: symmetric, Beta: symmetric\n",
1091
+ "#Topics: 4, CV Score: 0.2602975588122612, PV Score: -7.381821385441839, Alpha: asymmetric, Beta: 0.31\n",
1092
+ "#Topics: 5, CV Score: 0.2792475123330335, PV Score: -7.350616172255018, Alpha: asymmetric, Beta: 0.31\n",
1093
+ "#Topics: 6, CV Score: 0.3015492641821192, PV Score: -7.349068241683937, Alpha: asymmetric, Beta: 0.31\n",
1094
+ "#Topics: 7, CV Score: 0.2668593619258581, PV Score: -7.334972757155578, Alpha: asymmetric, Beta: 0.31\n",
1095
+ "#Topics: 8, CV Score: 0.2916111243636439, PV Score: -7.3504381054918735, Alpha: asymmetric, Beta: 0.31\n",
1096
+ "#Topics: 9, CV Score: 0.3104237788212182, PV Score: -7.3330296284925875, Alpha: asymmetric, Beta: 0.31\n",
1097
+ "#Topics: 10, CV Score: 0.3282155213609229, PV Score: -7.336602404389545, Alpha: asymmetric, Beta: 0.31\n",
1098
+ "#Topics: 11, CV Score: 0.2878177794271231, PV Score: -7.319097172570247, Alpha: asymmetric, Beta: 0.31\n",
1099
+ "#Topics: 12, CV Score: 0.309148501118687, PV Score: -7.299840665764544, Alpha: asymmetric, Beta: 0.31\n",
1100
+ "#Topics: 4, CV Score: 0.2643914598991108, PV Score: -7.439997400522905, Alpha: asymmetric, Beta: 0.61\n",
1101
+ "#Topics: 5, CV Score: 0.30680350294176834, PV Score: -7.406369360079358, Alpha: asymmetric, Beta: 0.61\n",
1102
+ "#Topics: 6, CV Score: 0.30273133375875494, PV Score: -7.409977374651039, Alpha: asymmetric, Beta: 0.61\n",
1103
+ "#Topics: 7, CV Score: 0.2895606942065561, PV Score: -7.404088495718267, Alpha: asymmetric, Beta: 0.61\n",
1104
+ "#Topics: 8, CV Score: 0.30220942123026345, PV Score: -7.41654700444938, Alpha: asymmetric, Beta: 0.61\n",
1105
+ "#Topics: 9, CV Score: 0.30085511300217477, PV Score: -7.3893922736333995, Alpha: asymmetric, Beta: 0.61\n",
1106
+ "#Topics: 10, CV Score: 0.32576991286555973, PV Score: -7.400148584021936, Alpha: asymmetric, Beta: 0.61\n",
1107
+ "#Topics: 11, CV Score: 0.30570350016100467, PV Score: -7.3930680136359115, Alpha: asymmetric, Beta: 0.61\n",
1108
+ "#Topics: 12, CV Score: 0.27592390657639637, PV Score: -7.390687284583513, Alpha: asymmetric, Beta: 0.61\n",
1109
+ "#Topics: 4, CV Score: 0.26415862588678707, PV Score: -7.4660902988837075, Alpha: asymmetric, Beta: 0.9099999999999999\n",
1110
+ "#Topics: 5, CV Score: 0.3031196807486601, PV Score: -7.459922366408937, Alpha: asymmetric, Beta: 0.9099999999999999\n",
1111
+ "#Topics: 6, CV Score: 0.28267000810499315, PV Score: -7.446406611219757, Alpha: asymmetric, Beta: 0.9099999999999999\n",
1112
+ "#Topics: 7, CV Score: 0.30409252212111715, PV Score: -7.458109944307844, Alpha: asymmetric, Beta: 0.9099999999999999\n",
1113
+ "#Topics: 8, CV Score: 0.3186545711929555, PV Score: -7.460394137450762, Alpha: asymmetric, Beta: 0.9099999999999999\n"
1114
+ ]
1115
+ },
1116
+ {
1117
+ "ename": "KeyboardInterrupt",
1118
+ "evalue": "",
1119
+ "output_type": "error",
1120
+ "traceback": [
1121
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1122
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
1123
+ "File \u001b[0;32m<timed exec>:10\u001b[0m\n",
1124
+ "Cell \u001b[0;32mIn[19], line 4\u001b[0m, in \u001b[0;36mcompute_coherence_values\u001b[0;34m(corpus, dictionary, k, a, b)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_coherence_values\u001b[39m(corpus, dictionary, k, a, b):\n\u001b[0;32m----> 4\u001b[0m lda_model \u001b[38;5;241m=\u001b[39m \u001b[43mgensim\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodels\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mLdaMulticore\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcorpus\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcorpus\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mid2word\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdictionary\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_topics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m42\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m100\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mpasses\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43malpha\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43ma\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43meta\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 13\u001b[0m coherence_model_lda \u001b[38;5;241m=\u001b[39m CoherenceModel(model\u001b[38;5;241m=\u001b[39mlda_model, texts\u001b[38;5;241m=\u001b[39mheadline, dictionary\u001b[38;5;241m=\u001b[39mdoc_dict, 
coherence\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc_v\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 14\u001b[0m coherence \u001b[38;5;241m=\u001b[39m coherence_model_lda\u001b[38;5;241m.\u001b[39mget_coherence()\n",
1125
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/ldamulticore.py:186\u001b[0m, in \u001b[0;36mLdaMulticore.__init__\u001b[0;34m(self, corpus, num_topics, id2word, workers, chunksize, passes, batch, alpha, eta, decay, offset, eval_every, iterations, gamma_threshold, random_state, minimum_probability, minimum_phi_value, per_word_topics, dtype)\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(alpha, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m alpha \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mauto\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mauto-tuning alpha not implemented in LdaMulticore; use plain LdaModel.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 186\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mLdaMulticore\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 187\u001b[0m \u001b[43m \u001b[49m\u001b[43mcorpus\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcorpus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_topics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_topics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 188\u001b[0m \u001b[43m \u001b[49m\u001b[43mid2word\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mid2word\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunksize\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpasses\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpasses\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43malpha\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43malpha\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meta\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 189\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecay\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moffset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meval_every\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meval_every\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43miterations\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43miterations\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 190\u001b[0m \u001b[43m \u001b[49m\u001b[43mgamma_threshold\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgamma_threshold\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrandom_state\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mminimum_probability\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimum_probability\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 191\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimum_phi_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimum_phi_value\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mper_word_topics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mper_word_topics\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 192\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
1126
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/ldamodel.py:521\u001b[0m, in \u001b[0;36mLdaModel.__init__\u001b[0;34m(self, corpus, num_topics, id2word, distributed, chunksize, passes, update_every, alpha, eta, decay, offset, eval_every, iterations, gamma_threshold, minimum_probability, random_state, ns_conf, minimum_phi_value, per_word_topics, callbacks, dtype)\u001b[0m\n\u001b[1;32m 519\u001b[0m use_numpy \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdispatcher \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 520\u001b[0m start \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 521\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mupdate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcorpus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunks_as_numpy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_numpy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 522\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39madd_lifecycle_event(\n\u001b[1;32m 523\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcreated\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 524\u001b[0m msg\u001b[38;5;241m=\u001b[39m\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrained \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtime\u001b[38;5;241m.\u001b[39mtime()\u001b[38;5;250m \u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;250m \u001b[39mstart\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m.2f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 525\u001b[0m )\n",
1127
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/ldamulticore.py:316\u001b[0m, in \u001b[0;36mLdaMulticore.update\u001b[0;34m(self, corpus, chunks_as_numpy)\u001b[0m\n\u001b[1;32m 312\u001b[0m \u001b[38;5;66;03m# endfor single corpus pass\u001b[39;00m\n\u001b[1;32m 313\u001b[0m \n\u001b[1;32m 314\u001b[0m \u001b[38;5;66;03m# wait for all outstanding jobs to finish\u001b[39;00m\n\u001b[1;32m 315\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m queue_size[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m--> 316\u001b[0m \u001b[43mprocess_result_queue\u001b[49m\u001b[43m(\u001b[49m\u001b[43mforce\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reallen \u001b[38;5;241m!=\u001b[39m lencorpus:\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput corpus size changed during training (don\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt use generators as input)\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
1128
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/ldamulticore.py:274\u001b[0m, in \u001b[0;36mLdaMulticore.update.<locals>.process_result_queue\u001b[0;34m(force)\u001b[0m\n\u001b[1;32m 268\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 269\u001b[0m \u001b[38;5;124;03mClear the result queue, merging all intermediate results, and update the\u001b[39;00m\n\u001b[1;32m 270\u001b[0m \u001b[38;5;124;03mLDA model if necessary.\u001b[39;00m\n\u001b[1;32m 271\u001b[0m \n\u001b[1;32m 272\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 273\u001b[0m merged_new \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m--> 274\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mresult_queue\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mempty\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 275\u001b[0m other\u001b[38;5;241m.\u001b[39mmerge(result_queue\u001b[38;5;241m.\u001b[39mget())\n\u001b[1;32m 276\u001b[0m queue_size[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
1129
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/queues.py:129\u001b[0m, in \u001b[0;36mQueue.empty\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 128\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mempty\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 129\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_poll\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
1130
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/connection.py:257\u001b[0m, in \u001b[0;36m_ConnectionBase.poll\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_closed()\n\u001b[1;32m 256\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_readable()\n\u001b[0;32m--> 257\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_poll\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n",
1131
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/connection.py:440\u001b[0m, in \u001b[0;36mConnection._poll\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_poll\u001b[39m(\u001b[38;5;28mself\u001b[39m, timeout):\n\u001b[0;32m--> 440\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 441\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mbool\u001b[39m(r)\n",
1132
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/connection.py:1130\u001b[0m, in \u001b[0;36mwait\u001b[0;34m(object_list, timeout)\u001b[0m\n\u001b[1;32m 1128\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _WaitSelector() \u001b[38;5;28;01mas\u001b[39;00m selector:\n\u001b[1;32m 1129\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m object_list:\n\u001b[0;32m-> 1130\u001b[0m \u001b[43mselector\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mregister\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselectors\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mEVENT_READ\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1132\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1133\u001b[0m deadline \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mmonotonic() \u001b[38;5;241m+\u001b[39m timeout\n",
1133
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/selectors.py:352\u001b[0m, in \u001b[0;36m_PollLikeSelector.register\u001b[0;34m(self, fileobj, events, data)\u001b[0m\n\u001b[1;32m 351\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mregister\u001b[39m(\u001b[38;5;28mself\u001b[39m, fileobj, events, data\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[0;32m--> 352\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mregister\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfileobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 353\u001b[0m poller_events \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 354\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m events \u001b[38;5;241m&\u001b[39m EVENT_READ:\n",
1134
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/selectors.py:238\u001b[0m, in \u001b[0;36m_BaseSelectorImpl.register\u001b[0;34m(self, fileobj, events, data)\u001b[0m\n\u001b[1;32m 235\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\u001b[38;5;129;01mnot\u001b[39;00m events) \u001b[38;5;129;01mor\u001b[39;00m (events \u001b[38;5;241m&\u001b[39m \u001b[38;5;241m~\u001b[39m(EVENT_READ \u001b[38;5;241m|\u001b[39m EVENT_WRITE)):\n\u001b[1;32m 236\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid events: \u001b[39m\u001b[38;5;132;01m{!r}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(events))\n\u001b[0;32m--> 238\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[43mSelectorKey\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfileobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fileobj_lookup\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfileobj\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 240\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key\u001b[38;5;241m.\u001b[39mfd \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fd_to_key:\n\u001b[1;32m 241\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{!r}\u001b[39;00m\u001b[38;5;124m (FD \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m) is already registered\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;241m.\u001b[39mformat(fileobj, key\u001b[38;5;241m.\u001b[39mfd))\n",
1135
+ "File \u001b[0;32m<string>:1\u001b[0m, in \u001b[0;36m<lambda>\u001b[0;34m(_cls, fileobj, fd, events, data)\u001b[0m\n",
1136
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
1137
+ ]
1138
+ }
1139
+ ],
1140
+ "source": [
1141
+ "%%time\n",
1142
+ "\n",
1143
+ "import datetime\n",
1144
+ "import numpy as np\n",
1145
+ "from gensim.models import CoherenceModel\n",
1146
+ "\n",
1147
+ "print(datetime.datetime.now())\n",
1148
+ "\n",
1149
+ "for a in alpha:\n",
1150
+ " for b in beta:\n",
1151
+ " for num in num_topics:\n",
1152
+ " cv, pv = compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=num, a=a, b=b) \n",
1153
+ "\n",
1154
+ " model_topics.append(num) \n",
1155
+ " coherence_values.append(cv) \n",
1156
+ " perplexity_values.append(pv)\n",
1157
+ " alpha_result.append(a)\n",
1158
+ " beta_result.append(b)\n",
1159
+ " print(\"#Topics: \" + str(num) + \", CV Score: \" + str(coherence_values[-1]) + \", PV Score: \" + str(perplexity_values[-1]) + \", Alpha: \" + str(alpha_result[-1]) + \", Beta: \" + str(beta_result[-1]))\n",
1160
+ " \n",
1161
+ "print(datetime.datetime.now())"
1162
+ ]
1163
+ },
1164
+ {
1165
+ "cell_type": "markdown",
1166
+ "id": "364ff6d5-e3da-4dde-a2c8-5375fc5d711f",
1167
+ "metadata": {},
1168
+ "source": [
1169
+ "The table below shows the top 20 fine-tuned models with the best combinations of coherence score and perplexity score. It is sorted by coherence score in descending order, since a higher coherence score indicates a better model, and then by perplexity score in ascending order, since a lower perplexity score indicates a better model. While the coherence score evaluates the quality of the topics, the perplexity score evaluates the overall performance of the model in predicting new documents. Usually, the coherence score is the better metric to use if the goal is to obtain topics that are semantically coherent and interpretable. The perplexity score, on the other hand, is the better metric to use if the goal is to build a model that generalises well to new data, in other words, how confident the model is in predicting new data (Sánchez-Aguayo et al., 2022). Ultimately, we aim to strike a balance between the perplexity score and the coherence score when determining our final model."
1170
+ ]
1171
+ },
1172
+ {
1173
+ "cell_type": "code",
1174
+ "execution_count": null,
1175
+ "id": "78a60032-a4d7-44d4-841c-a1bd3740d5dd",
1176
+ "metadata": {},
1177
+ "outputs": [],
1178
+ "source": [
1179
+ "# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
1180
+ "result = pd.DataFrame(\n",
1181
+ " {'Topics': model_topics,\n",
1182
+ " 'Coherence Score': coherence_values,\n",
1183
+ " 'Perplexity Score': perplexity_values,\n",
1184
+ " 'Alpha': alpha_result,\n",
1185
+ " 'Beta': beta_result\n",
1186
+ " })\n",
1187
+ "result.sort_values(by=['Coherence Score', 'Perplexity Score'], ascending=[False, True]).head(20)"
1188
+ ]
1189
+ },
1190
+ {
1191
+ "cell_type": "code",
1192
+ "execution_count": null,
1193
+ "id": "3461df57-c069-4ad2-80d7-8890dec9438e",
1194
+ "metadata": {},
1195
+ "outputs": [],
1196
+ "source": [
1197
+ "result.to_csv('data/lda_fine_tuning_result_minor.csv')"
1198
+ ]
1199
+ },
1200
+ {
1201
+ "cell_type": "code",
1202
+ "execution_count": null,
1203
+ "id": "800e5a4b-7302-42e8-97b0-5b598c1c80ae",
1204
+ "metadata": {
1205
+ "scrolled": true
1206
+ },
1207
+ "outputs": [],
1208
+ "source": [
1209
+ "# Show graph Topics vs Coherence Score\n",
1210
+ "result.groupby('Alpha').plot(x='Topics', y='Coherence Score', legend = True)"
1211
+ ]
1212
+ },
1213
+ {
1214
+ "cell_type": "code",
1215
+ "execution_count": null,
1216
+ "id": "26996b89-0e7a-4f2d-8cf7-c4a716569bc2",
1217
+ "metadata": {},
1218
+ "outputs": [],
1219
+ "source": [
1220
+ "# Show graph Topics vs Coherence Score\n",
1221
+ "\n",
1222
+ "plt.plot(model_topics, coherence_values)\n",
1223
+ "plt.xlabel(\"Num Topics\")\n",
1224
+ "plt.ylabel(\"Coherence Score\")\n",
1225
+ "plt.legend((\"Coherence Score\"), loc='best')\n",
1226
+ "plt.show()"
1227
+ ]
1228
+ },
1229
+ {
1230
+ "cell_type": "code",
1231
+ "execution_count": null,
1232
+ "id": "91d2f4c1-de77-44b6-b41b-fcc9a07233e8",
1233
+ "metadata": {},
1234
+ "outputs": [],
1235
+ "source": [
1236
+ "# Show graph Topics vs Perplexity Score\n",
1237
+ "\n",
1238
+ "plt.plot(model_topics, perplexity_values)\n",
1239
+ "plt.xlabel(\"Num Topics\")\n",
1240
+ "plt.ylabel(\"Perplexity score\")\n",
1241
+ "plt.legend((\"perplexity_values\"), loc='best')\n",
1242
+ "plt.show()"
1243
+ ]
1244
+ },
1245
+ {
1246
+ "cell_type": "markdown",
1247
+ "id": "cdc3ddd2-f743-4e5b-b6c6-2656e0b77aec",
1248
+ "metadata": {},
1249
+ "source": [
1250
+ "## Final Model"
1251
+ ]
1252
+ },
1253
+ {
1254
+ "cell_type": "markdown",
1255
+ "id": "e86c2bfe-264b-4530-9d81-10b1cdc5071c",
1256
+ "metadata": {},
1257
+ "source": [
1258
+ "Refer to the script topic_modelling_severe for a detailed explanation."
1259
+ ]
1260
+ },
1261
+ {
1262
+ "cell_type": "code",
1263
+ "execution_count": null,
1264
+ "id": "490734ed-077c-4fb0-930c-0b42f4f63c94",
1265
+ "metadata": {},
1266
+ "outputs": [],
1267
+ "source": [
1268
+ "# realised that there may be some overlaps for 8 topics, thus 4-6 topics are optimal\n",
1269
+ "k = 2\n",
1270
+ "# a = 'asymmetric'\n",
1271
+ "a = 0.31\n",
1272
+ "# b = 0.31\n",
1273
+ "b = 'symmetric'\n",
1274
+ "\n",
1275
+ "\n",
1276
+ "final_model = gensim.models.LdaMulticore(corpus=docs_vecs,\n",
1277
+ " id2word=doc_dict,\n",
1278
+ " num_topics=k, \n",
1279
+ " random_state=42,\n",
1280
+ " chunksize=100,\n",
1281
+ " passes=10,\n",
1282
+ " alpha=a,\n",
1283
+ " eta=b)"
1284
+ ]
1285
+ },
1286
+ {
1287
+ "cell_type": "code",
1288
+ "execution_count": null,
1289
+ "id": "afe8abf0-2d12-414e-92be-a655865addb1",
1290
+ "metadata": {
1291
+ "tags": []
1292
+ },
1293
+ "outputs": [],
1294
+ "source": [
1295
+ "compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=k, a=a, b=b) "
1296
+ ]
1297
+ },
1298
+ {
1299
+ "cell_type": "code",
1300
+ "execution_count": null,
1301
+ "id": "8430a827-6dbb-4737-8ccc-78ed17a01234",
1302
+ "metadata": {
1303
+ "tags": []
1304
+ },
1305
+ "outputs": [],
1306
+ "source": [
1307
+ "# Set up the environment to display the graphical outputs\n",
1308
+ "# feed the LDA model into the pyLDAvis instance\n",
1309
+ "pyLDAvis.enable_notebook()\n",
1310
+ "visual= gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
1311
+ "\n",
1312
+ "# Save the output to the html file\n",
1313
+ "pyLDAvis.save_html(visual, \"data/topic_viz2_minor_training.html\")"
1314
+ ]
1315
+ },
1316
+ {
1317
+ "cell_type": "code",
1318
+ "execution_count": null,
1319
+ "id": "5e30d71a-a3c7-40c7-94c0-7eea1bedc887",
1320
+ "metadata": {
1321
+ "tags": []
1322
+ },
1323
+ "outputs": [],
1324
+ "source": [
1325
+ "final_model.print_topics(num_words=30)"
1326
+ ]
1327
+ },
1328
+ {
1329
+ "cell_type": "code",
1330
+ "execution_count": null,
1331
+ "id": "b958c148-5dbe-4896-bcba-85e0f78b2941",
1332
+ "metadata": {},
1333
+ "outputs": [],
1334
+ "source": [
1335
+ "break"
1336
+ ]
1337
+ },
1338
+ {
1339
+ "cell_type": "code",
1340
+ "execution_count": null,
1341
+ "id": "1939c00d-12e4-4670-9f18-f287f8b86bef",
1342
+ "metadata": {},
1343
+ "outputs": [],
1344
+ "source": [
1345
+ "# Map the topic ID with appropriate topic names; this part should be updated accordingly whenever the model is updated\n",
1346
+ "topic_mapping = {0: \"finance\", 1: \"tech\", 2: \"education\", 3: \"sports\", 4: \"leisure\"}"
1347
+ ]
1348
+ },
1349
+ {
1350
+ "cell_type": "code",
1351
+ "execution_count": null,
1352
+ "id": "1a9a921e-3cdd-4fe5-a58f-7b50e5feeecd",
1353
+ "metadata": {},
1354
+ "outputs": [],
1355
+ "source": [
1356
+ "# Get the topics and their top keywords into a dataframe\n",
1357
+ "topics = final_model.show_topics(num_words=30) \n",
1358
+ "\n",
1359
+ "topic_keywords = pd.DataFrame()\n",
1360
+ "for topic_id, topic in topics:\n",
1361
+ " topic_keywords.at[topic_id, 'Topic Keywords'] = topic\n",
1362
+ "\n",
1363
+ "topic_keywords['Topic ID'] = topic_keywords.index\n",
1364
+ "topic_keywords['Topic Name'] = topic_mapping \n",
1365
+ "topic_keywords"
1366
+ ]
1367
+ },
1368
+ {
1369
+ "cell_type": "markdown",
1370
+ "id": "607d2cfd-b3ca-4f99-9e01-d320ca98a2a0",
1371
+ "metadata": {},
1372
+ "source": [
1373
+ "# Save the final model "
1374
+ ]
1375
+ },
1376
+ {
1377
+ "cell_type": "code",
1378
+ "execution_count": null,
1379
+ "id": "84eb2746-173a-4283-bca5-681f77548698",
1380
+ "metadata": {},
1381
+ "outputs": [],
1382
+ "source": [
1383
+ "#Save a model to disk, or reload a pre-trained model\n",
1384
+ "# naming convention: final_model_topic_alpha_eta\n",
1385
+ "final_model.save(\"models/final_model_5_asym_91\")"
1386
+ ]
1387
+ },
1388
+ {
1389
+ "cell_type": "markdown",
1390
+ "id": "a7b6e4d9-a577-4dfb-ba6e-fc74365880f4",
1391
+ "metadata": {},
1392
+ "source": [
1393
+ "# Find dominant topic(s) for each news article"
1394
+ ]
1395
+ },
1396
+ {
1397
+ "cell_type": "markdown",
1398
+ "id": "0eeecbcb-358c-44f9-8463-75cdfac0ba90",
1399
+ "metadata": {},
1400
+ "source": [
1401
+ "Attach the dominant topics back to the news dataset for classification purposes."
1402
+ ]
1403
+ },
1404
+ {
1405
+ "cell_type": "markdown",
1406
+ "id": "8bebb269-dbb0-4c46-925c-38de0f2bcfd7",
1407
+ "metadata": {},
1408
+ "source": [
1409
+ "This makes use of gensim's own LDA functions: https://radimrehurek.com/gensim/models/ldamodel.html"
1410
+ ]
1411
+ },
1412
+ {
1413
+ "cell_type": "code",
1414
+ "execution_count": null,
1415
+ "id": "f585ff52-b60d-4d70-ae64-a7c23d2cc6c1",
1416
+ "metadata": {},
1417
+ "outputs": [],
1418
+ "source": [
1419
+ "import warnings\n",
1420
+ "warnings.filterwarnings('ignore')\n",
1421
+ "\n",
1422
+ "def format_topics_sentences(ldamodel, corpus, data):\n",
1423
+ " # Preallocate memory for the DataFrame\n",
1424
+ " num_docs = len(corpus)\n",
1425
+ " sent_topics = {'Dominant_Topic': [0] * num_docs, 'Perc_Contribution': [0.0] * num_docs, 'Topic_Distribution': [()] * num_docs}\n",
1426
+ " \n",
1427
+ " # Get main topic in each document\n",
1428
+ " for i, row in enumerate(ldamodel[corpus]):\n",
1429
+ " row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
1430
+ " if row:\n",
1431
+ " # Get the Dominant topic, Perc Contribution and Keywords for each document\n",
1432
+ " dominant_topic, perc_contribution = row[0]\n",
1433
+ " topic_distribution = row\n",
1434
+ " sent_topics['Dominant_Topic'][i] = int(dominant_topic)\n",
1435
+ " sent_topics['Perc_Contribution'][i] = round(perc_contribution, 4)\n",
1436
+ " sent_topics['Topic_Distribution'][i] = topic_distribution\n",
1437
+ "\n",
1438
+ " # Create the DataFrame\n",
1439
+ " sent_topics_df = pd.DataFrame(sent_topics)\n",
1440
+ " sent_topics_df['Text'] = data\n",
1441
+ "\n",
1442
+ " return sent_topics_df"
1443
+ ]
1444
+ },
1445
+ {
1446
+ "cell_type": "code",
1447
+ "execution_count": null,
1448
+ "id": "24d3ff60-035e-4133-9ffd-88cce5cdccb1",
1449
+ "metadata": {},
1450
+ "outputs": [],
1451
+ "source": [
1452
+ "df_topic_sents_keywords = format_topics_sentences(ldamodel=final_model, corpus=docs_vecs, data=cleaned.Headline_Details)"
1453
+ ]
1454
+ },
1455
+ {
1456
+ "cell_type": "code",
1457
+ "execution_count": null,
1458
+ "id": "c88b088b",
1459
+ "metadata": {},
1460
+ "outputs": [],
1461
+ "source": [
1462
+ "# Format\n",
1463
+ "df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
1464
+ "df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Topic_Distribution', 'Text']\n",
1465
+ "\n",
1466
+ "# Show\n",
1467
+ "df_dominant_topic.head(10)"
1468
+ ]
1469
+ },
1470
+ {
1471
+ "cell_type": "markdown",
1472
+ "id": "560da382-aa86-4df1-8b85-56b057a27cd4",
1473
+ "metadata": {},
1474
+ "source": [
1475
+ "# Result Analysis"
1476
+ ]
1477
+ },
1478
+ {
1479
+ "cell_type": "code",
1480
+ "execution_count": null,
1481
+ "id": "4fe6b40b-6922-4de3-8d9e-dac7474b6303",
1482
+ "metadata": {},
1483
+ "outputs": [],
1484
+ "source": [
1485
+ "df_dominant_topic[\"Dominant_Topic\"].value_counts()"
1486
+ ]
1487
+ },
1488
+ {
1489
+ "cell_type": "code",
1490
+ "execution_count": null,
1491
+ "id": "b9917340-31cf-48af-871f-b481128fdf22",
1492
+ "metadata": {},
1493
+ "outputs": [],
1494
+ "source": [
1495
+ "import matplotlib.pyplot as plt\n",
1496
+ "\n",
1497
+ "# Get value counts of each topic\n",
1498
+ "topic_counts = df_dominant_topic[\"Dominant_Topic\"].value_counts()\n",
1499
+ "\n",
1500
+ "# Create a bar plot\n",
1501
+ "plt.figure(figsize=(8, 6))\n",
1502
+ "topic_counts.plot(kind=\"bar\", color=\"skyblue\")\n",
1503
+ "\n",
1504
+ "# Add labels to the bars\n",
1505
+ "for i, count in enumerate(topic_counts):\n",
1506
+ " plt.text(i, count, str(count), ha=\"center\", va=\"bottom\")\n",
1507
+ "\n",
1508
+ "# Add labels and title\n",
1509
+ "plt.xlabel(\"Topics\")\n",
1510
+ "plt.ylabel(\"Number of News\")\n",
1511
+ "plt.title(\"Topic Distribution\")\n",
1512
+ "\n",
1513
+ "# Show the plot\n",
1514
+ "plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
1515
+ "plt.tight_layout()\n",
1516
+ "plt.show()\n"
1517
+ ]
1518
+ },
1519
+ {
1520
+ "cell_type": "code",
1521
+ "execution_count": null,
1522
+ "id": "fffa1e57-f975-4469-a42b-19d76c60fb66",
1523
+ "metadata": {},
1524
+ "outputs": [],
1525
+ "source": [
1526
+ "df_dominant_topic.sort_values(by='Topic_Perc_Contrib', ascending=True).head(20)"
1527
+ ]
1528
+ },
1529
+ {
1530
+ "cell_type": "code",
1531
+ "execution_count": null,
1532
+ "id": "8510f506-141f-4382-b668-251df1afc95f",
1533
+ "metadata": {},
1534
+ "outputs": [],
1535
+ "source": [
1536
+ "# Sample 100 rows, can change the random_state for different samples\n",
1537
+ "sampled_data = df_dominant_topic.sample(n=100, random_state=42) \n",
1538
+ "sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
1539
+ "sampled_df.to_csv('data/sample_minor.csv')"
1540
+ ]
1541
+ }
1542
+ ],
1543
+ "metadata": {
1544
+ "kernelspec": {
1545
+ "display_name": "Python 3",
1546
+ "language": "python",
1547
+ "name": "python3"
1548
+ },
1549
+ "language_info": {
1550
+ "codemirror_mode": {
1551
+ "name": "ipython",
1552
+ "version": 3
1553
+ },
1554
+ "file_extension": ".py",
1555
+ "mimetype": "text/x-python",
1556
+ "name": "python",
1557
+ "nbconvert_exporter": "python",
1558
+ "pygments_lexer": "ipython3",
1559
+ "version": "3.12.4"
1560
+ }
1561
+ },
1562
+ "nbformat": 4,
1563
+ "nbformat_minor": 5
1564
+ }
notebooks/08_topic_modelling_moderate.ipynb ADDED
@@ -0,0 +1,1760 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "46322fb5-5918-4b70-9689-9e0781439ac4",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents\n",
14
+ "loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env\n"
15
+ ]
16
+ },
17
+ {
18
+ "data": {
19
+ "text/plain": [
20
+ "True"
21
+ ]
22
+ },
23
+ "execution_count": 1,
24
+ "metadata": {},
25
+ "output_type": "execute_result"
26
+ }
27
+ ],
28
+ "source": [
29
+ "%load_ext autoreload\n",
30
+ "%autoreload 2\n",
31
+ "\n",
32
+ "import os\n",
33
+ "import sys\n",
34
+ "from pathlib import Path\n",
35
+ "\n",
36
+ "workding_dir = str(Path.cwd().parent)\n",
37
+ "os.chdir(workding_dir)\n",
38
+ "sys.path.append(workding_dir)\n",
39
+ "print(\"workding dir:\", workding_dir)\n",
40
+ "\n",
41
+ "from dotenv import find_dotenv, load_dotenv\n",
42
+ "\n",
43
+ "found_dotenv = find_dotenv(\".env\")\n",
44
+ "\n",
45
+ "if len(found_dotenv) == 0:\n",
46
+ " found_dotenv = find_dotenv(\".env.example\")\n",
47
+ "print(f\"loading env vars from: {found_dotenv}\")\n",
48
+ "load_dotenv(found_dotenv, override=True)"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 2,
54
+ "id": "daf1e3d1-75ac-4299-8bed-2f413a49f9a6",
55
+ "metadata": {
56
+ "tags": []
57
+ },
58
+ "outputs": [],
59
+ "source": [
60
+ "import nltk\n",
61
+ "from nltk.tokenize import sent_tokenize\n",
62
+ "from nltk.tokenize import word_tokenize\n",
63
+ "\n",
64
+ "import gensim\n",
65
+ "from gensim import corpora\n",
66
+ "from gensim import similarities\n",
67
+ "from gensim import models\n",
68
+ "from gensim.models import CoherenceModel\n",
69
+ "\n",
70
+ "# from wordcloud import WordCloud, ImageColorGenerator\n",
71
+ "import matplotlib.pyplot as plt\n",
72
+ "import seaborn as sns\n",
73
+ "import pandas as pd\n",
74
+ "import re\n",
75
+ "import os\n",
76
+ "import datetime\n",
77
+ "\n",
78
+ "import warnings\n",
79
+ "warnings.filterwarnings('ignore')\n",
80
+ "\n",
81
+ "from pprint import pprint\n",
82
+ "import pyLDAvis\n",
83
+ "import pyLDAvis.gensim_models as gensimvis"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "markdown",
88
+ "id": "49e6de6b-71bd-4948-8827-52601406058f",
89
+ "metadata": {},
90
+ "source": [
91
+ "# Import Data"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": 4,
97
+ "id": "49222182-7811-4fa6-8c0a-21d3a546863e",
98
+ "metadata": {},
99
+ "outputs": [],
100
+ "source": [
101
+ "df = pd.read_parquet('data/processed_data2.parquet')"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 5,
107
+ "id": "3fb59a30",
108
+ "metadata": {},
109
+ "outputs": [
110
+ {
111
+ "data": {
112
+ "text/html": [
113
+ "<div>\n",
114
+ "<style scoped>\n",
115
+ " .dataframe tbody tr th:only-of-type {\n",
116
+ " vertical-align: middle;\n",
117
+ " }\n",
118
+ "\n",
119
+ " .dataframe tbody tr th {\n",
120
+ " vertical-align: top;\n",
121
+ " }\n",
122
+ "\n",
123
+ " .dataframe thead th {\n",
124
+ " text-align: right;\n",
125
+ " }\n",
126
+ "</style>\n",
127
+ "<table border=\"1\" class=\"dataframe\">\n",
128
+ " <thead>\n",
129
+ " <tr style=\"text-align: right;\">\n",
130
+ " <th></th>\n",
131
+ " <th>id</th>\n",
132
+ " <th>Headline</th>\n",
133
+ " <th>Details</th>\n",
134
+ " <th>Severity</th>\n",
135
+ " <th>Category</th>\n",
136
+ " <th>Region</th>\n",
137
+ " <th>Datetime</th>\n",
138
+ " <th>Year</th>\n",
139
+ " <th>lat</th>\n",
140
+ " <th>lon</th>\n",
141
+ " <th>...</th>\n",
142
+ " <th>if_labeled</th>\n",
143
+ " <th>Month</th>\n",
144
+ " <th>Week</th>\n",
145
+ " <th>Headline_Details</th>\n",
146
+ " <th>url</th>\n",
147
+ " <th>title</th>\n",
148
+ " <th>content</th>\n",
149
+ " <th>cleaned_content</th>\n",
150
+ " <th>binary_content</th>\n",
151
+ " <th>word_count</th>\n",
152
+ " </tr>\n",
153
+ " </thead>\n",
154
+ " <tbody>\n",
155
+ " <tr>\n",
156
+ " <th>0</th>\n",
157
+ " <td>1</td>\n",
158
+ " <td>Grasberg Mine- Grasberg mine workers extend st...</td>\n",
159
+ " <td>Media sources indicate that workers at the Gra...</td>\n",
160
+ " <td>Moderate</td>\n",
161
+ " <td>Mine Workers Strike</td>\n",
162
+ " <td>Indonesia</td>\n",
163
+ " <td>28/5/17 17:08</td>\n",
164
+ " <td>2017.0</td>\n",
165
+ " <td>-4.05608</td>\n",
166
+ " <td>137.11302</td>\n",
167
+ " <td>...</td>\n",
168
+ " <td>False</td>\n",
169
+ " <td>5.0</td>\n",
170
+ " <td>21.0</td>\n",
171
+ " <td>Grasberg Mine- Grasberg mine workers extend st...</td>\n",
172
+ " <td>https://news.google.com/rss/articles/CBMiZ2h0d...</td>\n",
173
+ " <td>Freeport Indonesia mine workers extend strike ...</td>\n",
174
+ " <td>Trucks are seen on a road in the Grasberg copp...</td>\n",
175
+ " <td>[truck, be, see, on, road, in, grasberg, coppe...</td>\n",
176
+ " <td>[adkerson_jakarta_try, agreement_freeport_indo...</td>\n",
177
+ " <td>53</td>\n",
178
+ " </tr>\n",
179
+ " <tr>\n",
180
+ " <th>1</th>\n",
181
+ " <td>3</td>\n",
182
+ " <td>Shanghai port congestion impacts terminals in ...</td>\n",
183
+ " <td>The persisting port congestion at Shanghai’s Y...</td>\n",
184
+ " <td>Minor</td>\n",
185
+ " <td>Port Congestion</td>\n",
186
+ " <td>China</td>\n",
187
+ " <td>27/4/17 9:16</td>\n",
188
+ " <td>2017.0</td>\n",
189
+ " <td>29.52000</td>\n",
190
+ " <td>121.33190</td>\n",
191
+ " <td>...</td>\n",
192
+ " <td>False</td>\n",
193
+ " <td>4.0</td>\n",
194
+ " <td>17.0</td>\n",
195
+ " <td>Shanghai port congestion impacts terminals in ...</td>\n",
196
+ " <td>https://news.google.com/rss/articles/CBMiVWh0d...</td>\n",
197
+ " <td>Typhoon Muifa to shut China ports for second t...</td>\n",
198
+ " <td>By Sam Whelan 13/09/2022\\n\\nAnother typhoon ha...</td>\n",
199
+ " <td>[by, sam, whelan, typhoon, have, prompt, port,...</td>\n",
200
+ " <td>[additional_ripple_effect, avoid_path_typhoon,...</td>\n",
201
+ " <td>44</td>\n",
202
+ " </tr>\n",
203
+ " <tr>\n",
204
+ " <th>2</th>\n",
205
+ " <td>5</td>\n",
206
+ " <td>UPDATE - Indonesia: Police confirm two explosi...</td>\n",
207
+ " <td>According to local police in Jakarta, two expl...</td>\n",
208
+ " <td>Extreme</td>\n",
209
+ " <td>Bombing, Police Operations</td>\n",
210
+ " <td>Indonesia</td>\n",
211
+ " <td>24/5/17 16:20</td>\n",
212
+ " <td>2017.0</td>\n",
213
+ " <td>NaN</td>\n",
214
+ " <td>NaN</td>\n",
215
+ " <td>...</td>\n",
216
+ " <td>True</td>\n",
217
+ " <td>5.0</td>\n",
218
+ " <td>21.0</td>\n",
219
+ " <td>UPDATE - Indonesia: Police confirm two explosi...</td>\n",
220
+ " <td>https://news.google.com/rss/articles/CBMiZWh0d...</td>\n",
221
+ " <td>Jakarta Police Receive 2 More Reports on Coldp...</td>\n",
222
+ " <td>TEMPO.CO, Jakarta - South Jakarta Metro Police...</td>\n",
223
+ " <td>[jakarta, south, jakarta, metro, police, recei...</td>\n",
224
+ " <td>[actress_accord, available_day_concert, click_...</td>\n",
225
+ " <td>24</td>\n",
226
+ " </tr>\n",
227
+ " <tr>\n",
228
+ " <th>3</th>\n",
229
+ " <td>6</td>\n",
230
+ " <td>UPDATE - Indonesia: Severe winds damage infras...</td>\n",
231
+ " <td>Severe winds have downed billboards and trees ...</td>\n",
232
+ " <td>Moderate</td>\n",
233
+ " <td>Roadway Closure / Disruption, Flooding, Severe...</td>\n",
234
+ " <td>Indonesia</td>\n",
235
+ " <td>19/4/17 9:10</td>\n",
236
+ " <td>2017.0</td>\n",
237
+ " <td>-6.91264</td>\n",
238
+ " <td>107.65700</td>\n",
239
+ " <td>...</td>\n",
240
+ " <td>True</td>\n",
241
+ " <td>4.0</td>\n",
242
+ " <td>16.0</td>\n",
243
+ " <td>UPDATE - Indonesia: Severe winds damage infras...</td>\n",
244
+ " <td>https://news.google.com/rss/articles/CBMiSWh0d...</td>\n",
245
+ " <td>Indonesia hit by some of strongest winds recorded</td>\n",
246
+ " <td>A man stands near damaged houses following a t...</td>\n",
247
+ " <td>[man, stand, near, damage, house, follow, torn...</td>\n",
248
+ " <td>[bbc_indonesia, climatologist_government_resea...</td>\n",
249
+ " <td>28</td>\n",
250
+ " </tr>\n",
251
+ " <tr>\n",
252
+ " <th>4</th>\n",
253
+ " <td>14</td>\n",
254
+ " <td>2 miles E of Chesterfield - A tornado has touc...</td>\n",
255
+ " <td>Government sources are reporting a tornado has...</td>\n",
256
+ " <td>Minor</td>\n",
257
+ " <td>Tornado</td>\n",
258
+ " <td>United States</td>\n",
259
+ " <td>17/9/18 19:55</td>\n",
260
+ " <td>2018.0</td>\n",
261
+ " <td>37.51000</td>\n",
262
+ " <td>-77.61000</td>\n",
263
+ " <td>...</td>\n",
264
+ " <td>True</td>\n",
265
+ " <td>9.0</td>\n",
266
+ " <td>38.0</td>\n",
267
+ " <td>2 miles E of Chesterfield - A tornado has touc...</td>\n",
268
+ " <td>https://news.google.com/rss/articles/CBMigAFod...</td>\n",
269
+ " <td>UPDATE: Number of homes without power down to ...</td>\n",
270
+ " <td>More than 90,000 homes and businesses across t...</td>\n",
271
+ " <td>[more, than, home, business, across, richmond,...</td>\n",
272
+ " <td>[advise_seek_alternate, affect_richmond, alter...</td>\n",
273
+ " <td>134</td>\n",
274
+ " </tr>\n",
275
+ " </tbody>\n",
276
+ "</table>\n",
277
+ "<p>5 rows × 23 columns</p>\n",
278
+ "</div>"
279
+ ],
280
+ "text/plain": [
281
+ " id Headline \\\n",
282
+ "0 1 Grasberg Mine- Grasberg mine workers extend st... \n",
283
+ "1 3 Shanghai port congestion impacts terminals in ... \n",
284
+ "2 5 UPDATE - Indonesia: Police confirm two explosi... \n",
285
+ "3 6 UPDATE - Indonesia: Severe winds damage infras... \n",
286
+ "4 14 2 miles E of Chesterfield - A tornado has touc... \n",
287
+ "\n",
288
+ " Details Severity \\\n",
289
+ "0 Media sources indicate that workers at the Gra... Moderate \n",
290
+ "1 The persisting port congestion at Shanghai’s Y... Minor \n",
291
+ "2 According to local police in Jakarta, two expl... Extreme \n",
292
+ "3 Severe winds have downed billboards and trees ... Moderate \n",
293
+ "4 Government sources are reporting a tornado has... Minor \n",
294
+ "\n",
295
+ " Category Region \\\n",
296
+ "0 Mine Workers Strike Indonesia \n",
297
+ "1 Port Congestion China \n",
298
+ "2 Bombing, Police Operations Indonesia \n",
299
+ "3 Roadway Closure / Disruption, Flooding, Severe... Indonesia \n",
300
+ "4 Tornado United States \n",
301
+ "\n",
302
+ " Datetime Year lat lon ... if_labeled Month Week \\\n",
303
+ "0 28/5/17 17:08 2017.0 -4.05608 137.11302 ... False 5.0 21.0 \n",
304
+ "1 27/4/17 9:16 2017.0 29.52000 121.33190 ... False 4.0 17.0 \n",
305
+ "2 24/5/17 16:20 2017.0 NaN NaN ... True 5.0 21.0 \n",
306
+ "3 19/4/17 9:10 2017.0 -6.91264 107.65700 ... True 4.0 16.0 \n",
307
+ "4 17/9/18 19:55 2018.0 37.51000 -77.61000 ... True 9.0 38.0 \n",
308
+ "\n",
309
+ " Headline_Details \\\n",
310
+ "0 Grasberg Mine- Grasberg mine workers extend st... \n",
311
+ "1 Shanghai port congestion impacts terminals in ... \n",
312
+ "2 UPDATE - Indonesia: Police confirm two explosi... \n",
313
+ "3 UPDATE - Indonesia: Severe winds damage infras... \n",
314
+ "4 2 miles E of Chesterfield - A tornado has touc... \n",
315
+ "\n",
316
+ " url \\\n",
317
+ "0 https://news.google.com/rss/articles/CBMiZ2h0d... \n",
318
+ "1 https://news.google.com/rss/articles/CBMiVWh0d... \n",
319
+ "2 https://news.google.com/rss/articles/CBMiZWh0d... \n",
320
+ "3 https://news.google.com/rss/articles/CBMiSWh0d... \n",
321
+ "4 https://news.google.com/rss/articles/CBMigAFod... \n",
322
+ "\n",
323
+ " title \\\n",
324
+ "0 Freeport Indonesia mine workers extend strike ... \n",
325
+ "1 Typhoon Muifa to shut China ports for second t... \n",
326
+ "2 Jakarta Police Receive 2 More Reports on Coldp... \n",
327
+ "3 Indonesia hit by some of strongest winds recorded \n",
328
+ "4 UPDATE: Number of homes without power down to ... \n",
329
+ "\n",
330
+ " content \\\n",
331
+ "0 Trucks are seen on a road in the Grasberg copp... \n",
332
+ "1 By Sam Whelan 13/09/2022\\n\\nAnother typhoon ha... \n",
333
+ "2 TEMPO.CO, Jakarta - South Jakarta Metro Police... \n",
334
+ "3 A man stands near damaged houses following a t... \n",
335
+ "4 More than 90,000 homes and businesses across t... \n",
336
+ "\n",
337
+ " cleaned_content \\\n",
338
+ "0 [truck, be, see, on, road, in, grasberg, coppe... \n",
339
+ "1 [by, sam, whelan, typhoon, have, prompt, port,... \n",
340
+ "2 [jakarta, south, jakarta, metro, police, recei... \n",
341
+ "3 [man, stand, near, damage, house, follow, torn... \n",
342
+ "4 [more, than, home, business, across, richmond,... \n",
343
+ "\n",
344
+ " binary_content word_count \n",
345
+ "0 [adkerson_jakarta_try, agreement_freeport_indo... 53 \n",
346
+ "1 [additional_ripple_effect, avoid_path_typhoon,... 44 \n",
347
+ "2 [actress_accord, available_day_concert, click_... 24 \n",
348
+ "3 [bbc_indonesia, climatologist_government_resea... 28 \n",
349
+ "4 [advise_seek_alternate, affect_richmond, alter... 134 \n",
350
+ "\n",
351
+ "[5 rows x 23 columns]"
352
+ ]
353
+ },
354
+ "execution_count": 5,
355
+ "metadata": {},
356
+ "output_type": "execute_result"
357
+ }
358
+ ],
359
+ "source": [
360
+ "df.head()"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": 6,
366
+ "id": "09113e88-66cc-414c-a953-da04db83c4ae",
367
+ "metadata": {},
368
+ "outputs": [
369
+ {
370
+ "data": {
371
+ "text/plain": [
372
+ "(3681, 23)"
373
+ ]
374
+ },
375
+ "execution_count": 6,
376
+ "metadata": {},
377
+ "output_type": "execute_result"
378
+ }
379
+ ],
380
+ "source": [
381
+ "df.shape"
382
+ ]
383
+ },
384
+ {
385
+ "cell_type": "markdown",
386
+ "id": "037e74fc-bbcd-43e3-8346-799920cca8d8",
387
+ "metadata": {},
388
+ "source": [
389
+ "# Vectorisation"
390
+ ]
391
+ },
392
+ {
393
+ "cell_type": "markdown",
394
+ "id": "d67cef3a-59fb-4dd8-adc8-2cf288b90728",
395
+ "metadata": {},
396
+ "source": [
397
+ "NLP vectorization refers to the process of converting text data into numerical vectors that machine learning algorithms can understand and process. \n",
398
+ "\n",
399
+ "Bag-of-Words (BoW) is used here that represents text as a collection of unique words along with their frequencies. Each word is assigned an index, and the vector contains the count of each word present in the document."
400
+ ]
401
+ },
402
+ {
403
+ "cell_type": "code",
404
+ "execution_count": 7,
405
+ "id": "c95b7b8a-9767-469d-812d-c9a9d9fee0e9",
406
+ "metadata": {},
407
+ "outputs": [],
408
+ "source": [
409
+ "df_copy = df.copy()"
410
+ ]
411
+ },
412
+ {
413
+ "cell_type": "code",
414
+ "execution_count": 8,
415
+ "id": "dfb2001e-04c1-49dc-b423-a64ea47af5a9",
416
+ "metadata": {},
417
+ "outputs": [],
418
+ "source": [
419
+ "# choose only the extreme and severe cases for modelling\n",
420
+ "cleaned = df_copy[df_copy['Severity'].isin(['Moderate'])]\n",
421
+ "cleaned.reset_index(drop=True, inplace=True)"
422
+ ]
423
+ },
424
+ {
425
+ "cell_type": "code",
426
+ "execution_count": 9,
427
+ "id": "de71c523-a59e-44b2-aa96-5f17d872c9c6",
428
+ "metadata": {},
429
+ "outputs": [],
430
+ "source": [
431
+ "headline = cleaned.binary_content"
432
+ ]
433
+ },
434
+ {
435
+ "cell_type": "code",
436
+ "execution_count": 10,
437
+ "id": "5b1e34e1",
438
+ "metadata": {},
439
+ "outputs": [
440
+ {
441
+ "data": {
442
+ "text/plain": [
443
+ "array(['accord_video', 'aftermath_shootout', 'allow_couple_flee',\n",
444
+ " 'approach_truck', 'barricade_patrol_car', 'cam_footage',\n",
445
+ " 'chaotic_morning_begin', 'chase_truck', 'decide_attempt',\n",
446
+ " 'emergency_room_stab', 'entry_suspect_fire', 'escape_vehicle_hand',\n",
447
+ " 'explain_officer_form', 'force_cop_dive', 'free_driver',\n",
448
+ " 'give_police', 'greene_madison_montgomery', 'harrow_footage',\n",
449
+ " 'hijack_attempt', 'hold_hostage', 'hole_truck', 'hospital_dayton',\n",
450
+ " 'hostage_access_road', 'id_woman_purse', 'initial_officer_catch',\n",
451
+ " 'international_airport_access', 'international_airport_london',\n",
452
+ " 'late_die_morning', 'lead_officer', 'left_front_tire',\n",
453
+ " 'london_police_officer', 'man_wife', 'minivan_foot_chase',\n",
454
+ " 'montgomery_truck_roll', 'multiple_bullet_hole',\n",
455
+ " 'name_ronald_barbara', 'negotiate_couple_release',\n",
456
+ " 'numerous_bullet_hole', 'officer_stop_minivan', 'ohio_highway_cop',\n",
457
+ " 'ohio_highway_patrol', 'ohio_state_highway', 'open_fire',\n",
458
+ " 'original_driver_truck', 'parked_car', 'parking_lot',\n",
459
+ " 'police_car_chase', 'rear_light', 'return_fire', 'road_accord',\n",
460
+ " 'several_bullet', 'several_county', 'several_county_stop',\n",
461
+ " 'shoot_truck', 'shut_negotiation', 'special_response_team',\n",
462
+ " 'spike_strip_stop', 'stop_dayton', 'stop_till_morning',\n",
463
+ " 'strip_chase', 'suspect_point_handgun', 'tactical_gear_approach',\n",
464
+ " 'taylor_male_driver', 'transport_miami', 'trooper_return_fire',\n",
465
+ " 'trooper_see', 'truck_drive', 'truck_drive_police',\n",
466
+ " 'turn_hostage_standoff', 'tv_officer_multiple',\n",
467
+ " 'unload_round_pistol', 'victim_truck',\n",
468
+ " 'visible_registration_harrow', 'wbn_body', 'wbns_truck',\n",
469
+ " 'white_werner'], dtype=object)"
470
+ ]
471
+ },
472
+ "execution_count": 10,
473
+ "metadata": {},
474
+ "output_type": "execute_result"
475
+ }
476
+ ],
477
+ "source": [
478
+ "headline[5]"
479
+ ]
480
+ },
481
+ {
482
+ "cell_type": "code",
483
+ "execution_count": 11,
484
+ "id": "677055b4-978e-4253-90f4-3f903662e225",
485
+ "metadata": {
486
+ "tags": []
487
+ },
488
+ "outputs": [],
489
+ "source": [
490
+ "# vectorise the words\n",
491
+ "doc_dict = gensim.corpora.Dictionary(headline)\n",
492
+ "docs_vecs = [doc_dict.doc2bow(doc) for doc in headline]"
493
+ ]
494
+ },
495
+ {
496
+ "cell_type": "code",
497
+ "execution_count": 12,
498
+ "id": "a54d1768-b069-4936-a156-deaf0b506d93",
499
+ "metadata": {
500
+ "tags": []
501
+ },
502
+ "outputs": [
503
+ {
504
+ "name": "stdout",
505
+ "output_type": "stream",
506
+ "text": [
507
+ "Number of unique tokens: 128764\n",
508
+ "Number of articles: 1761\n"
509
+ ]
510
+ }
511
+ ],
512
+ "source": [
513
+ "print('Number of unique tokens: %d' % len(doc_dict)) \n",
514
+ "print('Number of articles: %d' % len(docs_vecs)) "
515
+ ]
516
+ },
517
+ {
518
+ "cell_type": "code",
519
+ "execution_count": 13,
520
+ "id": "9147fa86-1503-4252-bd9b-92fea1e6a926",
521
+ "metadata": {
522
+ "scrolled": true,
523
+ "tags": []
524
+ },
525
+ "outputs": [
526
+ {
527
+ "name": "stdout",
528
+ "output_type": "stream",
529
+ "text": [
530
+ "[('heavy_rain', 134),\n",
531
+ " ('supply_chain', 92),\n",
532
+ " ('strong_wind', 77),\n",
533
+ " ('national_weather_service', 72),\n",
534
+ " ('tropical_storm', 59),\n",
535
+ " ('critical_destination_port', 58),\n",
536
+ " ('industrial_action', 53),\n",
537
+ " ('global_supply_chain', 53),\n",
538
+ " ('heavy_rainfall', 49),\n",
539
+ " ('international_airport', 46),\n",
540
+ " ('united_state', 46),\n",
541
+ " ('high_yard_density', 43),\n",
542
+ " ('social_medium', 41),\n",
543
+ " ('hong_kong', 41),\n",
544
+ " ('national_hurricane_center', 41),\n",
545
+ " ('global_port_tracker', 39),\n",
546
+ " ('global_shipping_disruption', 38),\n",
547
+ " ('sign_confidence_consumer', 38),\n",
548
+ " ('upgrade_import_forecast', 38),\n",
549
+ " ('port_authority', 37),\n",
550
+ " ('new_york_city', 36),\n",
551
+ " ('schedule_reliability', 36),\n",
552
+ " ('new_york', 34),\n",
553
+ " ('coastal_area', 34),\n",
554
+ " ('east_coast', 34),\n",
555
+ " ('global_economy', 34),\n",
556
+ " ('economic_growth', 33),\n",
557
+ " ('mediterranean_demand', 33),\n",
558
+ " ('trade_trade_statement', 33),\n",
559
+ " ('high_wind', 32),\n",
560
+ " ('customer_demand', 32),\n",
561
+ " ('trade_statement', 32),\n",
562
+ " ('severe_weather', 31),\n",
563
+ " ('american_market', 31),\n",
564
+ " ('relevant_information', 31),\n",
565
+ " ('tropical_cyclone', 31),\n",
566
+ " ('severe_thunderstorm', 31),\n",
567
+ " ('several_day', 31),\n",
568
+ " ('matadi_cape_town', 31),\n",
569
+ " ('pacific_trade_statement', 31),\n",
570
+ " ('situation_port_face', 31),\n",
571
+ " ('boost_business', 30),\n",
572
+ " ('important_transport_route', 30),\n",
573
+ " ('industry_share_market', 30),\n",
574
+ " ('part_commitment_provide', 30),\n",
575
+ " ('relevant_news_hope', 30),\n",
576
+ " ('state_port', 30),\n",
577
+ " ('current_situation', 29),\n",
578
+ " ('fourth_quarter', 29),\n",
579
+ " ('shanghai_ningbo_shekou', 29),\n",
580
+ " ('additional_capacity_cater', 29),\n",
581
+ " ('new_zealand', 29),\n",
582
+ " ('full_network', 29),\n",
583
+ " ('coast_port', 28),\n",
584
+ " ('cargo_move_supply', 28),\n",
585
+ " ('crew_member', 28),\n",
586
+ " ('storm_surge', 28),\n",
587
+ " ('great_china', 28),\n",
588
+ " ('many_company', 28),\n",
589
+ " ('day_trade_asia', 28),\n",
590
+ " ('dp_world', 27),\n",
591
+ " ('international_container', 27),\n",
592
+ " ('power_outage', 27),\n",
593
+ " ('abijian_conakry_maputo', 27),\n",
594
+ " ('america_latin', 27),\n",
595
+ " ('america_trade_maersk', 27),\n",
596
+ " ('arrival_trade_asia', 27),\n",
597
+ " ('click_link_stay', 27),\n",
598
+ " ('connect_customer_supply', 27),\n",
599
+ " ('currency_depreciation_demand', 27),\n",
600
+ " ('day_coast_day', 27),\n",
601
+ " ('fiscal_support_consumer', 27),\n",
602
+ " ('lift_outlook', 27),\n",
603
+ " ('lome_onne', 27),\n",
604
+ " ('monthly_please', 27),\n",
605
+ " ('pacific_america_trade', 27),\n",
606
+ " ('pointe_noire_balboa', 27),\n",
607
+ " ('post_subscribe_asia', 27),\n",
608
+ " ('professional_find_market', 27),\n",
609
+ " ('question_supply_chain', 27),\n",
610
+ " ('service_china', 27),\n",
611
+ " ('useful_subscribe_maersk', 27),\n",
612
+ " ('help_business', 26),\n",
613
+ " ('new_jersey', 26),\n",
614
+ " ('port_congestion', 26),\n",
615
+ " ('asia_trade_statement', 26),\n",
616
+ " ('america_trade_statement', 26),\n",
617
+ " ('international_longshore_warehouse', 25),\n",
618
+ " ('next_day', 25),\n",
619
+ " ('many_country', 25),\n",
620
+ " ('meet_firm', 25),\n",
621
+ " ('website_see_service', 25),\n",
622
+ " ('situation_trade', 25),\n",
623
+ " ('vietnam_cambodia_myanmar', 25),\n",
624
+ " ('main_port', 25),\n",
625
+ " ('high_inflation', 24),\n",
626
+ " ('port_situation', 24),\n",
627
+ " ('high_level', 24),\n",
628
+ " ('china_area', 24),\n",
629
+ " ('passenger_service', 23)]\n"
630
+ ]
631
+ }
632
+ ],
633
+ "source": [
634
+ "# Calculate word frequencies\n",
635
+ "word_frequencies = {doc_dict[word_id]: freq for word_id, freq in doc_dict.cfs.items()}\n",
636
+ "sorted_words = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)\n",
637
+ "\n",
638
+ "pprint(sorted_words[:100])"
639
+ ]
640
+ },
641
+ {
642
+ "cell_type": "markdown",
643
+ "id": "5ed78239-2ce1-4784-a8f4-4c7438c8627b",
644
+ "metadata": {},
645
+ "source": [
646
+ "# LDA Modelling"
647
+ ]
648
+ },
649
+ {
650
+ "cell_type": "markdown",
651
+ "id": "9db83273-461d-4f70-b23f-ec967579d94f",
652
+ "metadata": {},
653
+ "source": [
654
+ "## Benchmark Model"
655
+ ]
656
+ },
657
+ {
658
+ "cell_type": "code",
659
+ "execution_count": 15,
660
+ "id": "e6d577bd-9936-4d45-be90-345af2eb4827",
661
+ "metadata": {
662
+ "tags": []
663
+ },
664
+ "outputs": [
665
+ {
666
+ "name": "stdout",
667
+ "output_type": "stream",
668
+ "text": [
669
+ "CPU times: user 1.32 s, sys: 837 ms, total: 2.15 s\n",
670
+ "Wall time: 1min 21s\n"
671
+ ]
672
+ }
673
+ ],
674
+ "source": [
675
+ "%%time\n",
676
+ "\n",
677
+ "# Build LDA benchmark model\n",
678
+ "lda_model = gensim.models.LdaMulticore(corpus=docs_vecs,\n",
679
+ " id2word=doc_dict,\n",
680
+ " num_topics=4, \n",
681
+ " random_state=42,\n",
682
+ " chunksize=100,\n",
683
+ " passes=10,\n",
684
+ " per_word_topics=True)"
685
+ ]
686
+ },
687
+ {
688
+ "cell_type": "code",
689
+ "execution_count": 16,
690
+ "id": "c4f1521f-5f43-40d2-a3a3-a8ac2ca6fec2",
691
+ "metadata": {
692
+ "tags": []
693
+ },
694
+ "outputs": [
695
+ {
696
+ "name": "stdout",
697
+ "output_type": "stream",
698
+ "text": [
699
+ "[(0,\n",
700
+ " '0.000*\"heavy_rain\" + 0.000*\"tropical_storm\" + '\n",
701
+ " '0.000*\"national_weather_service\" + 0.000*\"strong_wind\" + '\n",
702
+ " '0.000*\"average_day\" + 0.000*\"united_state\" + 0.000*\"supply_chain\" + '\n",
703
+ " '0.000*\"port_congestion\" + 0.000*\"heavy_rainfall\" + '\n",
704
+ " '0.000*\"national_hurricane_center\"'),\n",
705
+ " (1,\n",
706
+ " '0.000*\"global_port_tracker\" + 0.000*\"global_shipping_disruption\" + '\n",
707
+ " '0.000*\"upgrade_import_forecast\" + 0.000*\"sign_confidence_consumer\" + '\n",
708
+ " '0.000*\"supply_chain\" + 0.000*\"american_market\" + '\n",
709
+ " '0.000*\"relevant_information\" + 0.000*\"relevant_news_hope\" + '\n",
710
+ " '0.000*\"state_port\" + 0.000*\"industry_share_market\"'),\n",
711
+ " (2,\n",
712
+ " '0.001*\"heavy_rain\" + 0.000*\"supply_chain\" + 0.000*\"united_state\" + '\n",
713
+ " '0.000*\"critical_destination_port\" + 0.000*\"national_weather_service\" + '\n",
714
+ " '0.000*\"strong_wind\" + 0.000*\"high_yard_density\" + '\n",
715
+ " '0.000*\"mediterranean_demand\" + 0.000*\"pacific_trade_statement\" + '\n",
716
+ " '0.000*\"trade_trade_statement\"'),\n",
717
+ " (3,\n",
718
+ " '0.000*\"heavy_rain\" + 0.000*\"critical_destination_port\" + '\n",
719
+ " '0.000*\"global_supply_chain\" + 0.000*\"trade_statement\" + '\n",
720
+ " '0.000*\"schedule_reliability\" + 0.000*\"day_trade_asia\" + '\n",
721
+ " '0.000*\"full_network\" + 0.000*\"full_truck\" + 0.000*\"january_trade_maersk\" + '\n",
722
+ " '0.000*\"stable_network\"')]\n"
723
+ ]
724
+ }
725
+ ],
726
+ "source": [
727
+ "from pprint import pprint\n",
728
+ "\n",
729
+ "# Print the top keywords for each of the model's topics\n",
730
+ "pprint(lda_model.print_topics())\n",
731
+ "doc_lda = lda_model[docs_vecs]"
732
+ ]
733
+ },
734
+ {
735
+ "cell_type": "code",
736
+ "execution_count": 17,
737
+ "id": "fd57b1f4-a6cd-41e8-964f-d8a1d30aa3c9",
738
+ "metadata": {
739
+ "tags": []
740
+ },
741
+ "outputs": [
742
+ {
743
+ "name": "stdout",
744
+ "output_type": "stream",
745
+ "text": [
746
+ "\n",
747
+ "Coherence Score LDAModel: 0.5135998374037359\n"
748
+ ]
749
+ }
750
+ ],
751
+ "source": [
752
+ "# Compute Benchmark Coherence Score\n",
753
+ "coherence_model_lda = CoherenceModel(model=lda_model, texts=headline, dictionary=doc_dict, coherence='c_v')\n",
754
+ "coherence_lda = coherence_model_lda.get_coherence()\n",
755
+ "print('\\nCoherence Score LDAModel: ', coherence_lda)"
756
+ ]
757
+ },
758
+ {
759
+ "cell_type": "code",
760
+ "execution_count": 18,
761
+ "id": "152e5a3a-7afe-4fb8-a02f-d7492ad80936",
762
+ "metadata": {
763
+ "tags": []
764
+ },
765
+ "outputs": [
766
+ {
767
+ "name": "stdout",
768
+ "output_type": "stream",
769
+ "text": [
770
+ "\n",
771
+ "Perplexity for LDAModel: -11.69195177074239\n"
772
+ ]
773
+ }
774
+ ],
775
+ "source": [
776
+ "# Compute Benchmark Perplexity\n",
777
+ "perplex= lda_model.log_perplexity(docs_vecs, total_docs=None) #For LDAModel\n",
778
+ " # log_perplexity returns the per-word likelihood bound (higher, i.e. closer to 0, is better); perplexity itself is 2**(-bound), where lower is better.\n",
779
+ "\n",
780
+ "print('\\nPerplexity for LDAModel: ', perplex)"
781
+ ]
782
+ },
783
+ {
784
+ "cell_type": "code",
785
+ "execution_count": 20,
786
+ "id": "7dd3a60a-5c6f-4249-9868-30528a5b0ac8",
787
+ "metadata": {},
788
+ "outputs": [],
789
+ "source": [
790
+ "from pprint import pprint\n",
791
+ "import pyLDAvis\n",
792
+ "import pyLDAvis.gensim_models as gensimvis\n",
793
+ "\n",
794
+ "# feed the LDA model into the pyLDAvis instance\n",
795
+ "pyLDAvis.enable_notebook()\n",
796
+ "visual= gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
797
+ "\n",
798
+ "# Save the output to the html file\n",
799
+ "pyLDAvis.save_html(visual, \"data/topic_viz_benchmark_moderate.html\")"
800
+ ]
801
+ },
802
+ {
803
+ "cell_type": "code",
804
+ "execution_count": 21,
805
+ "id": "3a5612f7-6358-49c8-aba9-8aa54e275c6f",
806
+ "metadata": {},
807
+ "outputs": [
808
+ {
809
+ "data": {
810
+ "text/html": [
811
+ "<div>\n",
812
+ "<style scoped>\n",
813
+ " .dataframe tbody tr th:only-of-type {\n",
814
+ " vertical-align: middle;\n",
815
+ " }\n",
816
+ "\n",
817
+ " .dataframe tbody tr th {\n",
818
+ " vertical-align: top;\n",
819
+ " }\n",
820
+ "\n",
821
+ " .dataframe thead th {\n",
822
+ " text-align: right;\n",
823
+ " }\n",
824
+ "</style>\n",
825
+ "<table border=\"1\" class=\"dataframe\">\n",
826
+ " <thead>\n",
827
+ " <tr style=\"text-align: right;\">\n",
828
+ " <th></th>\n",
829
+ " <th>Topic Keywords</th>\n",
830
+ " <th>Topic ID</th>\n",
831
+ " </tr>\n",
832
+ " </thead>\n",
833
+ " <tbody>\n",
834
+ " <tr>\n",
835
+ " <th>0</th>\n",
836
+ " <td>0.000*\"heavy_rain\" + 0.000*\"tropical_storm\" + 0.000*\"national_weather_service\" + 0.000*\"strong_wind\" + 0.000*\"average_day\" + 0.000*\"united_state\"</td>\n",
837
+ " <td>0</td>\n",
838
+ " </tr>\n",
839
+ " <tr>\n",
840
+ " <th>1</th>\n",
841
+ " <td>0.000*\"global_port_tracker\" + 0.000*\"global_shipping_disruption\" + 0.000*\"upgrade_import_forecast\" + 0.000*\"sign_confidence_consumer\" + 0.000*\"supply_chain\" + 0.000*\"american_market\"</td>\n",
842
+ " <td>1</td>\n",
843
+ " </tr>\n",
844
+ " <tr>\n",
845
+ " <th>2</th>\n",
846
+ " <td>0.001*\"heavy_rain\" + 0.000*\"supply_chain\" + 0.000*\"united_state\" + 0.000*\"critical_destination_port\" + 0.000*\"national_weather_service\" + 0.000*\"strong_wind\"</td>\n",
847
+ " <td>2</td>\n",
848
+ " </tr>\n",
849
+ " <tr>\n",
850
+ " <th>3</th>\n",
851
+ " <td>0.000*\"heavy_rain\" + 0.000*\"critical_destination_port\" + 0.000*\"global_supply_chain\" + 0.000*\"trade_statement\" + 0.000*\"schedule_reliability\" + 0.000*\"day_trade_asia\"</td>\n",
852
+ " <td>3</td>\n",
853
+ " </tr>\n",
854
+ " </tbody>\n",
855
+ "</table>\n",
856
+ "</div>"
857
+ ],
858
+ "text/plain": [
859
+ " Topic Keywords \\\n",
860
+ "0 0.000*\"heavy_rain\" + 0.000*\"tropical_storm\" + 0.000*\"national_weather_service\" + 0.000*\"strong_wind\" + 0.000*\"average_day\" + 0.000*\"united_state\" \n",
861
+ "1 0.000*\"global_port_tracker\" + 0.000*\"global_shipping_disruption\" + 0.000*\"upgrade_import_forecast\" + 0.000*\"sign_confidence_consumer\" + 0.000*\"supply_chain\" + 0.000*\"american_market\" \n",
862
+ "2 0.001*\"heavy_rain\" + 0.000*\"supply_chain\" + 0.000*\"united_state\" + 0.000*\"critical_destination_port\" + 0.000*\"national_weather_service\" + 0.000*\"strong_wind\" \n",
863
+ "3 0.000*\"heavy_rain\" + 0.000*\"critical_destination_port\" + 0.000*\"global_supply_chain\" + 0.000*\"trade_statement\" + 0.000*\"schedule_reliability\" + 0.000*\"day_trade_asia\" \n",
864
+ "\n",
865
+ " Topic ID \n",
866
+ "0 0 \n",
867
+ "1 1 \n",
868
+ "2 2 \n",
869
+ "3 3 "
870
+ ]
871
+ },
872
+ "execution_count": 21,
873
+ "metadata": {},
874
+ "output_type": "execute_result"
875
+ }
876
+ ],
877
+ "source": [
878
+ "pd.set_option('max_colwidth', 200)\n",
879
+ "# Get the topics and their top keywords into a dataframe\n",
880
+ "topics = lda_model.show_topics(num_words=6) \n",
881
+ "\n",
882
+ "topic_keywords = pd.DataFrame()\n",
883
+ "for topic_id, topic in topics:\n",
884
+ " topic_keywords.at[topic_id, 'Topic Keywords'] = topic\n",
885
+ "\n",
886
+ "topic_keywords['Topic ID'] = topic_keywords.index\n",
887
+ "# topic_keywords['Topic Name'] = topic_mapping \n",
888
+ "topic_keywords"
889
+ ]
890
+ },
891
+ {
892
+ "cell_type": "code",
893
+ "execution_count": 22,
894
+ "id": "26da4eea-06a0-4ff7-ae14-2f40fa0db04b",
895
+ "metadata": {},
896
+ "outputs": [],
897
+ "source": [
898
+ "# break "
899
+ ]
900
+ },
901
+ {
902
+ "cell_type": "markdown",
903
+ "id": "1895598f-3e5f-4acd-83a6-4491cc90f695",
904
+ "metadata": {},
905
+ "source": [
906
+ "# Hyperparameter Tuning and Evaluation"
907
+ ]
908
+ },
909
+ {
910
+ "cell_type": "markdown",
911
+ "id": "47136c89-ff7b-4ac9-840f-04122fe62160",
912
+ "metadata": {},
913
+ "source": [
914
+ "Run the cells below only when re-modelling with new datasets; the whole tuning and evaluation process may take hours to run."
915
+ ]
916
+ },
917
+ {
918
+ "cell_type": "code",
919
+ "execution_count": 23,
920
+ "id": "c79ca5c4-e078-43ce-a430-8c1ed93dcd64",
921
+ "metadata": {},
922
+ "outputs": [],
923
+ "source": [
924
+ "# hyperparameter tuning (alpha and beta)\n",
925
+ "def compute_coherence_values(corpus, dictionary, k, a, b):\n",
926
+ " \n",
927
+ " lda_model = gensim.models.LdaMulticore(corpus=corpus,\n",
928
+ " id2word=dictionary,\n",
929
+ " num_topics=k, \n",
930
+ " random_state=42,\n",
931
+ " chunksize=100,\n",
932
+ " passes=10,\n",
933
+ " alpha=a,\n",
934
+ " eta=b)\n",
935
+ " \n",
936
+ " coherence_model_lda = CoherenceModel(model=lda_model, texts=headline, dictionary=doc_dict, coherence='c_v')\n",
937
+ " coherence = coherence_model_lda.get_coherence()\n",
938
+ " perplex = lda_model.log_perplexity(docs_vecs, total_docs=None) \n",
939
+ " \n",
940
+ " return coherence, perplex"
941
+ ]
942
+ },
943
+ {
944
+ "cell_type": "code",
945
+ "execution_count": 24,
946
+ "id": "1c3c8478-9336-40f2-bb30-a37db4243b67",
947
+ "metadata": {},
948
+ "outputs": [],
949
+ "source": [
950
+ "# setup\n",
951
+ "import numpy as np\n",
952
+ "\n",
953
+ "from gensim.models import CoherenceModel\n",
954
+ "\n",
955
+ "model_list = []\n",
956
+ "coherence_values = []\n",
957
+ "perplexity_values = []\n",
958
+ "model_topics = []\n",
959
+ "alpha_result = []\n",
960
+ "beta_result = []\n",
961
+ "\n",
962
+ "# topic ranges\n",
963
+ "num_topics = range(4, 13)\n",
964
+ "\n",
965
+ "# Alpha parameter\n",
966
+ "alpha = list(np.arange(0.31, 1, 0.3))\n",
967
+ "alpha.append('symmetric')\n",
968
+ "alpha.append('asymmetric')\n",
969
+ "\n",
970
+ "# Beta parameter\n",
971
+ "beta = list(np.arange(0.31, 1, 0.3))\n",
972
+ "beta.append('symmetric')"
973
+ ]
974
+ },
975
+ {
976
+ "cell_type": "markdown",
977
+ "id": "c7e6bc53-0b57-4858-879a-644eca54ddbc",
978
+ "metadata": {},
979
+ "source": [
980
+ "Rationale behind the alpha and eta (beta) hyperparameters: https://stats.stackexchange.com/questions/37405/natural-interpretation-for-lda-hyperparameters"
981
+ ]
982
+ },
983
+ {
984
+ "cell_type": "code",
985
+ "execution_count": 25,
986
+ "id": "02877b81-32df-4168-8e62-4cbca2be100b",
987
+ "metadata": {
988
+ "tags": []
989
+ },
990
+ "outputs": [
991
+ {
992
+ "name": "stdout",
993
+ "output_type": "stream",
994
+ "text": [
995
+ "Topic range: range(4, 13)\n",
996
+ "Alpha: [0.31, 0.61, 0.9099999999999999, 'symmetric', 'asymmetric']\n",
997
+ "Beta: [0.31, 0.61, 0.9099999999999999, 'symmetric']\n"
998
+ ]
999
+ }
1000
+ ],
1001
+ "source": [
1002
+ "print(\"Topic range: \",num_topics)\n",
1003
+ "print(\"Alpha: \",alpha)\n",
1004
+ "print(\"Beta: \", beta)"
1005
+ ]
1006
+ },
1007
+ {
1008
+ "cell_type": "code",
1009
+ "execution_count": 26,
1010
+ "id": "3c1f703c-4778-467f-a12e-0c18eeb274c5",
1011
+ "metadata": {},
1012
+ "outputs": [
1013
+ {
1014
+ "name": "stdout",
1015
+ "output_type": "stream",
1016
+ "text": [
1017
+ "2024-06-30 15:33:21.408395\n",
1018
+ "#Topics: 4, CV Score: 0.5164676429141806, PV Score: -11.613380568733923, Alpha: 0.31, Beta: 0.31\n",
1019
+ "#Topics: 5, CV Score: 0.5683002282838892, PV Score: -11.563534433528984, Alpha: 0.31, Beta: 0.31\n",
1020
+ "#Topics: 6, CV Score: 0.46555259667124016, PV Score: -11.53219474749234, Alpha: 0.31, Beta: 0.31\n",
1021
+ "#Topics: 7, CV Score: 0.5159944038644447, PV Score: -11.50783852707268, Alpha: 0.31, Beta: 0.31\n",
1022
+ "#Topics: 8, CV Score: 0.4639539916466071, PV Score: -11.49178161514173, Alpha: 0.31, Beta: 0.31\n",
1023
+ "#Topics: 9, CV Score: 0.6087247241979915, PV Score: -11.474821405795332, Alpha: 0.31, Beta: 0.31\n",
1024
+ "#Topics: 10, CV Score: 0.5444179957484441, PV Score: -11.469740741939871, Alpha: 0.31, Beta: 0.31\n",
1025
+ "#Topics: 11, CV Score: 0.5694955423900976, PV Score: -11.473400251077415, Alpha: 0.31, Beta: 0.31\n",
1026
+ "#Topics: 12, CV Score: 0.49528605221915506, PV Score: -11.464966921298775, Alpha: 0.31, Beta: 0.31\n",
1027
+ "#Topics: 4, CV Score: 0.4846077269482417, PV Score: -11.615188474292983, Alpha: 0.31, Beta: 0.61\n",
1028
+ "#Topics: 5, CV Score: 0.40044534503210266, PV Score: -11.571275497697407, Alpha: 0.31, Beta: 0.61\n",
1029
+ "#Topics: 6, CV Score: 0.47710647516703003, PV Score: -11.542620661350824, Alpha: 0.31, Beta: 0.61\n",
1030
+ "#Topics: 7, CV Score: 0.41352955820583087, PV Score: -11.540942420187077, Alpha: 0.31, Beta: 0.61\n",
1031
+ "#Topics: 8, CV Score: 0.4289987234137122, PV Score: -11.52253612455723, Alpha: 0.31, Beta: 0.61\n",
1032
+ "#Topics: 9, CV Score: 0.4870040213227611, PV Score: -11.517891900057037, Alpha: 0.31, Beta: 0.61\n",
1033
+ "#Topics: 10, CV Score: 0.550960377654979, PV Score: -11.515178838931973, Alpha: 0.31, Beta: 0.61\n",
1034
+ "#Topics: 11, CV Score: 0.54629246756211, PV Score: -11.521576629251783, Alpha: 0.31, Beta: 0.61\n",
1035
+ "#Topics: 12, CV Score: 0.4734093628757035, PV Score: -11.663901784573016, Alpha: 0.31, Beta: 0.61\n",
1036
+ "#Topics: 4, CV Score: 0.5226264223935461, PV Score: -11.581600670664274, Alpha: 0.31, Beta: 0.9099999999999999\n",
1037
+ "#Topics: 5, CV Score: 0.4815626090375563, PV Score: -11.57887038938498, Alpha: 0.31, Beta: 0.9099999999999999\n",
1038
+ "#Topics: 6, CV Score: 0.4822722108681728, PV Score: -11.559852785176927, Alpha: 0.31, Beta: 0.9099999999999999\n",
1039
+ "#Topics: 7, CV Score: 0.5190214814534763, PV Score: -11.58883143718404, Alpha: 0.31, Beta: 0.9099999999999999\n",
1040
+ "#Topics: 8, CV Score: 0.45997514950408147, PV Score: -11.55820866243133, Alpha: 0.31, Beta: 0.9099999999999999\n",
1041
+ "#Topics: 9, CV Score: 0.36374367474380254, PV Score: -11.63640550749839, Alpha: 0.31, Beta: 0.9099999999999999\n",
1042
+ "#Topics: 10, CV Score: 0.5643564534964567, PV Score: -11.557503103730841, Alpha: 0.31, Beta: 0.9099999999999999\n",
1043
+ "#Topics: 11, CV Score: 0.5469185375162117, PV Score: -11.5647045020449, Alpha: 0.31, Beta: 0.9099999999999999\n",
1044
+ "#Topics: 12, CV Score: 0.5792875415068292, PV Score: -11.806654395764777, Alpha: 0.31, Beta: 0.9099999999999999\n",
1045
+ "#Topics: 4, CV Score: 0.49804879882635616, PV Score: -11.642897316807662, Alpha: 0.31, Beta: symmetric\n",
1046
+ "#Topics: 5, CV Score: 0.5778525494321111, PV Score: -11.619415198399734, Alpha: 0.31, Beta: symmetric\n",
1047
+ "#Topics: 6, CV Score: 0.46263841554243873, PV Score: -11.603945568112582, Alpha: 0.31, Beta: symmetric\n",
1048
+ "#Topics: 7, CV Score: 0.4751517261203773, PV Score: -11.610468303646345, Alpha: 0.31, Beta: symmetric\n",
1049
+ "#Topics: 8, CV Score: 0.3914935846000445, PV Score: -11.642292032598931, Alpha: 0.31, Beta: symmetric\n",
1050
+ "#Topics: 9, CV Score: 0.6211150499938938, PV Score: -11.582063142378686, Alpha: 0.31, Beta: symmetric\n",
1051
+ "#Topics: 10, CV Score: 0.6020085770917671, PV Score: -11.583284044840946, Alpha: 0.31, Beta: symmetric\n",
1052
+ "#Topics: 11, CV Score: 0.6175933853797404, PV Score: -11.59165919505012, Alpha: 0.31, Beta: symmetric\n",
1053
+ "#Topics: 12, CV Score: 0.5908368128582339, PV Score: -11.564916248235656, Alpha: 0.31, Beta: symmetric\n",
1054
+ "#Topics: 4, CV Score: 0.5003561625277873, PV Score: -11.637393333313193, Alpha: 0.61, Beta: 0.31\n",
1055
+ "#Topics: 5, CV Score: 0.5317007800262091, PV Score: -11.641278862591506, Alpha: 0.61, Beta: 0.31\n",
1056
+ "#Topics: 6, CV Score: 0.557100339849869, PV Score: -11.584370827514544, Alpha: 0.61, Beta: 0.31\n",
1057
+ "#Topics: 7, CV Score: 0.5172402989720184, PV Score: -11.566723792196806, Alpha: 0.61, Beta: 0.31\n",
1058
+ "#Topics: 8, CV Score: 0.4886338324086602, PV Score: -11.54297247369616, Alpha: 0.61, Beta: 0.31\n",
1059
+ "#Topics: 9, CV Score: 0.5747517437985423, PV Score: -11.543691538358624, Alpha: 0.61, Beta: 0.31\n",
1060
+ "#Topics: 10, CV Score: 0.6423710177739427, PV Score: -11.54859461537098, Alpha: 0.61, Beta: 0.31\n",
1061
+ "#Topics: 11, CV Score: 0.5144410257534869, PV Score: -11.73660202888069, Alpha: 0.61, Beta: 0.31\n",
1062
+ "#Topics: 12, CV Score: 0.5717578206426642, PV Score: -11.530830842170962, Alpha: 0.61, Beta: 0.31\n",
1063
+ "#Topics: 4, CV Score: 0.5180085860214778, PV Score: -11.597568405435847, Alpha: 0.61, Beta: 0.61\n",
1064
+ "#Topics: 5, CV Score: 0.48198140101849224, PV Score: -11.580472178461592, Alpha: 0.61, Beta: 0.61\n",
1065
+ "#Topics: 6, CV Score: 0.5224885861991673, PV Score: -11.607306821970512, Alpha: 0.61, Beta: 0.61\n",
1066
+ "#Topics: 7, CV Score: 0.5116028282066244, PV Score: -11.566861788342026, Alpha: 0.61, Beta: 0.61\n",
1067
+ "#Topics: 8, CV Score: 0.4497755401591277, PV Score: -11.575814895292964, Alpha: 0.61, Beta: 0.61\n",
1068
+ "#Topics: 9, CV Score: 0.5327566529438476, PV Score: -11.586548862112604, Alpha: 0.61, Beta: 0.61\n",
1069
+ "#Topics: 10, CV Score: 0.5214711529865992, PV Score: -11.628991593567058, Alpha: 0.61, Beta: 0.61\n",
1070
+ "#Topics: 11, CV Score: 0.6139640248470433, PV Score: -11.594641144555219, Alpha: 0.61, Beta: 0.61\n",
1071
+ "#Topics: 12, CV Score: 0.5356515378529471, PV Score: -11.59298627864858, Alpha: 0.61, Beta: 0.61\n",
1072
+ "#Topics: 4, CV Score: 0.49160818384314564, PV Score: -11.60400992430138, Alpha: 0.61, Beta: 0.9099999999999999\n",
1073
+ "#Topics: 5, CV Score: 0.489468146274511, PV Score: -11.596778903516313, Alpha: 0.61, Beta: 0.9099999999999999\n",
1074
+ "#Topics: 6, CV Score: 0.47456800821881234, PV Score: -11.59473954643288, Alpha: 0.61, Beta: 0.9099999999999999\n",
1075
+ "#Topics: 7, CV Score: 0.5576401494669467, PV Score: -11.594596066878076, Alpha: 0.61, Beta: 0.9099999999999999\n",
1076
+ "#Topics: 8, CV Score: 0.4667184261166916, PV Score: -11.60378701602699, Alpha: 0.61, Beta: 0.9099999999999999\n",
1077
+ "#Topics: 9, CV Score: 0.5695086900271756, PV Score: -11.609075892863483, Alpha: 0.61, Beta: 0.9099999999999999\n",
1078
+ "#Topics: 10, CV Score: 0.6005670921966928, PV Score: -11.614071327825993, Alpha: 0.61, Beta: 0.9099999999999999\n",
1079
+ "#Topics: 11, CV Score: 0.5480401775800159, PV Score: -11.623267686418437, Alpha: 0.61, Beta: 0.9099999999999999\n",
1080
+ "#Topics: 12, CV Score: 0.5396539221868254, PV Score: -11.712120797560939, Alpha: 0.61, Beta: 0.9099999999999999\n",
1081
+ "#Topics: 4, CV Score: 0.45957893891496127, PV Score: -11.6729179593589, Alpha: 0.61, Beta: symmetric\n",
1082
+ "#Topics: 5, CV Score: 0.5497056258415677, PV Score: -11.65337679958747, Alpha: 0.61, Beta: symmetric\n",
1083
+ "#Topics: 6, CV Score: 0.5704158041118883, PV Score: -11.645450690343361, Alpha: 0.61, Beta: symmetric\n",
1084
+ "#Topics: 7, CV Score: 0.5268609558304878, PV Score: -11.638214852770608, Alpha: 0.61, Beta: symmetric\n",
1085
+ "#Topics: 8, CV Score: 0.4658802544250874, PV Score: -11.63919464483393, Alpha: 0.61, Beta: symmetric\n",
1086
+ "#Topics: 9, CV Score: 0.6506499863493337, PV Score: -11.63849523064737, Alpha: 0.61, Beta: symmetric\n",
1087
+ "#Topics: 10, CV Score: 0.649136503615956, PV Score: -11.616778698707527, Alpha: 0.61, Beta: symmetric\n",
1088
+ "#Topics: 11, CV Score: 0.6461235115651532, PV Score: -11.660982724024588, Alpha: 0.61, Beta: symmetric\n",
1089
+ "#Topics: 12, CV Score: 0.5795686612759658, PV Score: -11.654057997902278, Alpha: 0.61, Beta: symmetric\n",
1090
+ "#Topics: 4, CV Score: 0.43540817795833214, PV Score: -11.686659200023914, Alpha: 0.9099999999999999, Beta: 0.31\n",
1091
+ "#Topics: 5, CV Score: 0.5462127792495466, PV Score: -11.62457497965543, Alpha: 0.9099999999999999, Beta: 0.31\n",
1092
+ "#Topics: 6, CV Score: 0.5524384036237932, PV Score: -11.604354138411734, Alpha: 0.9099999999999999, Beta: 0.31\n",
1093
+ "#Topics: 7, CV Score: 0.5902480026727759, PV Score: -11.588582708471723, Alpha: 0.9099999999999999, Beta: 0.31\n",
1094
+ "#Topics: 8, CV Score: 0.47181028096737715, PV Score: -11.583687892570778, Alpha: 0.9099999999999999, Beta: 0.31\n",
1095
+ "#Topics: 9, CV Score: 0.6963152362329474, PV Score: -11.581137216572229, Alpha: 0.9099999999999999, Beta: 0.31\n",
1096
+ "#Topics: 10, CV Score: 0.6356546949964338, PV Score: -11.583413852955387, Alpha: 0.9099999999999999, Beta: 0.31\n",
1097
+ "#Topics: 11, CV Score: 0.6110954828509537, PV Score: -11.578235683186872, Alpha: 0.9099999999999999, Beta: 0.31\n",
1098
+ "#Topics: 12, CV Score: 0.5948748370571434, PV Score: -11.591490601197242, Alpha: 0.9099999999999999, Beta: 0.31\n",
1099
+ "#Topics: 4, CV Score: 0.48818167146918845, PV Score: -11.61574038870984, Alpha: 0.9099999999999999, Beta: 0.61\n",
1100
+ "#Topics: 5, CV Score: 0.4509752119451145, PV Score: -11.604195597608339, Alpha: 0.9099999999999999, Beta: 0.61\n",
1101
+ "#Topics: 6, CV Score: 0.35645184464437624, PV Score: -11.637862647568788, Alpha: 0.9099999999999999, Beta: 0.61\n",
1102
+ "#Topics: 7, CV Score: 0.526882700494322, PV Score: -11.653081895856653, Alpha: 0.9099999999999999, Beta: 0.61\n",
1103
+ "#Topics: 8, CV Score: 0.42882071728968496, PV Score: -11.609087232613648, Alpha: 0.9099999999999999, Beta: 0.61\n",
1104
+ "#Topics: 9, CV Score: 0.6693854566085815, PV Score: -11.614610403392634, Alpha: 0.9099999999999999, Beta: 0.61\n",
1105
+ "#Topics: 10, CV Score: 0.6179132712789255, PV Score: -11.623350711437743, Alpha: 0.9099999999999999, Beta: 0.61\n",
1106
+ "#Topics: 11, CV Score: 0.6085517947565203, PV Score: -11.63547085027884, Alpha: 0.9099999999999999, Beta: 0.61\n",
1107
+ "#Topics: 12, CV Score: 0.5577009173064904, PV Score: -11.651522052657057, Alpha: 0.9099999999999999, Beta: 0.61\n",
1108
+ "#Topics: 4, CV Score: 0.5230671592592115, PV Score: -11.623717800797593, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1109
+ "#Topics: 5, CV Score: 0.42768193852006214, PV Score: -11.62144694067679, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1110
+ "#Topics: 6, CV Score: 0.5364419205784232, PV Score: -11.624175638013885, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1111
+ "#Topics: 7, CV Score: 0.4857096769423311, PV Score: -11.63428665005544, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1112
+ "#Topics: 8, CV Score: 0.5044375689801471, PV Score: -11.642138170613716, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1113
+ "#Topics: 9, CV Score: 0.5696949744243359, PV Score: -11.654881948442265, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1114
+ "#Topics: 10, CV Score: 0.6326338442828181, PV Score: -11.66680515031184, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1115
+ "#Topics: 11, CV Score: 0.5727395073356502, PV Score: -11.69273499062231, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1116
+ "#Topics: 12, CV Score: 0.559542950238316, PV Score: -11.748344593064868, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1117
+ "#Topics: 4, CV Score: 0.3804434166795542, PV Score: -11.768219382154312, Alpha: 0.9099999999999999, Beta: symmetric\n",
1118
+ "#Topics: 5, CV Score: 0.5346775324429552, PV Score: -11.668832927089504, Alpha: 0.9099999999999999, Beta: symmetric\n",
1119
+ "#Topics: 6, CV Score: 0.35761369835980755, PV Score: -11.7234783761902, Alpha: 0.9099999999999999, Beta: symmetric\n",
1120
+ "#Topics: 7, CV Score: 0.5739180802234388, PV Score: -11.650366733021364, Alpha: 0.9099999999999999, Beta: symmetric\n",
1121
+ "#Topics: 8, CV Score: 0.4690450833134745, PV Score: -11.671911408908096, Alpha: 0.9099999999999999, Beta: symmetric\n",
1122
+ "#Topics: 9, CV Score: 0.6456959957990995, PV Score: -11.692264722739667, Alpha: 0.9099999999999999, Beta: symmetric\n",
1123
+ "#Topics: 10, CV Score: 0.6358178052250946, PV Score: -11.694376238908927, Alpha: 0.9099999999999999, Beta: symmetric\n",
1124
+ "#Topics: 11, CV Score: 0.6545497380387766, PV Score: -11.728265629814926, Alpha: 0.9099999999999999, Beta: symmetric\n",
1125
+ "#Topics: 12, CV Score: 0.5668227847904856, PV Score: -11.831189552283824, Alpha: 0.9099999999999999, Beta: symmetric\n",
1126
+ "#Topics: 4, CV Score: 0.5079465498144028, PV Score: -11.744019308081587, Alpha: symmetric, Beta: 0.31\n",
1127
+ "#Topics: 5, CV Score: 0.5207685525937951, PV Score: -11.56250304658784, Alpha: symmetric, Beta: 0.31\n",
1128
+ "#Topics: 6, CV Score: 0.4520436901506503, PV Score: -11.517440615925524, Alpha: symmetric, Beta: 0.31\n",
1129
+ "#Topics: 7, CV Score: 0.3984484028061592, PV Score: -11.49927289529406, Alpha: symmetric, Beta: 0.31\n",
1130
+ "#Topics: 8, CV Score: 0.46282464845379434, PV Score: -11.447392562926892, Alpha: symmetric, Beta: 0.31\n",
1131
+ "#Topics: 9, CV Score: 0.5628759566317312, PV Score: -11.431744800328639, Alpha: symmetric, Beta: 0.31\n",
1132
+ "#Topics: 10, CV Score: 0.5550416515331734, PV Score: -11.414885267043056, Alpha: symmetric, Beta: 0.31\n",
1133
+ "#Topics: 11, CV Score: 0.5292928552625016, PV Score: -11.412137160368818, Alpha: symmetric, Beta: 0.31\n",
1134
+ "#Topics: 12, CV Score: 0.4300247772463878, PV Score: -11.42635756927043, Alpha: symmetric, Beta: 0.31\n",
1135
+ "#Topics: 4, CV Score: 0.551938709076145, PV Score: -11.646172984411194, Alpha: symmetric, Beta: 0.61\n",
1136
+ "#Topics: 5, CV Score: 0.4809237301776637, PV Score: -11.559942575549936, Alpha: symmetric, Beta: 0.61\n",
1137
+ "#Topics: 6, CV Score: 0.46228300273399897, PV Score: -11.515382800949316, Alpha: symmetric, Beta: 0.61\n",
1138
+ "#Topics: 7, CV Score: 0.46287366351823206, PV Score: -11.4950582564519, Alpha: symmetric, Beta: 0.61\n",
1139
+ "#Topics: 8, CV Score: 0.4606576501635787, PV Score: -11.486398241110296, Alpha: symmetric, Beta: 0.61\n",
1140
+ "#Topics: 9, CV Score: 0.4481608851641272, PV Score: -11.520590899826077, Alpha: symmetric, Beta: 0.61\n",
1141
+ "#Topics: 10, CV Score: 0.5543837883238459, PV Score: -11.57085700936084, Alpha: symmetric, Beta: 0.61\n",
1142
+ "#Topics: 11, CV Score: 0.5316944180923253, PV Score: -11.578237826778107, Alpha: symmetric, Beta: 0.61\n",
1143
+ "#Topics: 12, CV Score: 0.4687550311670818, PV Score: -11.453118532016195, Alpha: symmetric, Beta: 0.61\n",
1144
+ "#Topics: 4, CV Score: 0.5365308671952822, PV Score: -11.58505859292843, Alpha: symmetric, Beta: 0.9099999999999999\n",
1145
+ "#Topics: 5, CV Score: 0.4573271215291844, PV Score: -11.55400501711275, Alpha: symmetric, Beta: 0.9099999999999999\n",
1146
+ "#Topics: 6, CV Score: 0.4845637557935425, PV Score: -11.538682634708005, Alpha: symmetric, Beta: 0.9099999999999999\n",
1147
+ "#Topics: 7, CV Score: 0.43505334283558816, PV Score: -11.560310334374622, Alpha: symmetric, Beta: 0.9099999999999999\n",
1148
+ "#Topics: 8, CV Score: 0.48972094471582783, PV Score: -11.52262741120793, Alpha: symmetric, Beta: 0.9099999999999999\n",
1149
+ "#Topics: 9, CV Score: 0.5105059500845196, PV Score: -11.51304907591031, Alpha: symmetric, Beta: 0.9099999999999999\n",
1150
+ "#Topics: 10, CV Score: 0.5591753304786676, PV Score: -11.5054622260105, Alpha: symmetric, Beta: 0.9099999999999999\n",
1151
+ "#Topics: 11, CV Score: 0.5528861076804188, PV Score: -11.504385112555493, Alpha: symmetric, Beta: 0.9099999999999999\n",
1152
+ "#Topics: 12, CV Score: 0.49367285382840453, PV Score: -11.500990267519892, Alpha: symmetric, Beta: 0.9099999999999999\n",
1153
+ "#Topics: 4, CV Score: 0.46594738594111, PV Score: -11.638161524533613, Alpha: symmetric, Beta: symmetric\n",
1154
+ "#Topics: 5, CV Score: 0.5212817101806027, PV Score: -11.605645781016982, Alpha: symmetric, Beta: symmetric\n",
1155
+ "#Topics: 6, CV Score: 0.44807641083986144, PV Score: -11.582140562178676, Alpha: symmetric, Beta: symmetric\n",
1156
+ "#Topics: 7, CV Score: 0.509705458396568, PV Score: -11.559129856046134, Alpha: symmetric, Beta: symmetric\n",
1157
+ "#Topics: 8, CV Score: 0.45072428914784013, PV Score: -11.521973552776846, Alpha: symmetric, Beta: symmetric\n",
1158
+ "#Topics: 9, CV Score: 0.6163956153172733, PV Score: -11.532030259535352, Alpha: symmetric, Beta: symmetric\n",
1159
+ "#Topics: 10, CV Score: 0.5731211242304707, PV Score: -11.526033204967101, Alpha: symmetric, Beta: symmetric\n",
1160
+ "#Topics: 11, CV Score: 0.6000508615312063, PV Score: -11.49509548343454, Alpha: symmetric, Beta: symmetric\n",
1161
+ "#Topics: 12, CV Score: 0.4536127133023553, PV Score: -11.499639773335504, Alpha: symmetric, Beta: symmetric\n",
1162
+ "#Topics: 4, CV Score: 0.5517238147309061, PV Score: -11.613287330943077, Alpha: asymmetric, Beta: 0.31\n",
1163
+ "#Topics: 5, CV Score: 0.653586278160635, PV Score: -11.544483709714015, Alpha: asymmetric, Beta: 0.31\n",
1164
+ "#Topics: 6, CV Score: 0.5297708553518212, PV Score: -11.565107511094201, Alpha: asymmetric, Beta: 0.31\n",
1165
+ "#Topics: 7, CV Score: 0.4308728891563382, PV Score: -11.482917171438388, Alpha: asymmetric, Beta: 0.31\n",
1166
+ "#Topics: 8, CV Score: 0.5225479879550474, PV Score: -11.452961250714646, Alpha: asymmetric, Beta: 0.31\n",
1167
+ "#Topics: 9, CV Score: 0.6266251392086057, PV Score: -11.436045988851452, Alpha: asymmetric, Beta: 0.31\n",
1168
+ "#Topics: 10, CV Score: 0.6244003133589429, PV Score: -11.420447490550442, Alpha: asymmetric, Beta: 0.31\n"
1169
+ ]
1170
+ },
1171
+ {
1172
+ "ename": "KeyboardInterrupt",
1173
+ "evalue": "",
1174
+ "output_type": "error",
1175
+ "traceback": [
1176
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1177
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
1178
+ "File \u001b[0;32m<timed exec>:10\u001b[0m\n",
1179
+ "Cell \u001b[0;32mIn[23], line 4\u001b[0m, in \u001b[0;36mcompute_coherence_values\u001b[0;34m(corpus, dictionary, k, a, b)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_coherence_values\u001b[39m(corpus, dictionary, k, a, b):\n\u001b[0;32m----> 4\u001b[0m lda_model \u001b[38;5;241m=\u001b[39m \u001b[43mgensim\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodels\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mLdaMulticore\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcorpus\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcorpus\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mid2word\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdictionary\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_topics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m42\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m100\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mpasses\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43malpha\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43ma\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43meta\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 13\u001b[0m coherence_model_lda \u001b[38;5;241m=\u001b[39m CoherenceModel(model\u001b[38;5;241m=\u001b[39mlda_model, texts\u001b[38;5;241m=\u001b[39mheadline, dictionary\u001b[38;5;241m=\u001b[39mdoc_dict, 
coherence\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc_v\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 14\u001b[0m coherence \u001b[38;5;241m=\u001b[39m coherence_model_lda\u001b[38;5;241m.\u001b[39mget_coherence()\n",
1180
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/ldamulticore.py:186\u001b[0m, in \u001b[0;36mLdaMulticore.__init__\u001b[0;34m(self, corpus, num_topics, id2word, workers, chunksize, passes, batch, alpha, eta, decay, offset, eval_every, iterations, gamma_threshold, random_state, minimum_probability, minimum_phi_value, per_word_topics, dtype)\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(alpha, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m alpha \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mauto\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mauto-tuning alpha not implemented in LdaMulticore; use plain LdaModel.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 186\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mLdaMulticore\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 187\u001b[0m \u001b[43m \u001b[49m\u001b[43mcorpus\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcorpus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_topics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_topics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 188\u001b[0m \u001b[43m \u001b[49m\u001b[43mid2word\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mid2word\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunksize\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpasses\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpasses\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43malpha\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43malpha\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meta\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 189\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecay\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moffset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meval_every\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meval_every\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43miterations\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43miterations\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 190\u001b[0m \u001b[43m \u001b[49m\u001b[43mgamma_threshold\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgamma_threshold\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrandom_state\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mminimum_probability\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimum_probability\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 191\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimum_phi_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimum_phi_value\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mper_word_topics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mper_word_topics\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 192\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
1181
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/ldamodel.py:521\u001b[0m, in \u001b[0;36mLdaModel.__init__\u001b[0;34m(self, corpus, num_topics, id2word, distributed, chunksize, passes, update_every, alpha, eta, decay, offset, eval_every, iterations, gamma_threshold, minimum_probability, random_state, ns_conf, minimum_phi_value, per_word_topics, callbacks, dtype)\u001b[0m\n\u001b[1;32m 519\u001b[0m use_numpy \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdispatcher \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 520\u001b[0m start \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 521\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mupdate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcorpus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunks_as_numpy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_numpy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 522\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39madd_lifecycle_event(\n\u001b[1;32m 523\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcreated\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 524\u001b[0m msg\u001b[38;5;241m=\u001b[39m\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrained \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtime\u001b[38;5;241m.\u001b[39mtime()\u001b[38;5;250m \u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;250m \u001b[39mstart\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m.2f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 525\u001b[0m )\n",
1182
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/ldamulticore.py:286\u001b[0m, in \u001b[0;36mLdaMulticore.update\u001b[0;34m(self, corpus, chunks_as_numpy)\u001b[0m\n\u001b[1;32m 283\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlog_perplexity(chunk, total_docs\u001b[38;5;241m=\u001b[39mlencorpus)\n\u001b[1;32m 285\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtraining LDA model using \u001b[39m\u001b[38;5;132;01m%i\u001b[39;00m\u001b[38;5;124m processes\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mworkers)\n\u001b[0;32m--> 286\u001b[0m pool \u001b[38;5;241m=\u001b[39m \u001b[43mPool\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mworkers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mworker_e_step\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mjob_queue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_queue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m pass_ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpasses):\n\u001b[1;32m 288\u001b[0m queue_size, reallen \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m0\u001b[39m], \u001b[38;5;241m0\u001b[39m\n",
1183
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/context.py:119\u001b[0m, in \u001b[0;36mBaseContext.Pool\u001b[0;34m(self, processes, initializer, initargs, maxtasksperchild)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m'''Returns a process pool object'''\u001b[39;00m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpool\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Pool\n\u001b[0;32m--> 119\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mPool\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocesses\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minitializer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minitargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmaxtasksperchild\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 120\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_context\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
1184
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/pool.py:215\u001b[0m, in \u001b[0;36mPool.__init__\u001b[0;34m(self, processes, initializer, initargs, maxtasksperchild, context)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_processes \u001b[38;5;241m=\u001b[39m processes\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 215\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_repopulate_pool\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 217\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m p \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pool:\n",
1185
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/pool.py:306\u001b[0m, in \u001b[0;36mPool._repopulate_pool\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_repopulate_pool\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 306\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_repopulate_pool_static\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_ctx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mProcess\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 307\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_processes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 308\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_pool\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_inqueue\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 309\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_outqueue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_initializer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 310\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_initargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 311\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_maxtasksperchild\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 312\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wrap_exception\u001b[49m\u001b[43m)\u001b[49m\n",
1186
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/pool.py:329\u001b[0m, in \u001b[0;36mPool._repopulate_pool_static\u001b[0;34m(ctx, Process, processes, pool, inqueue, outqueue, initializer, initargs, maxtasksperchild, wrap_exception)\u001b[0m\n\u001b[1;32m 327\u001b[0m w\u001b[38;5;241m.\u001b[39mname \u001b[38;5;241m=\u001b[39m w\u001b[38;5;241m.\u001b[39mname\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mProcess\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mPoolWorker\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 328\u001b[0m w\u001b[38;5;241m.\u001b[39mdaemon \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 329\u001b[0m \u001b[43mw\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstart\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 330\u001b[0m pool\u001b[38;5;241m.\u001b[39mappend(w)\n\u001b[1;32m 331\u001b[0m util\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124madded worker\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
1187
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/process.py:121\u001b[0m, in \u001b[0;36mBaseProcess.start\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _current_process\u001b[38;5;241m.\u001b[39m_config\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdaemon\u001b[39m\u001b[38;5;124m'\u001b[39m), \\\n\u001b[1;32m 119\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdaemonic processes are not allowed to have children\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 120\u001b[0m _cleanup()\n\u001b[0;32m--> 121\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_popen \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_Popen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 122\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sentinel \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_popen\u001b[38;5;241m.\u001b[39msentinel\n\u001b[1;32m 123\u001b[0m \u001b[38;5;66;03m# Avoid a refcycle if the target function holds an indirect\u001b[39;00m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;66;03m# reference to the process object (see bpo-30775)\u001b[39;00m\n",
1188
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/context.py:289\u001b[0m, in \u001b[0;36mSpawnProcess._Popen\u001b[0;34m(process_obj)\u001b[0m\n\u001b[1;32m 286\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_Popen\u001b[39m(process_obj):\n\u001b[1;32m 288\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpopen_spawn_posix\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Popen\n\u001b[0;32m--> 289\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mPopen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocess_obj\u001b[49m\u001b[43m)\u001b[49m\n",
1189
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/popen_spawn_posix.py:32\u001b[0m, in \u001b[0;36mPopen.__init__\u001b[0;34m(self, process_obj)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, process_obj):\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fds \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m---> 32\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mprocess_obj\u001b[49m\u001b[43m)\u001b[49m\n",
1190
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/popen_fork.py:19\u001b[0m, in \u001b[0;36mPopen.__init__\u001b[0;34m(self, process_obj)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfinalizer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m---> 19\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_launch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocess_obj\u001b[49m\u001b[43m)\u001b[49m\n",
1191
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/popen_spawn_posix.py:62\u001b[0m, in \u001b[0;36mPopen._launch\u001b[0;34m(self, process_obj)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msentinel \u001b[38;5;241m=\u001b[39m parent_r\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(parent_w, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwb\u001b[39m\u001b[38;5;124m'\u001b[39m, closefd\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m---> 62\u001b[0m \u001b[43mf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetbuffer\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 64\u001b[0m fds_to_close \u001b[38;5;241m=\u001b[39m []\n",
1192
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
1193
+ ]
1194
+ }
1195
+ ],
1196
+ "source": [
1197
+ "%%time\n",
1198
+ "\n",
1199
+ "import datetime\n",
1200
+ "import numpy as np\n",
1201
+ "from gensim.models import CoherenceModel\n",
1202
+ "\n",
1203
+ "print(datetime.datetime.now())\n",
1204
+ "\n",
1205
+ "for a in alpha:\n",
1206
+ " for b in beta:\n",
1207
+ " for num in num_topics:\n",
1208
+ " cv, pv = compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=num, a=a, b=b) \n",
1209
+ "\n",
1210
+ " model_topics.append(num) \n",
1211
+ " coherence_values.append(cv) \n",
1212
+ " perplexity_values.append(pv)\n",
1213
+ " alpha_result.append(a)\n",
1214
+ " beta_result.append(b)\n",
1215
+ " print(\"#Topics: \" + str(num) + \", CV Score: \" + str(coherence_values[-1]) + \", PV Score: \" + str(perplexity_values[-1]) + \", Alpha: \" + str(alpha_result[-1]) + \", Beta: \" + str(beta_result[-1]))\n",
1216
+ " \n",
1217
+ "print(datetime.datetime.now())"
1218
+ ]
1219
+ },
1220
+ {
1221
+ "cell_type": "markdown",
1222
+ "id": "364ff6d5-e3da-4dde-a2c8-5375fc5d711f",
1223
+ "metadata": {},
1224
+ "source": [
1225
+ "The table below shows the top 20 fine-tuned models with the best combinations of coherence score and perplexity score. It is sorted by coherence score in descending order, since a higher coherence score indicates a better model, and then by perplexity score in ascending order, since a lower perplexity score indicates a better model. While the coherence score evaluates the quality of the topics, the perplexity score evaluates the overall performance of the model in predicting new documents. Usually, the coherence score is the better metric to use if the goal is to obtain topics that are semantically coherent and interpretable. The perplexity score, on the other hand, is the better metric to use if the goal is to build a model that generalises well to new data, in other words, how confident the model is in predicting new data (Sánchez-Aguayo et al., 2022). Ultimately, we aim to strike a balance between the perplexity score and the coherence score when determining our final model."
1226
+ ]
1227
+ },
1228
+ {
1229
+ "cell_type": "code",
1230
+ "execution_count": null,
1231
+ "id": "78a60032-a4d7-44d4-841c-a1bd3740d5dd",
1232
+ "metadata": {},
1233
+ "outputs": [
1234
+ {
1235
+ "ename": "",
1236
+ "evalue": "",
1237
+ "output_type": "error",
1238
+ "traceback": [
1239
+ "\u001b[1;31mFailed to start the Kernel. \n",
1240
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1241
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1242
+ ]
1243
+ }
1244
+ ],
1245
+ "source": [
1246
+ "# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
1247
+ "result = pd.DataFrame(\n",
1248
+ " {'Topics': model_topics,\n",
1249
+ " 'Coherence Score': coherence_values,\n",
1250
+ " 'Perplexity Score': perplexity_values,\n",
1251
+ " 'Alpha': alpha_result,\n",
1252
+ " 'Beta': beta_result\n",
1253
+ " })\n",
1254
+ "result.sort_values(by=['Coherence Score', 'Perplexity Score'], ascending=[False, True]).head(20)"
1255
+ ]
1256
+ },
1257
+ {
1258
+ "cell_type": "code",
1259
+ "execution_count": null,
1260
+ "id": "3461df57-c069-4ad2-80d7-8890dec9438e",
1261
+ "metadata": {},
1262
+ "outputs": [
1263
+ {
1264
+ "ename": "",
1265
+ "evalue": "",
1266
+ "output_type": "error",
1267
+ "traceback": [
1268
+ "\u001b[1;31mFailed to start the Kernel. \n",
1269
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1270
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1271
+ ]
1272
+ }
1273
+ ],
1274
+ "source": [
1275
+ "result.to_csv('data/lda_fine_tuning_result_moderate.csv')"
1276
+ ]
1277
+ },
1278
+ {
1279
+ "cell_type": "code",
1280
+ "execution_count": null,
1281
+ "id": "800e5a4b-7302-42e8-97b0-5b598c1c80ae",
1282
+ "metadata": {
1283
+ "scrolled": true
1284
+ },
1285
+ "outputs": [
1286
+ {
1287
+ "ename": "",
1288
+ "evalue": "",
1289
+ "output_type": "error",
1290
+ "traceback": [
1291
+ "\u001b[1;31mFailed to start the Kernel. \n",
1292
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1293
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1294
+ ]
1295
+ }
1296
+ ],
1297
+ "source": [
1298
+ "# Show graph Topics vs Coherence Score\n",
1299
+ "result.groupby('Alpha').plot(x='Topics', y='Coherence Score', legend = True)"
1300
+ ]
1301
+ },
1302
+ {
1303
+ "cell_type": "code",
1304
+ "execution_count": null,
1305
+ "id": "26996b89-0e7a-4f2d-8cf7-c4a716569bc2",
1306
+ "metadata": {},
1307
+ "outputs": [
1308
+ {
1309
+ "ename": "",
1310
+ "evalue": "",
1311
+ "output_type": "error",
1312
+ "traceback": [
1313
+ "\u001b[1;31mFailed to start the Kernel. \n",
1314
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1315
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1316
+ ]
1317
+ }
1318
+ ],
1319
+ "source": [
1320
+ "# Show graph Topics vs Coherence Score\n",
1321
+ "\n",
1322
+ "plt.plot(model_topics, coherence_values)\n",
1323
+ "plt.xlabel(\"Num Topics\")\n",
1324
+ "plt.ylabel(\"Coherence Score\")\n",
1325
+ "plt.legend((\"Coherence Score\"), loc='best')\n",
1326
+ "plt.show()"
1327
+ ]
1328
+ },
1329
+ {
1330
+ "cell_type": "code",
1331
+ "execution_count": null,
1332
+ "id": "91d2f4c1-de77-44b6-b41b-fcc9a07233e8",
1333
+ "metadata": {},
1334
+ "outputs": [
1335
+ {
1336
+ "ename": "",
1337
+ "evalue": "",
1338
+ "output_type": "error",
1339
+ "traceback": [
1340
+ "\u001b[1;31mFailed to start the Kernel. \n",
1341
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1342
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1343
+ ]
1344
+ }
1345
+ ],
1346
+ "source": [
1347
+ "# Show graph Topics vs Perplexity Score\n",
1348
+ "\n",
1349
+ "plt.plot(model_topics, perplexity_values)\n",
1350
+ "plt.xlabel(\"Num Topics\")\n",
1351
+ "plt.ylabel(\"Perplexity score\")\n",
1352
+ "plt.legend((\"perplexity_values\"), loc='best')\n",
1353
+ "plt.show()"
1354
+ ]
1355
+ },
1356
+ {
1357
+ "cell_type": "markdown",
1358
+ "id": "cdc3ddd2-f743-4e5b-b6c6-2656e0b77aec",
1359
+ "metadata": {},
1360
+ "source": [
1361
+ "## Final Model"
1362
+ ]
1363
+ },
1364
+ {
1365
+ "cell_type": "code",
1366
+ "execution_count": null,
1367
+ "id": "490734ed-077c-4fb0-930c-0b42f4f63c94",
1368
+ "metadata": {},
1369
+ "outputs": [
1370
+ {
1371
+ "ename": "",
1372
+ "evalue": "",
1373
+ "output_type": "error",
1374
+ "traceback": [
1375
+ "\u001b[1;31mFailed to start the Kernel. \n",
1376
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1377
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1378
+ ]
1379
+ }
1380
+ ],
1381
+ "source": [
1382
+ "# realised that there may be some overlaps for more than 5 topics, but below 5 topics results in low differentiation and high ambiguity among the topics.\n",
1383
+ "# LDA is not suitable for this dataset\n",
1384
+ "k = 9\n",
1385
+ "a = 'symmetric'\n",
1386
+ "# a = 0.31\n",
1387
+ "# b = 0.31\n",
1388
+ "b = 'symmetric'\n",
1389
+ "\n",
1390
+ "\n",
1391
+ "final_model = gensim.models.LdaMulticore(corpus=docs_vecs,\n",
1392
+ " id2word=doc_dict,\n",
1393
+ " num_topics=k, \n",
1394
+ " random_state=42,\n",
1395
+ " chunksize=100,\n",
1396
+ " passes=10,\n",
1397
+ " alpha=a,\n",
1398
+ " eta=b)"
1399
+ ]
1400
+ },
1401
+ {
1402
+ "cell_type": "code",
1403
+ "execution_count": null,
1404
+ "id": "afe8abf0-2d12-414e-92be-a655865addb1",
1405
+ "metadata": {
1406
+ "tags": []
1407
+ },
1408
+ "outputs": [
1409
+ {
1410
+ "ename": "",
1411
+ "evalue": "",
1412
+ "output_type": "error",
1413
+ "traceback": [
1414
+ "\u001b[1;31mFailed to start the Kernel. \n",
1415
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1416
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1417
+ ]
1418
+ }
1419
+ ],
1420
+ "source": [
1421
+ "compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=k, a=a, b=b) "
1422
+ ]
1423
+ },
1424
+ {
1425
+ "cell_type": "code",
1426
+ "execution_count": null,
1427
+ "id": "8430a827-6dbb-4737-8ccc-78ed17a01234",
1428
+ "metadata": {
1429
+ "tags": []
1430
+ },
1431
+ "outputs": [
1432
+ {
1433
+ "ename": "",
1434
+ "evalue": "",
1435
+ "output_type": "error",
1436
+ "traceback": [
1437
+ "\u001b[1;31mFailed to start the Kernel. \n",
1438
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1439
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1440
+ ]
1441
+ }
1442
+ ],
1443
+ "source": [
1444
+ "#Set up the environment to display the graphical outputs\n",
1445
+ "# feed the LDA model into the pyLDAvis instance\n",
1446
+ "pyLDAvis.enable_notebook()\n",
1447
+ "visual= gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
1448
+ "\n",
1449
+ "#Save the output to the html file\n",
1450
+ "pyLDAvis.save_html(visual, \"data/topic_viz12_mod_training.html\")"
1451
+ ]
1452
+ },
1453
+ {
1454
+ "cell_type": "code",
1455
+ "execution_count": null,
1456
+ "id": "5e30d71a-a3c7-40c7-94c0-7eea1bedc887",
1457
+ "metadata": {
1458
+ "tags": []
1459
+ },
1460
+ "outputs": [
1461
+ {
1462
+ "ename": "",
1463
+ "evalue": "",
1464
+ "output_type": "error",
1465
+ "traceback": [
1466
+ "\u001b[1;31mFailed to start the Kernel. \n",
1467
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1468
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1469
+ ]
1470
+ }
1471
+ ],
1472
+ "source": [
1473
+ "final_model.print_topics(num_words=30)"
1474
+ ]
1475
+ },
1476
+ {
1477
+ "cell_type": "markdown",
1478
+ "id": "607d2cfd-b3ca-4f99-9e01-d320ca98a2a0",
1479
+ "metadata": {},
1480
+ "source": [
1481
+ "# Save the final model "
1482
+ ]
1483
+ },
1484
+ {
1485
+ "cell_type": "code",
1486
+ "execution_count": null,
1487
+ "id": "84eb2746-173a-4283-bca5-681f77548698",
1488
+ "metadata": {},
1489
+ "outputs": [
1490
+ {
1491
+ "ename": "",
1492
+ "evalue": "",
1493
+ "output_type": "error",
1494
+ "traceback": [
1495
+ "\u001b[1;31mFailed to start the Kernel. \n",
1496
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1497
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1498
+ ]
1499
+ }
1500
+ ],
1501
+ "source": [
1502
+ "# Save a model to disk, or reload a pre-trained model\n",
1503
+ "# naming convention: final_model_topic_alpha_eta\n",
1504
+ "final_model.save(\"models/final_model_9_sym_sym\")"
1505
+ ]
1506
+ },
1507
+ {
1508
+ "cell_type": "markdown",
1509
+ "id": "a7b6e4d9-a577-4dfb-ba6e-fc74365880f4",
1510
+ "metadata": {},
1511
+ "source": [
1512
+ "# Find dominant topic(s) for each news article"
1513
+ ]
1514
+ },
1515
+ {
1516
+ "cell_type": "markdown",
1517
+ "id": "0eeecbcb-358c-44f9-8463-75cdfac0ba90",
1518
+ "metadata": {},
1519
+ "source": [
1520
+ "Attach the dominant topics back to the news dataset for classification purposes."
1521
+ ]
1522
+ },
1523
+ {
1524
+ "cell_type": "markdown",
1525
+ "id": "8bebb269-dbb0-4c46-925c-38de0f2bcfd7",
1526
+ "metadata": {},
1527
+ "source": [
1528
+ "Made use of gensim lda's own function: https://radimrehurek.com/gensim/models/ldamodel.html"
1529
+ ]
1530
+ },
1531
+ {
1532
+ "cell_type": "code",
1533
+ "execution_count": null,
1534
+ "id": "f585ff52-b60d-4d70-ae64-a7c23d2cc6c1",
1535
+ "metadata": {},
1536
+ "outputs": [
1537
+ {
1538
+ "ename": "",
1539
+ "evalue": "",
1540
+ "output_type": "error",
1541
+ "traceback": [
1542
+ "\u001b[1;31mFailed to start the Kernel. \n",
1543
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1544
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1545
+ ]
1546
+ }
1547
+ ],
1548
+ "source": [
1549
+ "import warnings\n",
1550
+ "warnings.filterwarnings('ignore')\n",
1551
+ "\n",
1552
+ "def format_topics_sentences(ldamodel, corpus, data):\n",
1553
+ " # Preallocate memory for the DataFrame\n",
1554
+ " num_docs = len(corpus)\n",
1555
+ " sent_topics = {'Dominant_Topic': [0] * num_docs, 'Perc_Contribution': [0.0] * num_docs, 'Topic_Distribution': [()] * num_docs}\n",
1556
+ " \n",
1557
+ " # Get main topic in each document\n",
1558
+ " for i, row in enumerate(ldamodel[corpus]):\n",
1559
+ " row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
1560
+ " if row:\n",
1561
+ " # Get the dominant topic, its percentage contribution and the full sorted topic distribution for each document\n",
1562
+ " dominant_topic, perc_contribution = row[0]\n",
1563
+ " topic_distribution = row\n",
1564
+ " sent_topics['Dominant_Topic'][i] = int(dominant_topic)\n",
1565
+ " sent_topics['Perc_Contribution'][i] = round(perc_contribution, 4)\n",
1566
+ " sent_topics['Topic_Distribution'][i] = topic_distribution\n",
1567
+ "\n",
1568
+ " # Create the DataFrame\n",
1569
+ " sent_topics_df = pd.DataFrame(sent_topics)\n",
1570
+ " sent_topics_df['Text'] = data\n",
1571
+ "\n",
1572
+ " return sent_topics_df"
1573
+ ]
1574
+ },
1575
+ {
1576
+ "cell_type": "code",
1577
+ "execution_count": null,
1578
+ "id": "24d3ff60-035e-4133-9ffd-88cce5cdccb1",
1579
+ "metadata": {},
1580
+ "outputs": [
1581
+ {
1582
+ "ename": "",
1583
+ "evalue": "",
1584
+ "output_type": "error",
1585
+ "traceback": [
1586
+ "\u001b[1;31mFailed to start the Kernel. \n",
1587
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1588
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1589
+ ]
1590
+ }
1591
+ ],
1592
+ "source": [
1593
+ "df_topic_sents_keywords = format_topics_sentences(ldamodel=final_model, corpus=docs_vecs, data=cleaned.Headline_Details)"
1594
+ ]
1595
+ },
1596
+ {
1597
+ "cell_type": "code",
1598
+ "execution_count": null,
1599
+ "id": "c88b088b",
1600
+ "metadata": {},
1601
+ "outputs": [
1602
+ {
1603
+ "ename": "",
1604
+ "evalue": "",
1605
+ "output_type": "error",
1606
+ "traceback": [
1607
+ "\u001b[1;31mFailed to start the Kernel. \n",
1608
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1609
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1610
+ ]
1611
+ }
1612
+ ],
1613
+ "source": [
1614
+ "# Format\n",
1615
+ "df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
1616
+ "df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Topic_Distribution', 'Text']\n",
1617
+ "\n",
1618
+ "# Show\n",
1619
+ "df_dominant_topic.head(10)"
1620
+ ]
1621
+ },
1622
+ {
1623
+ "cell_type": "markdown",
1624
+ "id": "560da382-aa86-4df1-8b85-56b057a27cd4",
1625
+ "metadata": {},
1626
+ "source": [
1627
+ "# Result Analysis"
1628
+ ]
1629
+ },
1630
+ {
1631
+ "cell_type": "code",
1632
+ "execution_count": null,
1633
+ "id": "4fe6b40b-6922-4de3-8d9e-dac7474b6303",
1634
+ "metadata": {},
1635
+ "outputs": [
1636
+ {
1637
+ "ename": "",
1638
+ "evalue": "",
1639
+ "output_type": "error",
1640
+ "traceback": [
1641
+ "\u001b[1;31mFailed to start the Kernel. \n",
1642
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1643
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1644
+ ]
1645
+ }
1646
+ ],
1647
+ "source": [
1648
+ "df_dominant_topic[\"Dominant_Topic\"].value_counts()"
1649
+ ]
1650
+ },
1651
+ {
1652
+ "cell_type": "code",
1653
+ "execution_count": null,
1654
+ "id": "b9917340-31cf-48af-871f-b481128fdf22",
1655
+ "metadata": {},
1656
+ "outputs": [
1657
+ {
1658
+ "ename": "",
1659
+ "evalue": "",
1660
+ "output_type": "error",
1661
+ "traceback": [
1662
+ "\u001b[1;31mFailed to start the Kernel. \n",
1663
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1664
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1665
+ ]
1666
+ }
1667
+ ],
1668
+ "source": [
1669
+ "import matplotlib.pyplot as plt\n",
1670
+ "\n",
1671
+ "# Get value counts of each topic\n",
1672
+ "topic_counts = df_dominant_topic[\"Dominant_Topic\"].value_counts()\n",
1673
+ "\n",
1674
+ "# Create a bar plot\n",
1675
+ "plt.figure(figsize=(8, 6))\n",
1676
+ "topic_counts.plot(kind=\"bar\", color=\"skyblue\")\n",
1677
+ "\n",
1678
+ "# Add labels to the bars\n",
1679
+ "for i, count in enumerate(topic_counts):\n",
1680
+ " plt.text(i, count, str(count), ha=\"center\", va=\"bottom\")\n",
1681
+ "\n",
1682
+ "# Add labels and title\n",
1683
+ "plt.xlabel(\"Topics\")\n",
1684
+ "plt.ylabel(\"Number of News\")\n",
1685
+ "plt.title(\"Topic Distribution\")\n",
1686
+ "\n",
1687
+ "# Show the plot\n",
1688
+ "plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
1689
+ "plt.tight_layout()\n",
1690
+ "plt.show()\n"
1691
+ ]
1692
+ },
1693
+ {
1694
+ "cell_type": "code",
1695
+ "execution_count": null,
1696
+ "id": "70e7d652-4421-45e0-93f8-aaa51c186422",
1697
+ "metadata": {},
1698
+ "outputs": [
1699
+ {
1700
+ "ename": "",
1701
+ "evalue": "",
1702
+ "output_type": "error",
1703
+ "traceback": [
1704
+ "\u001b[1;31mFailed to start the Kernel. \n",
1705
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1706
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1707
+ ]
1708
+ }
1709
+ ],
1710
+ "source": [
1711
+ "df_dominant_topic.shape"
1712
+ ]
1713
+ },
1714
+ {
1715
+ "cell_type": "code",
1716
+ "execution_count": null,
1717
+ "id": "69932a6e-7159-46b1-98f8-827d99b95c54",
1718
+ "metadata": {},
1719
+ "outputs": [
1720
+ {
1721
+ "ename": "",
1722
+ "evalue": "",
1723
+ "output_type": "error",
1724
+ "traceback": [
1725
+ "\u001b[1;31mFailed to start the Kernel. \n",
1726
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1727
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1728
+ ]
1729
+ }
1730
+ ],
1731
+ "source": [
1732
+ "# Sample 100 rows, can change the random_state for different samples\n",
1733
+ "sampled_data = df_dominant_topic.sample(n=100, random_state=42) \n",
1734
+ "sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
1735
+ "sampled_df.to_csv('data/sample_moderate.csv')"
1736
+ ]
1737
+ }
1738
+ ],
1739
+ "metadata": {
1740
+ "kernelspec": {
1741
+ "display_name": "Python 3",
1742
+ "language": "python",
1743
+ "name": "python3"
1744
+ },
1745
+ "language_info": {
1746
+ "codemirror_mode": {
1747
+ "name": "ipython",
1748
+ "version": 3
1749
+ },
1750
+ "file_extension": ".py",
1751
+ "mimetype": "text/x-python",
1752
+ "name": "python",
1753
+ "nbconvert_exporter": "python",
1754
+ "pygments_lexer": "ipython3",
1755
+ "version": "3.12.4"
1756
+ }
1757
+ },
1758
+ "nbformat": 4,
1759
+ "nbformat_minor": 5
1760
+ }
notebooks/09_topic_modelling_severe.ipynb ADDED
@@ -0,0 +1,1859 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "46322fb5-5918-4b70-9689-9e0781439ac4",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents\n",
14
+ "loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env\n"
15
+ ]
16
+ },
17
+ {
18
+ "data": {
19
+ "text/plain": [
20
+ "True"
21
+ ]
22
+ },
23
+ "execution_count": 1,
24
+ "metadata": {},
25
+ "output_type": "execute_result"
26
+ }
27
+ ],
28
+ "source": [
29
+ "%load_ext autoreload\n",
30
+ "%autoreload 2\n",
31
+ "\n",
32
+ "import os\n",
33
+ "import sys\n",
34
+ "from pathlib import Path\n",
35
+ "\n",
36
+ "workding_dir = str(Path.cwd().parent)\n",
37
+ "os.chdir(workding_dir)\n",
38
+ "sys.path.append(workding_dir)\n",
39
+ "print(\"workding dir:\", workding_dir)\n",
40
+ "\n",
41
+ "from dotenv import find_dotenv, load_dotenv\n",
42
+ "\n",
43
+ "found_dotenv = find_dotenv(\".env\")\n",
44
+ "\n",
45
+ "if len(found_dotenv) == 0:\n",
46
+ " found_dotenv = find_dotenv(\".env.example\")\n",
47
+ "print(f\"loading env vars from: {found_dotenv}\")\n",
48
+ "load_dotenv(found_dotenv, override=True)"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 2,
54
+ "id": "daf1e3d1-75ac-4299-8bed-2f413a49f9a6",
55
+ "metadata": {
56
+ "tags": []
57
+ },
58
+ "outputs": [],
59
+ "source": [
60
+ "import nltk\n",
61
+ "from nltk.tokenize import sent_tokenize\n",
62
+ "from nltk.tokenize import word_tokenize\n",
63
+ "\n",
64
+ "import gensim\n",
65
+ "from gensim import corpora\n",
66
+ "from gensim import similarities\n",
67
+ "from gensim import models\n",
68
+ "from gensim.models import CoherenceModel\n",
69
+ "\n",
70
+ "# from wordcloud import WordCloud, ImageColorGenerator\n",
71
+ "import matplotlib.pyplot as plt\n",
72
+ "import seaborn as sns\n",
73
+ "import pandas as pd\n",
74
+ "import re\n",
75
+ "import os\n",
76
+ "import datetime\n",
77
+ "\n",
78
+ "import warnings\n",
79
+ "warnings.filterwarnings('ignore')\n",
80
+ "\n",
81
+ "from pprint import pprint\n",
82
+ "import pyLDAvis\n",
83
+ "import pyLDAvis.gensim_models as gensimvis"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": 3,
89
+ "id": "c673c907-e1d8-4d64-9a73-c15c15b78e7f",
90
+ "metadata": {},
91
+ "outputs": [
92
+ {
93
+ "name": "stdout",
94
+ "output_type": "stream",
95
+ "text": [
96
+ "2024-06-30 15:39:16.255404\n"
97
+ ]
98
+ }
99
+ ],
100
+ "source": [
101
+ "print(datetime.datetime.now())"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "markdown",
106
+ "id": "49e6de6b-71bd-4948-8827-52601406058f",
107
+ "metadata": {},
108
+ "source": [
109
+ "# Import the data with full news content"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 4,
115
+ "id": "49222182-7811-4fa6-8c0a-21d3a546863e",
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "df = pd.read_parquet('data/processed_data2.parquet')"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 5,
125
+ "id": "3fb59a30",
126
+ "metadata": {},
127
+ "outputs": [
128
+ {
129
+ "data": {
130
+ "text/html": [
131
+ "<div>\n",
132
+ "<style scoped>\n",
133
+ " .dataframe tbody tr th:only-of-type {\n",
134
+ " vertical-align: middle;\n",
135
+ " }\n",
136
+ "\n",
137
+ " .dataframe tbody tr th {\n",
138
+ " vertical-align: top;\n",
139
+ " }\n",
140
+ "\n",
141
+ " .dataframe thead th {\n",
142
+ " text-align: right;\n",
143
+ " }\n",
144
+ "</style>\n",
145
+ "<table border=\"1\" class=\"dataframe\">\n",
146
+ " <thead>\n",
147
+ " <tr style=\"text-align: right;\">\n",
148
+ " <th></th>\n",
149
+ " <th>id</th>\n",
150
+ " <th>Headline</th>\n",
151
+ " <th>Details</th>\n",
152
+ " <th>Severity</th>\n",
153
+ " <th>Category</th>\n",
154
+ " <th>Region</th>\n",
155
+ " <th>Datetime</th>\n",
156
+ " <th>Year</th>\n",
157
+ " <th>lat</th>\n",
158
+ " <th>lon</th>\n",
159
+ " <th>...</th>\n",
160
+ " <th>if_labeled</th>\n",
161
+ " <th>Month</th>\n",
162
+ " <th>Week</th>\n",
163
+ " <th>Headline_Details</th>\n",
164
+ " <th>url</th>\n",
165
+ " <th>title</th>\n",
166
+ " <th>content</th>\n",
167
+ " <th>cleaned_content</th>\n",
168
+ " <th>binary_content</th>\n",
169
+ " <th>word_count</th>\n",
170
+ " </tr>\n",
171
+ " </thead>\n",
172
+ " <tbody>\n",
173
+ " <tr>\n",
174
+ " <th>0</th>\n",
175
+ " <td>1</td>\n",
176
+ " <td>Grasberg Mine- Grasberg mine workers extend st...</td>\n",
177
+ " <td>Media sources indicate that workers at the Gra...</td>\n",
178
+ " <td>Moderate</td>\n",
179
+ " <td>Mine Workers Strike</td>\n",
180
+ " <td>Indonesia</td>\n",
181
+ " <td>28/5/17 17:08</td>\n",
182
+ " <td>2017.0</td>\n",
183
+ " <td>-4.05608</td>\n",
184
+ " <td>137.11302</td>\n",
185
+ " <td>...</td>\n",
186
+ " <td>False</td>\n",
187
+ " <td>5.0</td>\n",
188
+ " <td>21.0</td>\n",
189
+ " <td>Grasberg Mine- Grasberg mine workers extend st...</td>\n",
190
+ " <td>https://news.google.com/rss/articles/CBMiZ2h0d...</td>\n",
191
+ " <td>Freeport Indonesia mine workers extend strike ...</td>\n",
192
+ " <td>Trucks are seen on a road in the Grasberg copp...</td>\n",
193
+ " <td>[truck, be, see, on, road, in, grasberg, coppe...</td>\n",
194
+ " <td>[adkerson_jakarta_try, agreement_freeport_indo...</td>\n",
195
+ " <td>53</td>\n",
196
+ " </tr>\n",
197
+ " <tr>\n",
198
+ " <th>1</th>\n",
199
+ " <td>3</td>\n",
200
+ " <td>Shanghai port congestion impacts terminals in ...</td>\n",
201
+ " <td>The persisting port congestion at Shanghai’s Y...</td>\n",
202
+ " <td>Minor</td>\n",
203
+ " <td>Port Congestion</td>\n",
204
+ " <td>China</td>\n",
205
+ " <td>27/4/17 9:16</td>\n",
206
+ " <td>2017.0</td>\n",
207
+ " <td>29.52000</td>\n",
208
+ " <td>121.33190</td>\n",
209
+ " <td>...</td>\n",
210
+ " <td>False</td>\n",
211
+ " <td>4.0</td>\n",
212
+ " <td>17.0</td>\n",
213
+ " <td>Shanghai port congestion impacts terminals in ...</td>\n",
214
+ " <td>https://news.google.com/rss/articles/CBMiVWh0d...</td>\n",
215
+ " <td>Typhoon Muifa to shut China ports for second t...</td>\n",
216
+ " <td>By Sam Whelan 13/09/2022\\n\\nAnother typhoon ha...</td>\n",
217
+ " <td>[by, sam, whelan, typhoon, have, prompt, port,...</td>\n",
218
+ " <td>[additional_ripple_effect, avoid_path_typhoon,...</td>\n",
219
+ " <td>44</td>\n",
220
+ " </tr>\n",
221
+ " <tr>\n",
222
+ " <th>2</th>\n",
223
+ " <td>5</td>\n",
224
+ " <td>UPDATE - Indonesia: Police confirm two explosi...</td>\n",
225
+ " <td>According to local police in Jakarta, two expl...</td>\n",
226
+ " <td>Extreme</td>\n",
227
+ " <td>Bombing, Police Operations</td>\n",
228
+ " <td>Indonesia</td>\n",
229
+ " <td>24/5/17 16:20</td>\n",
230
+ " <td>2017.0</td>\n",
231
+ " <td>NaN</td>\n",
232
+ " <td>NaN</td>\n",
233
+ " <td>...</td>\n",
234
+ " <td>True</td>\n",
235
+ " <td>5.0</td>\n",
236
+ " <td>21.0</td>\n",
237
+ " <td>UPDATE - Indonesia: Police confirm two explosi...</td>\n",
238
+ " <td>https://news.google.com/rss/articles/CBMiZWh0d...</td>\n",
239
+ " <td>Jakarta Police Receive 2 More Reports on Coldp...</td>\n",
240
+ " <td>TEMPO.CO, Jakarta - South Jakarta Metro Police...</td>\n",
241
+ " <td>[jakarta, south, jakarta, metro, police, recei...</td>\n",
242
+ " <td>[actress_accord, available_day_concert, click_...</td>\n",
243
+ " <td>24</td>\n",
244
+ " </tr>\n",
245
+ " <tr>\n",
246
+ " <th>3</th>\n",
247
+ " <td>6</td>\n",
248
+ " <td>UPDATE - Indonesia: Severe winds damage infras...</td>\n",
249
+ " <td>Severe winds have downed billboards and trees ...</td>\n",
250
+ " <td>Moderate</td>\n",
251
+ " <td>Roadway Closure / Disruption, Flooding, Severe...</td>\n",
252
+ " <td>Indonesia</td>\n",
253
+ " <td>19/4/17 9:10</td>\n",
254
+ " <td>2017.0</td>\n",
255
+ " <td>-6.91264</td>\n",
256
+ " <td>107.65700</td>\n",
257
+ " <td>...</td>\n",
258
+ " <td>True</td>\n",
259
+ " <td>4.0</td>\n",
260
+ " <td>16.0</td>\n",
261
+ " <td>UPDATE - Indonesia: Severe winds damage infras...</td>\n",
262
+ " <td>https://news.google.com/rss/articles/CBMiSWh0d...</td>\n",
263
+ " <td>Indonesia hit by some of strongest winds recorded</td>\n",
264
+ " <td>A man stands near damaged houses following a t...</td>\n",
265
+ " <td>[man, stand, near, damage, house, follow, torn...</td>\n",
266
+ " <td>[bbc_indonesia, climatologist_government_resea...</td>\n",
267
+ " <td>28</td>\n",
268
+ " </tr>\n",
269
+ " <tr>\n",
270
+ " <th>4</th>\n",
271
+ " <td>14</td>\n",
272
+ " <td>2 miles E of Chesterfield - A tornado has touc...</td>\n",
273
+ " <td>Government sources are reporting a tornado has...</td>\n",
274
+ " <td>Minor</td>\n",
275
+ " <td>Tornado</td>\n",
276
+ " <td>United States</td>\n",
277
+ " <td>17/9/18 19:55</td>\n",
278
+ " <td>2018.0</td>\n",
279
+ " <td>37.51000</td>\n",
280
+ " <td>-77.61000</td>\n",
281
+ " <td>...</td>\n",
282
+ " <td>True</td>\n",
283
+ " <td>9.0</td>\n",
284
+ " <td>38.0</td>\n",
285
+ " <td>2 miles E of Chesterfield - A tornado has touc...</td>\n",
286
+ " <td>https://news.google.com/rss/articles/CBMigAFod...</td>\n",
287
+ " <td>UPDATE: Number of homes without power down to ...</td>\n",
288
+ " <td>More than 90,000 homes and businesses across t...</td>\n",
289
+ " <td>[more, than, home, business, across, richmond,...</td>\n",
290
+ " <td>[advise_seek_alternate, affect_richmond, alter...</td>\n",
291
+ " <td>134</td>\n",
292
+ " </tr>\n",
293
+ " </tbody>\n",
294
+ "</table>\n",
295
+ "<p>5 rows × 23 columns</p>\n",
296
+ "</div>"
297
+ ],
298
+ "text/plain": [
299
+ " id Headline \\\n",
300
+ "0 1 Grasberg Mine- Grasberg mine workers extend st... \n",
301
+ "1 3 Shanghai port congestion impacts terminals in ... \n",
302
+ "2 5 UPDATE - Indonesia: Police confirm two explosi... \n",
303
+ "3 6 UPDATE - Indonesia: Severe winds damage infras... \n",
304
+ "4 14 2 miles E of Chesterfield - A tornado has touc... \n",
305
+ "\n",
306
+ " Details Severity \\\n",
307
+ "0 Media sources indicate that workers at the Gra... Moderate \n",
308
+ "1 The persisting port congestion at Shanghai’s Y... Minor \n",
309
+ "2 According to local police in Jakarta, two expl... Extreme \n",
310
+ "3 Severe winds have downed billboards and trees ... Moderate \n",
311
+ "4 Government sources are reporting a tornado has... Minor \n",
312
+ "\n",
313
+ " Category Region \\\n",
314
+ "0 Mine Workers Strike Indonesia \n",
315
+ "1 Port Congestion China \n",
316
+ "2 Bombing, Police Operations Indonesia \n",
317
+ "3 Roadway Closure / Disruption, Flooding, Severe... Indonesia \n",
318
+ "4 Tornado United States \n",
319
+ "\n",
320
+ " Datetime Year lat lon ... if_labeled Month Week \\\n",
321
+ "0 28/5/17 17:08 2017.0 -4.05608 137.11302 ... False 5.0 21.0 \n",
322
+ "1 27/4/17 9:16 2017.0 29.52000 121.33190 ... False 4.0 17.0 \n",
323
+ "2 24/5/17 16:20 2017.0 NaN NaN ... True 5.0 21.0 \n",
324
+ "3 19/4/17 9:10 2017.0 -6.91264 107.65700 ... True 4.0 16.0 \n",
325
+ "4 17/9/18 19:55 2018.0 37.51000 -77.61000 ... True 9.0 38.0 \n",
326
+ "\n",
327
+ " Headline_Details \\\n",
328
+ "0 Grasberg Mine- Grasberg mine workers extend st... \n",
329
+ "1 Shanghai port congestion impacts terminals in ... \n",
330
+ "2 UPDATE - Indonesia: Police confirm two explosi... \n",
331
+ "3 UPDATE - Indonesia: Severe winds damage infras... \n",
332
+ "4 2 miles E of Chesterfield - A tornado has touc... \n",
333
+ "\n",
334
+ " url \\\n",
335
+ "0 https://news.google.com/rss/articles/CBMiZ2h0d... \n",
336
+ "1 https://news.google.com/rss/articles/CBMiVWh0d... \n",
337
+ "2 https://news.google.com/rss/articles/CBMiZWh0d... \n",
338
+ "3 https://news.google.com/rss/articles/CBMiSWh0d... \n",
339
+ "4 https://news.google.com/rss/articles/CBMigAFod... \n",
340
+ "\n",
341
+ " title \\\n",
342
+ "0 Freeport Indonesia mine workers extend strike ... \n",
343
+ "1 Typhoon Muifa to shut China ports for second t... \n",
344
+ "2 Jakarta Police Receive 2 More Reports on Coldp... \n",
345
+ "3 Indonesia hit by some of strongest winds recorded \n",
346
+ "4 UPDATE: Number of homes without power down to ... \n",
347
+ "\n",
348
+ " content \\\n",
349
+ "0 Trucks are seen on a road in the Grasberg copp... \n",
350
+ "1 By Sam Whelan 13/09/2022\\n\\nAnother typhoon ha... \n",
351
+ "2 TEMPO.CO, Jakarta - South Jakarta Metro Police... \n",
352
+ "3 A man stands near damaged houses following a t... \n",
353
+ "4 More than 90,000 homes and businesses across t... \n",
354
+ "\n",
355
+ " cleaned_content \\\n",
356
+ "0 [truck, be, see, on, road, in, grasberg, coppe... \n",
357
+ "1 [by, sam, whelan, typhoon, have, prompt, port,... \n",
358
+ "2 [jakarta, south, jakarta, metro, police, recei... \n",
359
+ "3 [man, stand, near, damage, house, follow, torn... \n",
360
+ "4 [more, than, home, business, across, richmond,... \n",
361
+ "\n",
362
+ " binary_content word_count \n",
363
+ "0 [adkerson_jakarta_try, agreement_freeport_indo... 53 \n",
364
+ "1 [additional_ripple_effect, avoid_path_typhoon,... 44 \n",
365
+ "2 [actress_accord, available_day_concert, click_... 24 \n",
366
+ "3 [bbc_indonesia, climatologist_government_resea... 28 \n",
367
+ "4 [advise_seek_alternate, affect_richmond, alter... 134 \n",
368
+ "\n",
369
+ "[5 rows x 23 columns]"
370
+ ]
371
+ },
372
+ "execution_count": 5,
373
+ "metadata": {},
374
+ "output_type": "execute_result"
375
+ }
376
+ ],
377
+ "source": [
378
+ "df.head()"
379
+ ]
380
+ },
381
+ {
382
+ "cell_type": "code",
383
+ "execution_count": 6,
384
+ "id": "09113e88-66cc-414c-a953-da04db83c4ae",
385
+ "metadata": {},
386
+ "outputs": [
387
+ {
388
+ "data": {
389
+ "text/plain": [
390
+ "(3681, 23)"
391
+ ]
392
+ },
393
+ "execution_count": 6,
394
+ "metadata": {},
395
+ "output_type": "execute_result"
396
+ }
397
+ ],
398
+ "source": [
399
+ "df.shape"
400
+ ]
401
+ },
402
+ {
403
+ "cell_type": "markdown",
404
+ "id": "037e74fc-bbcd-43e3-8346-799920cca8d8",
405
+ "metadata": {},
406
+ "source": [
407
+ "# Vectorisation"
408
+ ]
409
+ },
410
+ {
411
+ "cell_type": "markdown",
412
+ "id": "d67cef3a-59fb-4dd8-adc8-2cf288b90728",
413
+ "metadata": {},
414
+ "source": [
415
+ "NLP vectorization refers to the process of converting text data into numerical vectors that machine learning algorithms can understand and process. \n",
416
+ "\n",
417
+ "Bag-of-Words (BoW) is used here; it represents text as a collection of unique words along with their frequencies. Each word is assigned an index, and the vector contains the count of each word present in the document."
418
+ ]
419
+ },
420
+ {
421
+ "cell_type": "code",
422
+ "execution_count": 7,
423
+ "id": "c95b7b8a-9767-469d-812d-c9a9d9fee0e9",
424
+ "metadata": {},
425
+ "outputs": [],
426
+ "source": [
427
+ "df_copy = df.copy()"
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "code",
432
+ "execution_count": 8,
433
+ "id": "dfb2001e-04c1-49dc-b423-a64ea47af5a9",
434
+ "metadata": {},
435
+ "outputs": [],
436
+ "source": [
437
+ "# choose only the extreme and severe cases for modelling\n",
438
+ "cleaned = df_copy[df_copy['Severity'].isin(['Extreme', 'Severe'])]\n",
439
+ "cleaned.reset_index(drop=True, inplace=True)"
440
+ ]
441
+ },
442
+ {
443
+ "cell_type": "code",
444
+ "execution_count": 9,
445
+ "id": "de71c523-a59e-44b2-aa96-5f17d872c9c6",
446
+ "metadata": {},
447
+ "outputs": [],
448
+ "source": [
449
+ "headline = cleaned.binary_content"
450
+ ]
451
+ },
452
+ {
453
+ "cell_type": "code",
454
+ "execution_count": 10,
455
+ "id": "5b1e34e1",
456
+ "metadata": {},
457
+ "outputs": [
458
+ {
459
+ "data": {
460
+ "text/plain": [
461
+ "array(['number_container', 'accord_detective_llamas', 'anyone_talk_crime',\n",
462
+ " 'arizmendi_girlfriend_become', 'auto_theft_robbery',\n",
463
+ " 'clothing_makeup_shoe', 'clue_loot', 'decode_container_stack',\n",
464
+ " 'detective_chavez', 'detective_put', 'electric_bicycle',\n",
465
+ " 'empire_farther_south', 'freight_train_repeat',\n",
466
+ " 'google_placard_lock', 'hard_drive_tablet', 'homicide_drug_gang',\n",
467
+ " 'inside_container_secure', 'llama_straight_tell',\n",
468
+ " 'llama_work_connie', 'metal_lock_size', 'mile_east',\n",
469
+ " 'motel_room_storage', 'plentiful_tv_beer',\n",
470
+ " 'southern_california_couple', 'succumb_bolt_cutter', 'sure_sign',\n",
471
+ " 'upgraded_lock'], dtype=object)"
472
+ ]
473
+ },
474
+ "execution_count": 10,
475
+ "metadata": {},
476
+ "output_type": "execute_result"
477
+ }
478
+ ],
479
+ "source": [
480
+ "headline[5]"
481
+ ]
482
+ },
483
+ {
484
+ "cell_type": "code",
485
+ "execution_count": 11,
486
+ "id": "677055b4-978e-4253-90f4-3f903662e225",
487
+ "metadata": {
488
+ "tags": []
489
+ },
490
+ "outputs": [],
491
+ "source": [
492
+ "# vectorise the words\n",
493
+ "doc_dict = gensim.corpora.Dictionary(headline)\n",
494
+ "docs_vecs = [doc_dict.doc2bow(doc) for doc in headline]"
495
+ ]
496
+ },
497
+ {
498
+ "cell_type": "code",
499
+ "execution_count": 12,
500
+ "id": "a54d1768-b069-4936-a156-deaf0b506d93",
501
+ "metadata": {
502
+ "tags": []
503
+ },
504
+ "outputs": [
505
+ {
506
+ "name": "stdout",
507
+ "output_type": "stream",
508
+ "text": [
509
+ "Number of unique tokens: 30464\n",
510
+ "Number of articles: 300\n"
511
+ ]
512
+ }
513
+ ],
514
+ "source": [
515
+ "print('Number of unique tokens: %d' % len(doc_dict)) \n",
516
+ "print('Number of articles: %d' % len(docs_vecs)) "
517
+ ]
518
+ },
519
+ {
520
+ "cell_type": "code",
521
+ "execution_count": 13,
522
+ "id": "9147fa86-1503-4252-bd9b-92fea1e6a926",
523
+ "metadata": {
524
+ "scrolled": true,
525
+ "tags": []
526
+ },
527
+ "outputs": [
528
+ {
529
+ "name": "stdout",
530
+ "output_type": "stream",
531
+ "text": [
532
+ "[('heavy_rain', 15),\n",
533
+ " ('global_supply_chain', 15),\n",
534
+ " ('national_hurricane_center', 13),\n",
535
+ " ('heavy_rainfall', 12),\n",
536
+ " ('port_los', 12),\n",
537
+ " ('hong_kong', 12),\n",
538
+ " ('united_state', 11),\n",
539
+ " ('critical_destination_port', 11),\n",
540
+ " ('global_port_tracker', 11),\n",
541
+ " ('global_shipping_disruption', 11),\n",
542
+ " ('sign_confidence_consumer', 11),\n",
543
+ " ('upgrade_import_forecast', 11),\n",
544
+ " ('national_weather_service', 10),\n",
545
+ " ('social_medium', 10),\n",
546
+ " ('moment_exception_request', 10),\n",
547
+ " ('sorry_site', 10),\n",
548
+ " ('technical_difficulty_please', 10),\n",
549
+ " ('trade_statement', 9),\n",
550
+ " ('tropical_storm', 9),\n",
551
+ " ('help_business', 9),\n",
552
+ " ('meet_firm', 9),\n",
553
+ " ('website_see_service', 9),\n",
554
+ " ('supply_chain', 8),\n",
555
+ " ('strong_wind', 8),\n",
556
+ " ('coastal_area', 7),\n",
557
+ " ('geological_survey', 7),\n",
558
+ " ('asian_country', 7),\n",
559
+ " ('day_trade_asia', 7),\n",
560
+ " ('global_demand', 7),\n",
561
+ " ('global_economy', 7),\n",
562
+ " ('high_yard_density', 7),\n",
563
+ " ('inch_rain', 7),\n",
564
+ " ('coast_port', 7),\n",
565
+ " ('empty_container', 7),\n",
566
+ " ('union_worker', 7),\n",
567
+ " ('many_area', 6),\n",
568
+ " ('customer_demand', 6),\n",
569
+ " ('economic_growth', 6),\n",
570
+ " ('free_day', 6),\n",
571
+ " ('full_network', 6),\n",
572
+ " ('import_volume', 6),\n",
573
+ " ('major_economy', 6),\n",
574
+ " ('negative_impact', 6),\n",
575
+ " ('vertical_insight', 6),\n",
576
+ " ('supply_chain_issue', 6),\n",
577
+ " ('economic_recovery', 6),\n",
578
+ " ('death_toll', 6),\n",
579
+ " ('new_york_city', 6),\n",
580
+ " ('america_trade_statement', 6),\n",
581
+ " ('average_day', 6),\n",
582
+ " ('late_issue', 5),\n",
583
+ " ('paul_brashier_vice', 5),\n",
584
+ " ('president_drayage', 5),\n",
585
+ " ('strike_action', 5),\n",
586
+ " ('large_number', 5),\n",
587
+ " ('current_situation', 5),\n",
588
+ " ('high_inflation', 5),\n",
589
+ " ('severe_weather_event', 5),\n",
590
+ " ('accurate_quote_market', 5),\n",
591
+ " ('adapt_supply_chain', 5),\n",
592
+ " ('america_space', 5),\n",
593
+ " ('apapa_tin_tema', 5),\n",
594
+ " ('asia_day', 5),\n",
595
+ " ('china_area_chb', 5),\n",
596
+ " ('china_area_warehouse', 5),\n",
597
+ " ('company_face', 5),\n",
598
+ " ('critical_resource', 5),\n",
599
+ " ('date_change', 5),\n",
600
+ " ('energy_price', 5),\n",
601
+ " ('energy_price_fall', 5),\n",
602
+ " ('export_volume', 5),\n",
603
+ " ('fourth_quarter', 5),\n",
604
+ " ('full_truck', 5),\n",
605
+ " ('future_consumer_demand', 5),\n",
606
+ " ('general_administration_custom', 5),\n",
607
+ " ('high_flexibility_transit', 5),\n",
608
+ " ('high_interest_rate', 5),\n",
609
+ " ('high_inventory', 5),\n",
610
+ " ('increase_capacity', 5),\n",
611
+ " ('india_freight_cost', 5),\n",
612
+ " ('indirect_service', 5),\n",
613
+ " ('january_trade_maersk', 5),\n",
614
+ " ('load_truck_box', 5),\n",
615
+ " ('low_confidence', 5),\n",
616
+ " ('low_figure', 5),\n",
617
+ " ('main_product_continue', 5),\n",
618
+ " ('main_route_area', 5),\n",
619
+ " ('many_company', 5),\n",
620
+ " ('market_average', 5),\n",
621
+ " ('matadi_cape_town', 5),\n",
622
+ " ('mile_service', 5),\n",
623
+ " ('monetary_fund', 5),\n",
624
+ " ('new_air', 5),\n",
625
+ " ('new_law_limit', 5),\n",
626
+ " ('new_sea_rail', 5),\n",
627
+ " ('ocean_market', 5),\n",
628
+ " ('ocean_network', 5),\n",
629
+ " ('ok_day', 5),\n",
630
+ " ('operational_disruption', 5),\n",
631
+ " ('relevant_rate', 5)]\n"
632
+ ]
633
+ }
634
+ ],
635
+ "source": [
636
+ "# Calculate word frequencies\n",
637
+ "word_frequencies = {doc_dict[word_id]: freq for word_id, freq in doc_dict.cfs.items()}\n",
638
+ "sorted_words = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)\n",
639
+ "\n",
640
+ "pprint(sorted_words[:100])"
641
+ ]
642
+ },
643
+ {
644
+ "cell_type": "markdown",
645
+ "id": "5ed78239-2ce1-4784-a8f4-4c7438c8627b",
646
+ "metadata": {},
647
+ "source": [
648
+ "# LDA Modelling"
649
+ ]
650
+ },
651
+ {
652
+ "cell_type": "markdown",
653
+ "id": "aacc2eb2-bce9-462f-b048-cc26baa2383d",
654
+ "metadata": {},
655
+ "source": [
656
+ "We initially selected a fixed number of topics for model pipeline development and benchmark model setup. Then we used the full dataset for fine-tuning and evaluation."
657
+ ]
658
+ },
659
+ {
660
+ "cell_type": "markdown",
661
+ "id": "9db83273-461d-4f70-b23f-ec967579d94f",
662
+ "metadata": {},
663
+ "source": [
664
+ "## Benchmark Model"
665
+ ]
666
+ },
667
+ {
668
+ "cell_type": "code",
669
+ "execution_count": 14,
670
+ "id": "e6d577bd-9936-4d45-be90-345af2eb4827",
671
+ "metadata": {
672
+ "tags": []
673
+ },
674
+ "outputs": [],
675
+ "source": [
676
+ "# Build LDA benchmark model\n",
677
+ "lda_model = gensim.models.LdaMulticore(corpus=docs_vecs,\n",
678
+ " id2word=doc_dict,\n",
679
+ " num_topics=4, \n",
680
+ " random_state=42,\n",
681
+ " chunksize=100,\n",
682
+ " passes=10,\n",
683
+ " per_word_topics=True)"
684
+ ]
685
+ },
686
+ {
687
+ "cell_type": "code",
688
+ "execution_count": 15,
689
+ "id": "c4f1521f-5f43-40d2-a3a3-a8ac2ca6fec2",
690
+ "metadata": {
691
+ "tags": []
692
+ },
693
+ "outputs": [
694
+ {
695
+ "name": "stdout",
696
+ "output_type": "stream",
697
+ "text": [
698
+ "[(0,\n",
699
+ " '0.001*\"technical_difficulty_please\" + 0.001*\"moment_exception_request\" + '\n",
700
+ " '0.001*\"sorry_site\" + 0.001*\"heavy_rain\" + 0.000*\"heavy_rainfall\" + '\n",
701
+ " '0.000*\"port_los\" + 0.000*\"national_weather_service\" + 0.000*\"coast_port\" + '\n",
702
+ " '0.000*\"united_state\" + 0.000*\"empty_container\"'),\n",
703
+ " (1,\n",
704
+ " '0.001*\"upgrade_import_forecast\" + 0.001*\"sign_confidence_consumer\" + '\n",
705
+ " '0.001*\"global_shipping_disruption\" + 0.001*\"global_port_tracker\" + '\n",
706
+ " '0.001*\"meet_firm\" + 0.001*\"website_see_service\" + 0.001*\"help_business\" + '\n",
707
+ " '0.000*\"national_hurricane_center\" + 0.000*\"passenger_service\" + '\n",
708
+ " '0.000*\"hong_kong\"'),\n",
709
+ " (2,\n",
710
+ " '0.001*\"global_supply_chain\" + 0.000*\"negative_impact\" + '\n",
711
+ " '0.000*\"critical_destination_port\" + 0.000*\"trade_statement\" + '\n",
712
+ " '0.000*\"warm_winter_china\" + 0.000*\"import_volume\" + 0.000*\"global_demand\" + '\n",
713
+ " '0.000*\"day_trade_asia\" + 0.000*\"slow_react\" + 0.000*\"full_network\"'),\n",
714
+ " (3,\n",
715
+ " '0.000*\"critical_destination_port\" + 0.000*\"social_medium\" + '\n",
716
+ " '0.000*\"meteorological_agency\" + 0.000*\"hong_kong\" + 0.000*\"many_area\" + '\n",
717
+ " '0.000*\"trade_statement\" + 0.000*\"prime_minister\" + '\n",
718
+ " '0.000*\"america_trade_statement\" + 0.000*\"oakland_day\" + '\n",
719
+ " '0.000*\"ready_rescue_operation\"')]\n"
720
+ ]
721
+ }
722
+ ],
723
+ "source": [
724
+ "from pprint import pprint\n",
725
+ "\n",
726
+ "# Print the top keywords of each topic\n",
727
+ "pprint(lda_model.print_topics())\n",
728
+ "doc_lda = lda_model[docs_vecs]"
729
+ ]
730
+ },
731
+ {
732
+ "cell_type": "code",
733
+ "execution_count": 17,
734
+ "id": "fd57b1f4-a6cd-41e8-964f-d8a1d30aa3c9",
735
+ "metadata": {
736
+ "tags": []
737
+ },
738
+ "outputs": [
739
+ {
740
+ "name": "stdout",
741
+ "output_type": "stream",
742
+ "text": [
743
+ "\n",
744
+ "Coherence Score LDAModel: 0.37181231277776183\n",
745
+ "CPU times: user 216 ms, sys: 147 ms, total: 364 ms\n",
746
+ "Wall time: 8min 58s\n"
747
+ ]
748
+ }
749
+ ],
750
+ "source": [
751
+ "%%time\n",
752
+ "\n",
753
+ "# Compute Benchmark Coherence Score\n",
754
+ "coherence_model_lda = CoherenceModel(model=lda_model, texts=headline, dictionary=doc_dict, coherence='c_v')\n",
755
+ "coherence_lda = coherence_model_lda.get_coherence()\n",
756
+ "print('\\nCoherence Score LDAModel: ', coherence_lda)"
757
+ ]
758
+ },
759
+ {
760
+ "cell_type": "code",
761
+ "execution_count": 18,
762
+ "id": "152e5a3a-7afe-4fb8-a02f-d7492ad80936",
763
+ "metadata": {
764
+ "tags": []
765
+ },
766
+ "outputs": [
767
+ {
768
+ "name": "stdout",
769
+ "output_type": "stream",
770
+ "text": [
771
+ "\n",
772
+ "Perplexity for LDAModel: -10.57378514568444\n"
773
+ ]
774
+ }
775
+ ],
776
+ "source": [
777
+ "# Compute Benchmark Perplexity\n",
778
+ "perplex= lda_model.log_perplexity(docs_vecs, total_docs=None) #For LDAModel\n",
779
+ " # log_perplexity returns the per-word likelihood bound (perplexity = 2^(-bound)); a higher bound (closer to 0) is better.\n",
780
+ "\n",
781
+ "print('\\nPerplexity for LDAModel: ', perplex)"
782
+ ]
783
+ },
784
+ {
785
+ "cell_type": "code",
786
+ "execution_count": 19,
787
+ "id": "7dd3a60a-5c6f-4249-9868-30528a5b0ac8",
788
+ "metadata": {},
789
+ "outputs": [
790
+ {
791
+ "name": "stderr",
792
+ "output_type": "stream",
793
+ "text": [
794
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
795
+ " pid = os.fork()\n",
796
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
797
+ " pid = os.fork()\n",
798
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
799
+ " pid = os.fork()\n",
800
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
801
+ " pid = os.fork()\n",
802
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
803
+ " pid = os.fork()\n",
804
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
805
+ " pid = os.fork()\n",
806
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
807
+ " pid = os.fork()\n",
808
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
809
+ " pid = os.fork()\n",
810
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
811
+ " pid = os.fork()\n",
812
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
813
+ " pid = os.fork()\n",
814
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
815
+ " pid = os.fork()\n",
816
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
817
+ " pid = os.fork()\n",
818
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
819
+ " pid = os.fork()\n",
820
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
821
+ " pid = os.fork()\n",
822
+ "/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/site-packages/joblib/externals/loky/backend/fork_exec.py:38: DeprecationWarning: This process (pid=56923) is multi-threaded, use of fork() may lead to deadlocks in the child.\n",
823
+ " pid = os.fork()\n"
824
+ ]
825
+ }
826
+ ],
827
+ "source": [
828
+ "from pprint import pprint\n",
829
+ "import pyLDAvis\n",
830
+ "import pyLDAvis.gensim_models as gensimvis\n",
831
+ "\n",
832
+ "# feed the LDA model into the pyLDAvis instance\n",
833
+ "pyLDAvis.enable_notebook()\n",
834
+ "visual= gensimvis.prepare(lda_model, docs_vecs, doc_dict)\n",
835
+ "\n",
836
+ "#Save the output to the html file\n",
837
+ "pyLDAvis.save_html(visual, \"data/topic_viz_benchmark_severe.html\")"
838
+ ]
839
+ },
840
+ {
841
+ "cell_type": "code",
842
+ "execution_count": 20,
843
+ "id": "3a5612f7-6358-49c8-aba9-8aa54e275c6f",
844
+ "metadata": {},
845
+ "outputs": [
846
+ {
847
+ "data": {
848
+ "text/html": [
849
+ "<div>\n",
850
+ "<style scoped>\n",
851
+ " .dataframe tbody tr th:only-of-type {\n",
852
+ " vertical-align: middle;\n",
853
+ " }\n",
854
+ "\n",
855
+ " .dataframe tbody tr th {\n",
856
+ " vertical-align: top;\n",
857
+ " }\n",
858
+ "\n",
859
+ " .dataframe thead th {\n",
860
+ " text-align: right;\n",
861
+ " }\n",
862
+ "</style>\n",
863
+ "<table border=\"1\" class=\"dataframe\">\n",
864
+ " <thead>\n",
865
+ " <tr style=\"text-align: right;\">\n",
866
+ " <th></th>\n",
867
+ " <th>Topic Keywords</th>\n",
868
+ " <th>Topic ID</th>\n",
869
+ " </tr>\n",
870
+ " </thead>\n",
871
+ " <tbody>\n",
872
+ " <tr>\n",
873
+ " <th>0</th>\n",
874
+ " <td>0.001*\"technical_difficulty_please\" + 0.001*\"moment_exception_request\" + 0.001*\"sorry_site\" + 0.001*\"heavy_rain\" + 0.000*\"heavy_rainfall\" + 0.000*\"port_los\"</td>\n",
875
+ " <td>0</td>\n",
876
+ " </tr>\n",
877
+ " <tr>\n",
878
+ " <th>1</th>\n",
879
+ " <td>0.001*\"upgrade_import_forecast\" + 0.001*\"sign_confidence_consumer\" + 0.001*\"global_shipping_disruption\" + 0.001*\"global_port_tracker\" + 0.001*\"meet_firm\" + 0.001*\"website_see_service\"</td>\n",
880
+ " <td>1</td>\n",
881
+ " </tr>\n",
882
+ " <tr>\n",
883
+ " <th>2</th>\n",
884
+ " <td>0.001*\"global_supply_chain\" + 0.000*\"negative_impact\" + 0.000*\"critical_destination_port\" + 0.000*\"trade_statement\" + 0.000*\"warm_winter_china\" + 0.000*\"import_volume\"</td>\n",
885
+ " <td>2</td>\n",
886
+ " </tr>\n",
887
+ " <tr>\n",
888
+ " <th>3</th>\n",
889
+ " <td>0.000*\"critical_destination_port\" + 0.000*\"social_medium\" + 0.000*\"meteorological_agency\" + 0.000*\"hong_kong\" + 0.000*\"many_area\" + 0.000*\"trade_statement\"</td>\n",
890
+ " <td>3</td>\n",
891
+ " </tr>\n",
892
+ " </tbody>\n",
893
+ "</table>\n",
894
+ "</div>"
895
+ ],
896
+ "text/plain": [
897
+ " Topic Keywords \\\n",
898
+ "0 0.001*\"technical_difficulty_please\" + 0.001*\"moment_exception_request\" + 0.001*\"sorry_site\" + 0.001*\"heavy_rain\" + 0.000*\"heavy_rainfall\" + 0.000*\"port_los\" \n",
899
+ "1 0.001*\"upgrade_import_forecast\" + 0.001*\"sign_confidence_consumer\" + 0.001*\"global_shipping_disruption\" + 0.001*\"global_port_tracker\" + 0.001*\"meet_firm\" + 0.001*\"website_see_service\" \n",
900
+ "2 0.001*\"global_supply_chain\" + 0.000*\"negative_impact\" + 0.000*\"critical_destination_port\" + 0.000*\"trade_statement\" + 0.000*\"warm_winter_china\" + 0.000*\"import_volume\" \n",
901
+ "3 0.000*\"critical_destination_port\" + 0.000*\"social_medium\" + 0.000*\"meteorological_agency\" + 0.000*\"hong_kong\" + 0.000*\"many_area\" + 0.000*\"trade_statement\" \n",
902
+ "\n",
903
+ " Topic ID \n",
904
+ "0 0 \n",
905
+ "1 1 \n",
906
+ "2 2 \n",
907
+ "3 3 "
908
+ ]
909
+ },
910
+ "execution_count": 20,
911
+ "metadata": {},
912
+ "output_type": "execute_result"
913
+ }
914
+ ],
915
+ "source": [
916
+ "pd.set_option('max_colwidth', 200)\n",
917
+ "# Get the topics and their top keywords into a dataframe\n",
918
+ "topics = lda_model.show_topics(num_words=6) \n",
919
+ "\n",
920
+ "topic_keywords = pd.DataFrame()\n",
921
+ "for topic_id, topic in topics:\n",
922
+ " topic_keywords.at[topic_id, 'Topic Keywords'] = topic\n",
923
+ "\n",
924
+ "topic_keywords['Topic ID'] = topic_keywords.index\n",
925
+ "topic_keywords"
926
+ ]
927
+ },
928
+ {
929
+ "cell_type": "code",
930
+ "execution_count": 21,
931
+ "id": "26da4eea-06a0-4ff7-ae14-2f40fa0db04b",
932
+ "metadata": {},
933
+ "outputs": [],
934
+ "source": [
935
+ "# break "
936
+ ]
937
+ },
938
+ {
939
+ "cell_type": "markdown",
940
+ "id": "1895598f-3e5f-4acd-83a6-4491cc90f695",
941
+ "metadata": {},
942
+ "source": [
943
+ "# Hyper-Parameter Tuning and Evaluation"
944
+ ]
945
+ },
946
+ {
947
+ "cell_type": "markdown",
948
+ "id": "47136c89-ff7b-4ac9-840f-04122fe62160",
949
+ "metadata": {},
950
+ "source": [
951
+ "Run the cells below only when re-modelling with new datasets; the whole tuning and evaluation process may take hours to run."
952
+ ]
953
+ },
954
+ {
955
+ "cell_type": "code",
956
+ "execution_count": 22,
957
+ "id": "c79ca5c4-e078-43ce-a430-8c1ed93dcd64",
958
+ "metadata": {},
959
+ "outputs": [],
960
+ "source": [
961
+ "# hyper-parameter tuning (alpha and beta)\n",
962
+ "def compute_coherence_values(corpus, dictionary, k, a, b):\n",
963
+ " \n",
964
+ " lda_model = gensim.models.LdaMulticore(corpus=corpus,\n",
965
+ " id2word=dictionary,\n",
966
+ " num_topics=k, \n",
967
+ " random_state=42,\n",
968
+ " chunksize=100,\n",
969
+ " passes=10,\n",
970
+ " alpha=a,\n",
971
+ " eta=b)\n",
972
+ " \n",
973
+ " coherence_model_lda = CoherenceModel(model=lda_model, texts=headline, dictionary=doc_dict, coherence='c_v')\n",
974
+ " coherence = coherence_model_lda.get_coherence()\n",
975
+ " perplex = lda_model.log_perplexity(docs_vecs, total_docs=None) \n",
976
+ " \n",
977
+ " return coherence, perplex"
978
+ ]
979
+ },
980
+ {
981
+ "cell_type": "code",
982
+ "execution_count": 23,
983
+ "id": "1c3c8478-9336-40f2-bb30-a37db4243b67",
984
+ "metadata": {},
985
+ "outputs": [],
986
+ "source": [
987
+ "# setup\n",
988
+ "import numpy as np\n",
989
+ "\n",
990
+ "from gensim.models import CoherenceModel\n",
991
+ "\n",
992
+ "model_list = []\n",
993
+ "coherence_values = []\n",
994
+ "perplexity_values = []\n",
995
+ "model_topics = []\n",
996
+ "alpha_result = []\n",
997
+ "beta_result = []\n",
998
+ "\n",
999
+ "# topic ranges\n",
1000
+ "num_topics = range(4, 13)\n",
1001
+ "\n",
1002
+ "# Alpha parameter\n",
1003
+ "alpha = list(np.arange(0.31, 1, 0.3))\n",
1004
+ "alpha.append('symmetric')\n",
1005
+ "alpha.append('asymmetric')\n",
1006
+ "\n",
1007
+ "# Beta parameter\n",
1008
+ "beta = list(np.arange(0.31, 1, 0.3))\n",
1009
+ "beta.append('symmetric')"
1010
+ ]
1011
+ },
1012
+ {
1013
+ "cell_type": "markdown",
1014
+ "id": "c7e6bc53-0b57-4858-879a-644eca54ddbc",
1015
+ "metadata": {},
1016
+ "source": [
1017
+ "Rationale behind alpha and eta: https://stats.stackexchange.com/questions/37405/natural-interpretation-for-lda-hyperparameters"
1018
+ ]
1019
+ },
1020
+ {
1021
+ "cell_type": "code",
1022
+ "execution_count": 24,
1023
+ "id": "02877b81-32df-4168-8e62-4cbca2be100b",
1024
+ "metadata": {
1025
+ "tags": []
1026
+ },
1027
+ "outputs": [
1028
+ {
1029
+ "name": "stdout",
1030
+ "output_type": "stream",
1031
+ "text": [
1032
+ "Topic range: range(4, 13)\n",
1033
+ "Alpha: [0.31, 0.61, 0.9099999999999999, 'symmetric', 'asymmetric']\n",
1034
+ "Beta: [0.31, 0.61, 0.9099999999999999, 'symmetric']\n"
1035
+ ]
1036
+ }
1037
+ ],
1038
+ "source": [
1039
+ "print(\"Topic range: \",num_topics)\n",
1040
+ "print(\"Alpha: \",alpha)\n",
1041
+ "print(\"Beta: \", beta)"
1042
+ ]
1043
+ },
1044
+ {
1045
+ "cell_type": "code",
1046
+ "execution_count": 26,
1047
+ "id": "3c1f703c-4778-467f-a12e-0c18eeb274c5",
1048
+ "metadata": {},
1049
+ "outputs": [
1050
+ {
1051
+ "name": "stdout",
1052
+ "output_type": "stream",
1053
+ "text": [
1054
+ "2024-06-30 15:56:56.953954\n"
1055
+ ]
1056
+ },
1057
+ {
1058
+ "name": "stdout",
1059
+ "output_type": "stream",
1060
+ "text": [
1061
+ "#Topics: 4, CV Score: 0.3720156705867761, PV Score: -10.531015192970104, Alpha: 0.31, Beta: 0.31\n",
1062
+ "#Topics: 5, CV Score: 0.5104309491692648, PV Score: -10.501295581191243, Alpha: 0.31, Beta: 0.31\n",
1063
+ "#Topics: 6, CV Score: 0.4577416605401658, PV Score: -10.460472706228693, Alpha: 0.31, Beta: 0.31\n",
1064
+ "#Topics: 7, CV Score: 0.47905186758060786, PV Score: -10.45017220690116, Alpha: 0.31, Beta: 0.31\n",
1065
+ "#Topics: 8, CV Score: 0.5170931021465908, PV Score: -10.435198097218018, Alpha: 0.31, Beta: 0.31\n",
1066
+ "#Topics: 9, CV Score: 0.5418406648591022, PV Score: -10.41813577221336, Alpha: 0.31, Beta: 0.31\n",
1067
+ "#Topics: 10, CV Score: 0.6289688892634311, PV Score: -10.40650838076674, Alpha: 0.31, Beta: 0.31\n",
1068
+ "#Topics: 11, CV Score: 0.615861099169618, PV Score: -10.400571085548444, Alpha: 0.31, Beta: 0.31\n"
1069
+ ]
1070
+ },
1071
+ {
1072
+ "name": "stderr",
1073
+ "output_type": "stream",
1074
+ "text": [
1075
+ "Traceback (most recent call last):\n",
1076
+ " File \"<string>\", line 1, in <module>\n",
1077
+ " File \"/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/multiprocessing/spawn.py\", line 122, in spawn_main\n",
1078
+ " exitcode = _main(fd, parent_sentinel)\n",
1079
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1080
+ " File \"/Users/inflaton/anaconda3/envs/maritime/lib/python3.12/multiprocessing/spawn.py\", line 132, in _main\n",
1081
+ " self = reduction.pickle.load(from_parent)\n",
1082
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1083
+ "_pickle.UnpicklingError: pickle data was truncated\n"
1084
+ ]
1085
+ },
1086
+ {
1087
+ "name": "stdout",
1088
+ "output_type": "stream",
1089
+ "text": [
1090
+ "#Topics: 12, CV Score: 0.5821904373179804, PV Score: -10.405657307982493, Alpha: 0.31, Beta: 0.31\n",
1091
+ "#Topics: 4, CV Score: 0.39197924000152073, PV Score: -10.431108130993747, Alpha: 0.31, Beta: 0.61\n",
1092
+ "#Topics: 5, CV Score: 0.5158552240339984, PV Score: -10.415847323918724, Alpha: 0.31, Beta: 0.61\n",
1093
+ "#Topics: 6, CV Score: 0.4398095546006567, PV Score: -10.395709424729047, Alpha: 0.31, Beta: 0.61\n",
1094
+ "#Topics: 7, CV Score: 0.4759542844436549, PV Score: -10.390971943426882, Alpha: 0.31, Beta: 0.61\n",
1095
+ "#Topics: 8, CV Score: 0.5228046057671669, PV Score: -10.390099173623508, Alpha: 0.31, Beta: 0.61\n",
1096
+ "#Topics: 9, CV Score: 0.534380007483663, PV Score: -10.383173065174224, Alpha: 0.31, Beta: 0.61\n",
1097
+ "#Topics: 10, CV Score: 0.601346262577239, PV Score: -10.38283548973593, Alpha: 0.31, Beta: 0.61\n",
1098
+ "#Topics: 11, CV Score: 0.6182871521157967, PV Score: -10.381370037404881, Alpha: 0.31, Beta: 0.61\n",
1099
+ "#Topics: 12, CV Score: 0.6301666636692548, PV Score: -10.388156880830003, Alpha: 0.31, Beta: 0.61\n",
1100
+ "#Topics: 4, CV Score: 0.3906162608542371, PV Score: -10.401723128237816, Alpha: 0.31, Beta: 0.9099999999999999\n",
1101
+ "#Topics: 5, CV Score: 0.5939884088380677, PV Score: -10.387492626618993, Alpha: 0.31, Beta: 0.9099999999999999\n",
1102
+ "#Topics: 6, CV Score: 0.4508756651213514, PV Score: -10.37800267053485, Alpha: 0.31, Beta: 0.9099999999999999\n",
1103
+ "#Topics: 7, CV Score: 0.5023247846073803, PV Score: -10.378610286427143, Alpha: 0.31, Beta: 0.9099999999999999\n",
1104
+ "#Topics: 8, CV Score: 0.5338912827400732, PV Score: -10.382219717893106, Alpha: 0.31, Beta: 0.9099999999999999\n",
1105
+ "#Topics: 9, CV Score: 0.5350215871400255, PV Score: -10.378352577214379, Alpha: 0.31, Beta: 0.9099999999999999\n",
1106
+ "#Topics: 10, CV Score: 0.5843943719668163, PV Score: -10.379151839595481, Alpha: 0.31, Beta: 0.9099999999999999\n",
1107
+ "#Topics: 11, CV Score: 0.6022597255391141, PV Score: -10.381816469366692, Alpha: 0.31, Beta: 0.9099999999999999\n",
1108
+ "#Topics: 12, CV Score: 0.6549152327008515, PV Score: -10.3871327637851, Alpha: 0.31, Beta: 0.9099999999999999\n",
1109
+ "#Topics: 4, CV Score: 0.38133564373441475, PV Score: -10.57910573063879, Alpha: 0.31, Beta: symmetric\n",
1110
+ "#Topics: 5, CV Score: 0.504405181495512, PV Score: -10.591716275575331, Alpha: 0.31, Beta: symmetric\n",
1111
+ "#Topics: 6, CV Score: 0.4434166489578631, PV Score: -10.577016836067129, Alpha: 0.31, Beta: symmetric\n",
1112
+ "#Topics: 7, CV Score: 0.4650127509674876, PV Score: -10.59307567259223, Alpha: 0.31, Beta: symmetric\n",
1113
+ "#Topics: 8, CV Score: 0.48085842979187243, PV Score: -10.595749214509778, Alpha: 0.31, Beta: symmetric\n",
1114
+ "#Topics: 9, CV Score: 0.5229987979769645, PV Score: -10.572445904678828, Alpha: 0.31, Beta: symmetric\n",
1115
+ "#Topics: 10, CV Score: 0.6011951298180502, PV Score: -10.580254153107184, Alpha: 0.31, Beta: symmetric\n",
1116
+ "#Topics: 11, CV Score: 0.5816514196292673, PV Score: -10.572981018751022, Alpha: 0.31, Beta: symmetric\n",
1117
+ "#Topics: 12, CV Score: 0.54353358800824, PV Score: -10.58723522762571, Alpha: 0.31, Beta: symmetric\n",
1118
+ "#Topics: 4, CV Score: 0.3747888014696905, PV Score: -10.553273801572038, Alpha: 0.61, Beta: 0.31\n",
1119
+ "#Topics: 5, CV Score: 0.41949099152429126, PV Score: -10.529284579653147, Alpha: 0.61, Beta: 0.31\n",
1120
+ "#Topics: 6, CV Score: 0.42792555699640306, PV Score: -10.493600231954503, Alpha: 0.61, Beta: 0.31\n",
1121
+ "#Topics: 7, CV Score: 0.4414682530110495, PV Score: -10.49167181275119, Alpha: 0.61, Beta: 0.31\n",
1122
+ "#Topics: 8, CV Score: 0.40257655373677936, PV Score: -10.480723050906821, Alpha: 0.61, Beta: 0.31\n",
1123
+ "#Topics: 9, CV Score: 0.46588662192698593, PV Score: -10.462647038151925, Alpha: 0.61, Beta: 0.31\n",
1124
+ "#Topics: 10, CV Score: 0.6342207771608466, PV Score: -10.459408243710644, Alpha: 0.61, Beta: 0.31\n",
1125
+ "#Topics: 11, CV Score: 0.5957477827868781, PV Score: -10.457942472137361, Alpha: 0.61, Beta: 0.31\n",
1126
+ "#Topics: 12, CV Score: 0.5069147689054222, PV Score: -10.464993667751155, Alpha: 0.61, Beta: 0.31\n",
1127
+ "#Topics: 4, CV Score: 0.3819785352237406, PV Score: -10.457655853911954, Alpha: 0.61, Beta: 0.61\n",
1128
+ "#Topics: 5, CV Score: 0.521805619062306, PV Score: -10.44289355248452, Alpha: 0.61, Beta: 0.61\n",
1129
+ "#Topics: 6, CV Score: 0.4718959465808612, PV Score: -10.42865218169654, Alpha: 0.61, Beta: 0.61\n",
1130
+ "#Topics: 7, CV Score: 0.4666775368816065, PV Score: -10.435293881017774, Alpha: 0.61, Beta: 0.61\n",
1131
+ "#Topics: 8, CV Score: 0.4161034826078284, PV Score: -10.43039536604531, Alpha: 0.61, Beta: 0.61\n",
1132
+ "#Topics: 9, CV Score: 0.5356741464147949, PV Score: -10.430215877864402, Alpha: 0.61, Beta: 0.61\n",
1133
+ "#Topics: 10, CV Score: 0.6500234328720327, PV Score: -10.433181454609361, Alpha: 0.61, Beta: 0.61\n",
1134
+ "#Topics: 11, CV Score: 0.5724584031569051, PV Score: -10.436418913727316, Alpha: 0.61, Beta: 0.61\n",
1135
+ "#Topics: 12, CV Score: 0.47510144373255264, PV Score: -10.447173694241018, Alpha: 0.61, Beta: 0.61\n",
1136
+ "#Topics: 4, CV Score: 0.38858871959573177, PV Score: -10.422449139190153, Alpha: 0.61, Beta: 0.9099999999999999\n",
1137
+ "#Topics: 5, CV Score: 0.5168433414143676, PV Score: -10.415463384416736, Alpha: 0.61, Beta: 0.9099999999999999\n",
1138
+ "#Topics: 6, CV Score: 0.4623710938296017, PV Score: -10.410332603528257, Alpha: 0.61, Beta: 0.9099999999999999\n",
1139
+ "#Topics: 7, CV Score: 0.47872425272635516, PV Score: -10.414848707310925, Alpha: 0.61, Beta: 0.9099999999999999\n",
1140
+ "#Topics: 8, CV Score: 0.4916000111100134, PV Score: -10.42081190532211, Alpha: 0.61, Beta: 0.9099999999999999\n",
1141
+ "#Topics: 9, CV Score: 0.5266551042307853, PV Score: -10.423109042698925, Alpha: 0.61, Beta: 0.9099999999999999\n",
1142
+ "#Topics: 10, CV Score: 0.6308106489770785, PV Score: -10.429297527391322, Alpha: 0.61, Beta: 0.9099999999999999\n",
1143
+ "#Topics: 11, CV Score: 0.5824504074419617, PV Score: -10.435341814119317, Alpha: 0.61, Beta: 0.9099999999999999\n",
1144
+ "#Topics: 12, CV Score: 0.5655237958130822, PV Score: -10.446175294783924, Alpha: 0.61, Beta: 0.9099999999999999\n",
1145
+ "#Topics: 4, CV Score: 0.38066789853933136, PV Score: -10.600860392033141, Alpha: 0.61, Beta: symmetric\n",
1146
+ "#Topics: 5, CV Score: 0.4230110894509031, PV Score: -10.624957958665691, Alpha: 0.61, Beta: symmetric\n",
1147
+ "#Topics: 6, CV Score: 0.46006178600015324, PV Score: -10.612290512343902, Alpha: 0.61, Beta: symmetric\n",
1148
+ "#Topics: 7, CV Score: 0.4303435103117806, PV Score: -10.629508186597825, Alpha: 0.61, Beta: symmetric\n",
1149
+ "#Topics: 8, CV Score: 0.43866952185453756, PV Score: -10.634503260588486, Alpha: 0.61, Beta: symmetric\n",
1150
+ "#Topics: 9, CV Score: 0.4702721241778767, PV Score: -10.621032221974806, Alpha: 0.61, Beta: symmetric\n",
1151
+ "#Topics: 10, CV Score: 0.6087872469897164, PV Score: -10.634684373455967, Alpha: 0.61, Beta: symmetric\n",
1152
+ "#Topics: 11, CV Score: 0.5297501274565263, PV Score: -10.641694238787153, Alpha: 0.61, Beta: symmetric\n",
1153
+ "#Topics: 12, CV Score: 0.5432630625800524, PV Score: -10.654685933857545, Alpha: 0.61, Beta: symmetric\n",
1154
+ "#Topics: 4, CV Score: 0.3980327518942032, PV Score: -10.571774729929455, Alpha: 0.9099999999999999, Beta: 0.31\n",
1155
+ "#Topics: 5, CV Score: 0.47100281301765873, PV Score: -10.551206821119235, Alpha: 0.9099999999999999, Beta: 0.31\n",
1156
+ "#Topics: 6, CV Score: 0.45585476488442933, PV Score: -10.5221789742425, Alpha: 0.9099999999999999, Beta: 0.31\n",
1157
+ "#Topics: 7, CV Score: 0.42741889953329854, PV Score: -10.520073750838211, Alpha: 0.9099999999999999, Beta: 0.31\n",
1158
+ "#Topics: 8, CV Score: 0.4445519399417415, PV Score: -10.513500785821066, Alpha: 0.9099999999999999, Beta: 0.31\n",
1159
+ "#Topics: 9, CV Score: 0.5316093514063003, PV Score: -10.502280711961715, Alpha: 0.9099999999999999, Beta: 0.31\n",
1160
+ "#Topics: 10, CV Score: 0.6182445479757168, PV Score: -10.506699966303506, Alpha: 0.9099999999999999, Beta: 0.31\n",
1161
+ "#Topics: 11, CV Score: 0.5706965784614643, PV Score: -10.509385494327319, Alpha: 0.9099999999999999, Beta: 0.31\n",
1162
+ "#Topics: 12, CV Score: 0.5244982477634946, PV Score: -10.518817895245245, Alpha: 0.9099999999999999, Beta: 0.31\n",
1163
+ "#Topics: 4, CV Score: 0.34370448027723666, PV Score: -10.473583266573756, Alpha: 0.9099999999999999, Beta: 0.61\n",
1164
+ "#Topics: 5, CV Score: 0.42177545406640793, PV Score: -10.466430482135557, Alpha: 0.9099999999999999, Beta: 0.61\n",
1165
+ "#Topics: 6, CV Score: 0.48870590420062365, PV Score: -10.45681790473505, Alpha: 0.9099999999999999, Beta: 0.61\n",
1166
+ "#Topics: 7, CV Score: 0.4818942174006588, PV Score: -10.462492853055563, Alpha: 0.9099999999999999, Beta: 0.61\n",
1167
+ "#Topics: 8, CV Score: 0.49058139371286213, PV Score: -10.464079562560514, Alpha: 0.9099999999999999, Beta: 0.61\n",
1168
+ "#Topics: 9, CV Score: 0.5284476302536344, PV Score: -10.469358208054853, Alpha: 0.9099999999999999, Beta: 0.61\n",
1169
+ "#Topics: 10, CV Score: 0.5992842730225252, PV Score: -10.474371922798735, Alpha: 0.9099999999999999, Beta: 0.61\n",
1170
+ "#Topics: 11, CV Score: 0.5550594565914921, PV Score: -10.482537823118147, Alpha: 0.9099999999999999, Beta: 0.61\n",
1171
+ "#Topics: 12, CV Score: 0.5130408610330546, PV Score: -10.496326340379174, Alpha: 0.9099999999999999, Beta: 0.61\n",
1172
+ "#Topics: 4, CV Score: 0.3680605610706724, PV Score: -10.437968210441094, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1173
+ "#Topics: 5, CV Score: 0.39834507895056426, PV Score: -10.434366957326267, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1174
+ "#Topics: 6, CV Score: 0.5122840918935663, PV Score: -10.435732920054527, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1175
+ "#Topics: 7, CV Score: 0.5242817034609356, PV Score: -10.444687489964636, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1176
+ "#Topics: 8, CV Score: 0.525222502346446, PV Score: -10.450911057789357, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1177
+ "#Topics: 9, CV Score: 0.5855489762461916, PV Score: -10.459363210197557, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1178
+ "#Topics: 10, CV Score: 0.6033482162866756, PV Score: -10.46709651310468, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1179
+ "#Topics: 11, CV Score: 0.534181138841984, PV Score: -10.478818718345869, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1180
+ "#Topics: 12, CV Score: 0.5044175544841009, PV Score: -10.49287461972145, Alpha: 0.9099999999999999, Beta: 0.9099999999999999\n",
1181
+ "#Topics: 4, CV Score: 0.4030162857954225, PV Score: -10.61948175515634, Alpha: 0.9099999999999999, Beta: symmetric\n",
1182
+ "#Topics: 5, CV Score: 0.47100281301765873, PV Score: -10.643778224222777, Alpha: 0.9099999999999999, Beta: symmetric\n",
1183
+ "#Topics: 6, CV Score: 0.44431460450915367, PV Score: -10.640983049810698, Alpha: 0.9099999999999999, Beta: symmetric\n",
1184
+ "#Topics: 7, CV Score: 0.44194061169816934, PV Score: -10.6627668410305, Alpha: 0.9099999999999999, Beta: symmetric\n",
1185
+ "#Topics: 8, CV Score: 0.43220156375405006, PV Score: -10.66934582156036, Alpha: 0.9099999999999999, Beta: symmetric\n",
1186
+ "#Topics: 9, CV Score: 0.5246794752098919, PV Score: -10.666334218302142, Alpha: 0.9099999999999999, Beta: symmetric\n",
1187
+ "#Topics: 10, CV Score: 0.5839917253272302, PV Score: -10.683233193788631, Alpha: 0.9099999999999999, Beta: symmetric\n",
1188
+ "#Topics: 11, CV Score: 0.5812551603138659, PV Score: -10.68512322593847, Alpha: 0.9099999999999999, Beta: symmetric\n",
1189
+ "#Topics: 12, CV Score: 0.5267098171572085, PV Score: -10.712358225009568, Alpha: 0.9099999999999999, Beta: symmetric\n",
1190
+ "#Topics: 4, CV Score: 0.3720156705867761, PV Score: -10.526282588811258, Alpha: symmetric, Beta: 0.31\n",
1191
+ "#Topics: 5, CV Score: 0.49933029874610535, PV Score: -10.48844257371801, Alpha: symmetric, Beta: 0.31\n",
1192
+ "#Topics: 6, CV Score: 0.47205899846058585, PV Score: -10.439636548970395, Alpha: symmetric, Beta: 0.31\n",
1193
+ "#Topics: 7, CV Score: 0.48269249518744994, PV Score: -10.421634188787365, Alpha: symmetric, Beta: 0.31\n",
1194
+ "#Topics: 8, CV Score: 0.538335613683992, PV Score: -10.399271817666595, Alpha: symmetric, Beta: 0.31\n",
1195
+ "#Topics: 9, CV Score: 0.5761042276388716, PV Score: -10.381594889094856, Alpha: symmetric, Beta: 0.31\n",
1196
+ "#Topics: 10, CV Score: 0.6148075051567828, PV Score: -10.361131747776065, Alpha: symmetric, Beta: 0.31\n",
1197
+ "#Topics: 11, CV Score: 0.5588504038246359, PV Score: -10.345500948521796, Alpha: symmetric, Beta: 0.31\n",
1198
+ "#Topics: 12, CV Score: 0.5448010772010606, PV Score: -10.345617505586791, Alpha: symmetric, Beta: 0.31\n",
1199
+ "#Topics: 4, CV Score: 0.39936348065904637, PV Score: -10.426149669454736, Alpha: symmetric, Beta: 0.61\n",
1200
+ "#Topics: 5, CV Score: 0.5090823271966699, PV Score: -10.404214549545916, Alpha: symmetric, Beta: 0.61\n",
1201
+ "#Topics: 6, CV Score: 0.4278461684376491, PV Score: -10.376087516260217, Alpha: symmetric, Beta: 0.61\n",
1202
+ "#Topics: 7, CV Score: 0.5044713577089992, PV Score: -10.365065223661983, Alpha: symmetric, Beta: 0.61\n",
1203
+ "#Topics: 8, CV Score: 0.5585090181447581, PV Score: -10.357648533539216, Alpha: symmetric, Beta: 0.61\n",
1204
+ "#Topics: 9, CV Score: 0.5308131994078027, PV Score: -10.343761327293674, Alpha: symmetric, Beta: 0.61\n",
1205
+ "#Topics: 10, CV Score: 0.5933548111304747, PV Score: -10.333543849654015, Alpha: symmetric, Beta: 0.61\n",
1206
+ "#Topics: 11, CV Score: 0.5157930553152481, PV Score: -10.326695408594487, Alpha: symmetric, Beta: 0.61\n",
1207
+ "#Topics: 12, CV Score: 0.6500731393929028, PV Score: -10.325261572586404, Alpha: symmetric, Beta: 0.61\n",
1208
+ "#Topics: 4, CV Score: 0.3724208362940652, PV Score: -10.397307737784152, Alpha: symmetric, Beta: 0.9099999999999999\n",
1209
+ "#Topics: 5, CV Score: 0.6047343451735429, PV Score: -10.37632434171058, Alpha: symmetric, Beta: 0.9099999999999999\n",
1210
+ "#Topics: 6, CV Score: 0.40146108571917843, PV Score: -10.358709138199426, Alpha: symmetric, Beta: 0.9099999999999999\n",
1211
+ "#Topics: 7, CV Score: 0.5478418889572643, PV Score: -10.352775652604763, Alpha: symmetric, Beta: 0.9099999999999999\n",
1212
+ "#Topics: 8, CV Score: 0.4965510774725434, PV Score: -10.35076287539615, Alpha: symmetric, Beta: 0.9099999999999999\n",
1213
+ "#Topics: 9, CV Score: 0.5184439894201485, PV Score: -10.338520514214467, Alpha: symmetric, Beta: 0.9099999999999999\n",
1214
+ "#Topics: 10, CV Score: 0.6022496614184635, PV Score: -10.33271452286526, Alpha: symmetric, Beta: 0.9099999999999999\n",
1215
+ "#Topics: 11, CV Score: 0.4966577290320975, PV Score: -10.328283322344763, Alpha: symmetric, Beta: 0.9099999999999999\n",
1216
+ "#Topics: 12, CV Score: 0.6261571238431066, PV Score: -10.32532208498972, Alpha: symmetric, Beta: 0.9099999999999999\n",
1217
+ "#Topics: 4, CV Score: 0.37181231277776183, PV Score: -10.5737851457114, Alpha: symmetric, Beta: symmetric\n",
1218
+ "#Topics: 5, CV Score: 0.49982175351658986, PV Score: -10.57991503850104, Alpha: symmetric, Beta: symmetric\n",
1219
+ "#Topics: 6, CV Score: 0.4623746453802937, PV Score: -10.558772260964222, Alpha: symmetric, Beta: symmetric\n",
1220
+ "#Topics: 7, CV Score: 0.4884610769859329, PV Score: -10.570459578130873, Alpha: symmetric, Beta: symmetric\n",
1221
+ "#Topics: 8, CV Score: 0.5131836632736474, PV Score: -10.559869552593637, Alpha: symmetric, Beta: symmetric\n",
1222
+ "#Topics: 9, CV Score: 0.5414297105648862, PV Score: -10.529852182357658, Alpha: symmetric, Beta: symmetric\n",
1223
+ "#Topics: 10, CV Score: 0.6785044360557178, PV Score: -10.535124275209565, Alpha: symmetric, Beta: symmetric\n",
1224
+ "#Topics: 11, CV Score: 0.5657397622843224, PV Score: -10.52133404122993, Alpha: symmetric, Beta: symmetric\n",
1225
+ "#Topics: 12, CV Score: 0.582327359714131, PV Score: -10.526610298202412, Alpha: symmetric, Beta: symmetric\n",
1226
+ "#Topics: 4, CV Score: 0.43912942092956697, PV Score: -10.522848808310528, Alpha: asymmetric, Beta: 0.31\n",
1227
+ "#Topics: 5, CV Score: 0.40751184962270537, PV Score: -10.48541066901554, Alpha: asymmetric, Beta: 0.31\n",
1228
+ "#Topics: 6, CV Score: 0.4542494682857962, PV Score: -10.442773054340485, Alpha: asymmetric, Beta: 0.31\n",
1229
+ "#Topics: 7, CV Score: 0.47041200319916493, PV Score: -10.420479103000723, Alpha: asymmetric, Beta: 0.31\n",
1230
+ "#Topics: 8, CV Score: 0.49233261299212405, PV Score: -10.403763191353837, Alpha: asymmetric, Beta: 0.31\n",
1231
+ "#Topics: 9, CV Score: 0.45907092550739237, PV Score: -10.3786293878649, Alpha: asymmetric, Beta: 0.31\n",
1232
+ "#Topics: 10, CV Score: 0.6217993088001614, PV Score: -10.36148775185113, Alpha: asymmetric, Beta: 0.31\n",
1233
+ "#Topics: 11, CV Score: 0.5185376069390376, PV Score: -10.349232810275204, Alpha: asymmetric, Beta: 0.31\n",
1234
+ "#Topics: 12, CV Score: 0.5034307005049391, PV Score: -10.34076712188869, Alpha: asymmetric, Beta: 0.31\n",
1235
+ "#Topics: 4, CV Score: 0.515628236666662, PV Score: -10.422196504217826, Alpha: asymmetric, Beta: 0.61\n",
1236
+ "#Topics: 5, CV Score: 0.423162935257951, PV Score: -10.398368311505568, Alpha: asymmetric, Beta: 0.61\n",
1237
+ "#Topics: 6, CV Score: 0.45944499325883426, PV Score: -10.374638816393565, Alpha: asymmetric, Beta: 0.61\n",
1238
+ "#Topics: 7, CV Score: 0.4820831491535998, PV Score: -10.36267991668515, Alpha: asymmetric, Beta: 0.61\n",
1239
+ "#Topics: 8, CV Score: 0.4036989725279696, PV Score: -10.354531424153524, Alpha: asymmetric, Beta: 0.61\n",
1240
+ "#Topics: 9, CV Score: 0.45919112117643734, PV Score: -10.342620369274787, Alpha: asymmetric, Beta: 0.61\n",
1241
+ "#Topics: 10, CV Score: 0.6451794871459123, PV Score: -10.330084574702694, Alpha: asymmetric, Beta: 0.61\n",
1242
+ "#Topics: 11, CV Score: 0.572154524993651, PV Score: -10.324677755836108, Alpha: asymmetric, Beta: 0.61\n",
1243
+ "#Topics: 12, CV Score: 0.6194824233122164, PV Score: -10.323963042788398, Alpha: asymmetric, Beta: 0.61\n",
1244
+ "#Topics: 4, CV Score: 0.519154732232678, PV Score: -10.389407917509194, Alpha: asymmetric, Beta: 0.9099999999999999\n",
1245
+ "#Topics: 5, CV Score: 0.4151559388052012, PV Score: -10.373488513541524, Alpha: asymmetric, Beta: 0.9099999999999999\n"
1246
+ ]
1247
+ },
1248
+ {
1249
+ "ename": "KeyboardInterrupt",
1250
+ "evalue": "",
1251
+ "output_type": "error",
1252
+ "traceback": [
1253
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1254
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
1255
+ "File \u001b[0;32m<timed exec>:10\u001b[0m\n",
1256
+ "Cell \u001b[0;32mIn[22], line 14\u001b[0m, in \u001b[0;36mcompute_coherence_values\u001b[0;34m(corpus, dictionary, k, a, b)\u001b[0m\n\u001b[1;32m 4\u001b[0m lda_model \u001b[38;5;241m=\u001b[39m gensim\u001b[38;5;241m.\u001b[39mmodels\u001b[38;5;241m.\u001b[39mLdaMulticore(corpus\u001b[38;5;241m=\u001b[39mcorpus,\n\u001b[1;32m 5\u001b[0m id2word\u001b[38;5;241m=\u001b[39mdictionary,\n\u001b[1;32m 6\u001b[0m num_topics\u001b[38;5;241m=\u001b[39mk, \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 10\u001b[0m alpha\u001b[38;5;241m=\u001b[39ma,\n\u001b[1;32m 11\u001b[0m eta\u001b[38;5;241m=\u001b[39mb)\n\u001b[1;32m 13\u001b[0m coherence_model_lda \u001b[38;5;241m=\u001b[39m CoherenceModel(model\u001b[38;5;241m=\u001b[39mlda_model, texts\u001b[38;5;241m=\u001b[39mheadline, dictionary\u001b[38;5;241m=\u001b[39mdoc_dict, coherence\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc_v\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 14\u001b[0m coherence \u001b[38;5;241m=\u001b[39m \u001b[43mcoherence_model_lda\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_coherence\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 15\u001b[0m perplex \u001b[38;5;241m=\u001b[39m lda_model\u001b[38;5;241m.\u001b[39mlog_perplexity(docs_vecs, total_docs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m) \n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m coherence, perplex\n",
1257
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/coherencemodel.py:614\u001b[0m, in \u001b[0;36mCoherenceModel.get_coherence\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 605\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_coherence\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 606\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Get coherence value based on pipeline parameters.\u001b[39;00m\n\u001b[1;32m 607\u001b[0m \n\u001b[1;32m 608\u001b[0m \u001b[38;5;124;03m Returns\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 612\u001b[0m \n\u001b[1;32m 613\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 614\u001b[0m confirmed_measures \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_coherence_per_topic\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 615\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maggregate_measures(confirmed_measures)\n",
1258
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/coherencemodel.py:574\u001b[0m, in \u001b[0;36mCoherenceModel.get_coherence_per_topic\u001b[0;34m(self, segmented_topics, with_std, with_support)\u001b[0m\n\u001b[1;32m 572\u001b[0m segmented_topics \u001b[38;5;241m=\u001b[39m measure\u001b[38;5;241m.\u001b[39mseg(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtopics)\n\u001b[1;32m 573\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_accumulator \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 574\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mestimate_probabilities\u001b[49m\u001b[43m(\u001b[49m\u001b[43msegmented_topics\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 576\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(with_std\u001b[38;5;241m=\u001b[39mwith_std, with_support\u001b[38;5;241m=\u001b[39mwith_support)\n\u001b[1;32m 577\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcoherence \u001b[38;5;129;01min\u001b[39;00m BOOLEAN_DOCUMENT_BASED \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcoherence \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc_w2v\u001b[39m\u001b[38;5;124m'\u001b[39m:\n",
1259
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/models/coherencemodel.py:546\u001b[0m, in \u001b[0;36mCoherenceModel.estimate_probabilities\u001b[0;34m(self, segmented_topics)\u001b[0m\n\u001b[1;32m 543\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcoherence \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc_w2v\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 544\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeyed_vectors\n\u001b[0;32m--> 546\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_accumulator \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmeasure\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprob\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 548\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_accumulator\n",
1260
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/topic_coherence/probability_estimation.py:156\u001b[0m, in \u001b[0;36mp_boolean_sliding_window\u001b[0;34m(texts, segmented_topics, dictionary, window_size, processes)\u001b[0m\n\u001b[1;32m 154\u001b[0m accumulator \u001b[38;5;241m=\u001b[39m ParallelWordOccurrenceAccumulator(processes, top_ids, dictionary)\n\u001b[1;32m 155\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124musing \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m to estimate probabilities from sliding windows\u001b[39m\u001b[38;5;124m\"\u001b[39m, accumulator)\n\u001b[0;32m--> 156\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43maccumulator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maccumulate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtexts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwindow_size\u001b[49m\u001b[43m)\u001b[49m\n",
1261
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/topic_coherence/text_analysis.py:437\u001b[0m, in \u001b[0;36mParallelWordOccurrenceAccumulator.accumulate\u001b[0;34m(self, texts, window_size)\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21maccumulate\u001b[39m(\u001b[38;5;28mself\u001b[39m, texts, window_size):\n\u001b[0;32m--> 437\u001b[0m workers, input_q, output_q \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstart_workers\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwindow_size\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 438\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 439\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mqueue_all_texts(input_q, texts, window_size)\n",
1262
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/site-packages/gensim/topic_coherence/text_analysis.py:471\u001b[0m, in \u001b[0;36mParallelWordOccurrenceAccumulator.start_workers\u001b[0;34m(self, window_size)\u001b[0m\n\u001b[1;32m 469\u001b[0m accumulator \u001b[38;5;241m=\u001b[39m PatchedWordOccurrenceAccumulator(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrelevant_ids, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdictionary)\n\u001b[1;32m 470\u001b[0m worker \u001b[38;5;241m=\u001b[39m AccumulatingWorker(input_q, output_q, accumulator, window_size)\n\u001b[0;32m--> 471\u001b[0m \u001b[43mworker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstart\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 472\u001b[0m workers\u001b[38;5;241m.\u001b[39mappend(worker)\n\u001b[1;32m 474\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m workers, input_q, output_q\n",
1263
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/process.py:121\u001b[0m, in \u001b[0;36mBaseProcess.start\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _current_process\u001b[38;5;241m.\u001b[39m_config\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdaemon\u001b[39m\u001b[38;5;124m'\u001b[39m), \\\n\u001b[1;32m 119\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdaemonic processes are not allowed to have children\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 120\u001b[0m _cleanup()\n\u001b[0;32m--> 121\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_popen \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_Popen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 122\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sentinel \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_popen\u001b[38;5;241m.\u001b[39msentinel\n\u001b[1;32m 123\u001b[0m \u001b[38;5;66;03m# Avoid a refcycle if the target function holds an indirect\u001b[39;00m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;66;03m# reference to the process object (see bpo-30775)\u001b[39;00m\n",
1264
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/context.py:224\u001b[0m, in \u001b[0;36mProcess._Popen\u001b[0;34m(process_obj)\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_Popen\u001b[39m(process_obj):\n\u001b[0;32m--> 224\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_default_context\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_context\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mProcess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_Popen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocess_obj\u001b[49m\u001b[43m)\u001b[49m\n",
1265
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/context.py:289\u001b[0m, in \u001b[0;36mSpawnProcess._Popen\u001b[0;34m(process_obj)\u001b[0m\n\u001b[1;32m 286\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_Popen\u001b[39m(process_obj):\n\u001b[1;32m 288\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpopen_spawn_posix\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Popen\n\u001b[0;32m--> 289\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mPopen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocess_obj\u001b[49m\u001b[43m)\u001b[49m\n",
1266
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/popen_spawn_posix.py:32\u001b[0m, in \u001b[0;36mPopen.__init__\u001b[0;34m(self, process_obj)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, process_obj):\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fds \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m---> 32\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mprocess_obj\u001b[49m\u001b[43m)\u001b[49m\n",
1267
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/popen_fork.py:19\u001b[0m, in \u001b[0;36mPopen.__init__\u001b[0;34m(self, process_obj)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfinalizer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m---> 19\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_launch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocess_obj\u001b[49m\u001b[43m)\u001b[49m\n",
1268
+ "File \u001b[0;32m~/anaconda3/envs/maritime/lib/python3.12/multiprocessing/popen_spawn_posix.py:62\u001b[0m, in \u001b[0;36mPopen._launch\u001b[0;34m(self, process_obj)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msentinel \u001b[38;5;241m=\u001b[39m parent_r\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(parent_w, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwb\u001b[39m\u001b[38;5;124m'\u001b[39m, closefd\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m---> 62\u001b[0m \u001b[43mf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetbuffer\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 64\u001b[0m fds_to_close \u001b[38;5;241m=\u001b[39m []\n",
1269
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
1270
+ ]
1271
+ }
1272
+ ],
1273
+ "source": [
1274
+ "%%time\n",
1275
+ "\n",
1276
+ "import datetime\n",
1277
+ "import numpy as np\n",
1278
+ "from gensim.models import CoherenceModel\n",
1279
+ "\n",
1280
+ "print(datetime.datetime.now())\n",
1281
+ "\n",
1282
+ "for a in alpha:\n",
1283
+ " for b in beta:\n",
1284
+ " for num in num_topics:\n",
1285
+ " cv, pv = compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=num, a=a, b=b) \n",
1286
+ "\n",
1287
+ " model_topics.append(num) \n",
1288
+ " coherence_values.append(cv) \n",
1289
+ " perplexity_values.append(pv)\n",
1290
+ " alpha_result.append(a)\n",
1291
+ " beta_result.append(b)\n",
1292
+ " print(\"#Topics: \" + str(num) + \", CV Score: \" + str(coherence_values[-1]) + \", PV Score: \" + str(perplexity_values[-1]) + \", Alpha: \" + str(alpha_result[-1]) + \", Beta: \" + str(beta_result[-1]))\n",
1293
+ " \n",
1294
+ "print(datetime.datetime.now())"
1295
+ ]
1296
+ },
1297
+ {
1298
+ "cell_type": "markdown",
1299
+ "id": "364ff6d5-e3da-4dde-a2c8-5375fc5d711f",
1300
+ "metadata": {},
1301
+ "source": [
1302
+ "The table below shows the top 20 fine-tuned models with the best combinations of coherence score and perplexity score. It is sorted by coherence score in descending order, since a higher coherence score indicates a better model, and then by perplexity score in ascending order, since a lower perplexity score indicates a better model. While the coherence score evaluates the quality of the topics, the perplexity score evaluates the overall performance of the model in predicting new documents. Usually, the coherence score is the better metric if the goal is to obtain topics that are semantically coherent and interpretable. The perplexity score, on the other hand, is the better metric if the goal is to build a model that generalises well to new data, in other words, one that is confident when predicting new data (Sánchez-Aguayo et al., 2022). Ultimately, we aim to strike a balance between the perplexity score and the coherence score when determining our final model."
1303
+ ]
1304
+ },
1305
+ {
1306
+ "cell_type": "code",
1307
+ "execution_count": null,
1308
+ "id": "78a60032-a4d7-44d4-841c-a1bd3740d5dd",
1309
+ "metadata": {},
1310
+ "outputs": [
1311
+ {
1312
+ "ename": "",
1313
+ "evalue": "",
1314
+ "output_type": "error",
1315
+ "traceback": [
1316
+ "\u001b[1;31mFailed to start the Kernel. \n",
1317
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1318
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1319
+ ]
1320
+ }
1321
+ ],
1322
+ "source": [
1323
+ "# Find the top 20 combinations based on Coherence Score and Perplexity Score\n",
1324
+ "result = pd.DataFrame(\n",
1325
+ " {'Topics': model_topics,\n",
1326
+ " 'Coherence Score': coherence_values,\n",
1327
+ " 'Perplexity Score': perplexity_values,\n",
1328
+ " 'Alpha': alpha_result,\n",
1329
+ " 'Beta': beta_result\n",
1330
+ " })\n",
1331
+ "result.sort_values(by=['Coherence Score', 'Perplexity Score'], ascending=[False, True]).head(20)"
1332
+ ]
1333
+ },
1334
+ {
1335
+ "cell_type": "code",
1336
+ "execution_count": null,
1337
+ "id": "3461df57-c069-4ad2-80d7-8890dec9438e",
1338
+ "metadata": {},
1339
+ "outputs": [
1340
+ {
1341
+ "ename": "",
1342
+ "evalue": "",
1343
+ "output_type": "error",
1344
+ "traceback": [
1345
+ "\u001b[1;31mFailed to start the Kernel. \n",
1346
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1347
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1348
+ ]
1349
+ }
1350
+ ],
1351
+ "source": [
1352
+ "result.to_csv('data/lda_fine_tuning_result_severe.csv')"
1353
+ ]
1354
+ },
1355
+ {
1356
+ "cell_type": "code",
1357
+ "execution_count": null,
1358
+ "id": "800e5a4b-7302-42e8-97b0-5b598c1c80ae",
1359
+ "metadata": {
1360
+ "scrolled": true
1361
+ },
1362
+ "outputs": [
1363
+ {
1364
+ "ename": "",
1365
+ "evalue": "",
1366
+ "output_type": "error",
1367
+ "traceback": [
1368
+ "\u001b[1;31mFailed to start the Kernel. \n",
1369
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1370
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1371
+ ]
1372
+ }
1373
+ ],
1374
+ "source": [
1375
+ "# Show graph Topics vs Coherence Score\n",
1376
+ "result.groupby('Alpha').plot(x='Topics', y='Coherence Score', legend = True)"
1377
+ ]
1378
+ },
1379
+ {
1380
+ "cell_type": "code",
1381
+ "execution_count": null,
1382
+ "id": "26996b89-0e7a-4f2d-8cf7-c4a716569bc2",
1383
+ "metadata": {},
1384
+ "outputs": [
1385
+ {
1386
+ "ename": "",
1387
+ "evalue": "",
1388
+ "output_type": "error",
1389
+ "traceback": [
1390
+ "\u001b[1;31mFailed to start the Kernel. \n",
1391
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1392
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1393
+ ]
1394
+ }
1395
+ ],
1396
+ "source": [
1397
+ "# Show graph Topics vs Coherence Score\n",
1398
+ "\n",
1399
+ "plt.plot(model_topics, coherence_values)\n",
1400
+ "plt.xlabel(\"Num Topics\")\n",
1401
+ "plt.ylabel(\"Coherence Score\")\n",
1402
+ "plt.legend((\"Coherence Score\"), loc='best')\n",
1403
+ "plt.show()"
1404
+ ]
1405
+ },
1406
+ {
1407
+ "cell_type": "code",
1408
+ "execution_count": null,
1409
+ "id": "91d2f4c1-de77-44b6-b41b-fcc9a07233e8",
1410
+ "metadata": {},
1411
+ "outputs": [
1412
+ {
1413
+ "ename": "",
1414
+ "evalue": "",
1415
+ "output_type": "error",
1416
+ "traceback": [
1417
+ "\u001b[1;31mFailed to start the Kernel. \n",
1418
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1419
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1420
+ ]
1421
+ }
1422
+ ],
1423
+ "source": [
1424
+ "# Show graph Topics vs Perplexity Score\n",
1425
+ "\n",
1426
+ "plt.plot(model_topics, perplexity_values)\n",
1427
+ "plt.xlabel(\"Num Topics\")\n",
1428
+ "plt.ylabel(\"Perplexity score\")\n",
1429
+ "plt.legend((\"perplexity_values\"), loc='best')\n",
1430
+ "plt.show()"
1431
+ ]
1432
+ },
1433
+ {
1434
+ "cell_type": "markdown",
1435
+ "id": "cdc3ddd2-f743-4e5b-b6c6-2656e0b77aec",
1436
+ "metadata": {},
1437
+ "source": [
1438
+ "## Final Model"
1439
+ ]
1440
+ },
1441
+ {
1442
+ "cell_type": "markdown",
1443
+ "id": "8a4196d2-0f8a-4b0b-a6dd-ead9441af44e",
1444
+ "metadata": {},
1445
+ "source": [
1446
+ "Models with 4, 8, 9 and 10 topics were selected for further evaluation using the visual graphs, because the best-scoring combination does not always yield the best result: a model with a higher number of topics tends to score better on the metrics but may not fit the data best. \n",
1447
+ "\n",
1448
+ "However, note that even though random_state was preset and all other parameters were fixed, some randomness remains, so the model may produce inconsistent output on each run. "
1449
+ ]
1450
+ },
1451
+ {
1452
+ "cell_type": "markdown",
1453
+ "id": "df1c00ad-ba54-4686-ac75-ef1033066dce",
1454
+ "metadata": {},
1455
+ "source": [
1456
+ "Unfortunately, altering the number of topics does not have much effect on the results, and the news articles are not clustered into relevant topics properly. Also, most topics are stacked together, indicating high similarity and ambiguity among them due to the multi-aspect nature of the news content. As a result, LDA may not be a suitable solution for this kind of news content. The same result holds for moderate and minor."
1457
+ ]
1458
+ },
1459
+ {
1460
+ "cell_type": "code",
1461
+ "execution_count": null,
1462
+ "id": "490734ed-077c-4fb0-930c-0b42f4f63c94",
1463
+ "metadata": {},
1464
+ "outputs": [
1465
+ {
1466
+ "ename": "",
1467
+ "evalue": "",
1468
+ "output_type": "error",
1469
+ "traceback": [
1470
+ "\u001b[1;31mFailed to start the Kernel. \n",
1471
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1472
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1473
+ ]
1474
+ }
1475
+ ],
1476
+ "source": [
1477
+ "# realised that there may be some overlaps for 8 topics, thus 4-6 topics are optimal\n",
1478
+ "k = 8\n",
1479
+ "a = 'asymmetric'\n",
1480
+ "# a = 0.91\n",
1481
+ "# b = 0.61\n",
1482
+ "b = 'symmetric'\n",
1483
+ "\n",
1484
+ "\n",
1485
+ "\n",
1486
+ "final_model = gensim.models.LdaMulticore(corpus=docs_vecs,\n",
1487
+ " id2word=doc_dict,\n",
1488
+ " num_topics=k, \n",
1489
+ " random_state=42,\n",
1490
+ " chunksize=100,\n",
1491
+ " passes=10,\n",
1492
+ " alpha=a,\n",
1493
+ " eta=b)"
1494
+ ]
1495
+ },
1496
+ {
1497
+ "cell_type": "code",
1498
+ "execution_count": null,
1499
+ "id": "afe8abf0-2d12-414e-92be-a655865addb1",
1500
+ "metadata": {
1501
+ "tags": []
1502
+ },
1503
+ "outputs": [
1504
+ {
1505
+ "ename": "",
1506
+ "evalue": "",
1507
+ "output_type": "error",
1508
+ "traceback": [
1509
+ "\u001b[1;31mFailed to start the Kernel. \n",
1510
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1511
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1512
+ ]
1513
+ }
1514
+ ],
1515
+ "source": [
1516
+ "compute_coherence_values(corpus=docs_vecs, dictionary=doc_dict,k=k, a=a, b=b) "
1517
+ ]
1518
+ },
1519
+ {
1520
+ "cell_type": "code",
1521
+ "execution_count": null,
1522
+ "id": "8430a827-6dbb-4737-8ccc-78ed17a01234",
1523
+ "metadata": {
1524
+ "tags": []
1525
+ },
1526
+ "outputs": [
1527
+ {
1528
+ "ename": "",
1529
+ "evalue": "",
1530
+ "output_type": "error",
1531
+ "traceback": [
1532
+ "\u001b[1;31mFailed to start the Kernel. \n",
1533
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1534
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1535
+ ]
1536
+ }
1537
+ ],
1538
+ "source": [
1539
+ "#Set up the environment to display the graphical outputs\n",
1540
+ "# feed the LDA model into the pyLDAvis instance\n",
1541
+ "pyLDAvis.enable_notebook()\n",
1542
+ "visual= gensimvis.prepare(final_model, docs_vecs, doc_dict)\n",
1543
+ "\n",
1544
+ "#Save the output to the html file\n",
1545
+ "pyLDAvis.save_html(visual, \"data/topic_viz8_severe_training.html\")"
1546
+ ]
1547
+ },
1548
+ {
1549
+ "cell_type": "code",
1550
+ "execution_count": null,
1551
+ "id": "5e30d71a-a3c7-40c7-94c0-7eea1bedc887",
1552
+ "metadata": {
1553
+ "tags": []
1554
+ },
1555
+ "outputs": [
1556
+ {
1557
+ "ename": "",
1558
+ "evalue": "",
1559
+ "output_type": "error",
1560
+ "traceback": [
1561
+ "\u001b[1;31mFailed to start the Kernel. \n",
1562
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1563
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1564
+ ]
1565
+ }
1566
+ ],
1567
+ "source": [
1568
+ "final_model.print_topics(num_words=30)"
1569
+ ]
1570
+ },
1571
+ {
1572
+ "cell_type": "markdown",
1573
+ "id": "4fc5e753-e0e7-4520-9e7d-10e26e4d580d",
1574
+ "metadata": {},
1575
+ "source": [
1576
+ "Saving the trained model allows easy access to it for future prediction work."
1577
+ ]
1578
+ },
1579
+ {
1580
+ "cell_type": "code",
1581
+ "execution_count": null,
1582
+ "id": "84eb2746-173a-4283-bca5-681f77548698",
1583
+ "metadata": {},
1584
+ "outputs": [
1585
+ {
1586
+ "ename": "",
1587
+ "evalue": "",
1588
+ "output_type": "error",
1589
+ "traceback": [
1590
+ "\u001b[1;31mFailed to start the Kernel. \n",
1591
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1592
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1593
+ ]
1594
+ }
1595
+ ],
1596
+ "source": [
1597
+ "#Save a model to disk, or reload a pre-trained model\n",
1598
+ "# naming convention: final_model_topic_alpha_eta\n",
1599
+ "final_model.save(\"final_model_8_asym_sym\")"
1600
+ ]
1601
+ },
1602
+ {
1603
+ "cell_type": "markdown",
1604
+ "id": "466c3952-69d1-4d19-b49c-d4b1e1844572",
1605
+ "metadata": {},
1606
+ "source": [
1607
+ "Get the dominant topic and its percentage contribution for each document.\n",
1608
+ "Made use of gensim lda's own function: https://radimrehurek.com/gensim/models/ldamodel.html"
1609
+ ]
1610
+ },
1611
+ {
1612
+ "cell_type": "code",
1613
+ "execution_count": null,
1614
+ "id": "cd88034c-2fb8-4f1f-a4e8-85d09b4fc1dc",
1615
+ "metadata": {},
1616
+ "outputs": [
1617
+ {
1618
+ "ename": "",
1619
+ "evalue": "",
1620
+ "output_type": "error",
1621
+ "traceback": [
1622
+ "\u001b[1;31mFailed to start the Kernel. \n",
1623
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1624
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1625
+ ]
1626
+ }
1627
+ ],
1628
+ "source": [
1629
+ "import warnings\n",
1630
+ "warnings.filterwarnings('ignore')\n",
1631
+ "\n",
1632
+ "def format_topics_sentences(ldamodel, corpus, data):\n",
1633
+ " # Preallocate memory for the DataFrame\n",
1634
+ " num_docs = len(corpus)\n",
1635
+ " sent_topics = {'Dominant_Topic': [0] * num_docs, 'Perc_Contribution': [0.0] * num_docs, 'Topic_Distribution': [()] * num_docs}\n",
1636
+ " \n",
1637
+ " # Get main topic in each document\n",
1638
+ " for i, row in enumerate(ldamodel[corpus]):\n",
1639
+ " row = sorted(row, key=lambda x: (x[1]), reverse=True)\n",
1640
+ " if row:\n",
1641
+ " # Get the dominant topic, its percentage contribution and the full topic distribution for each document\n",
1642
+ " dominant_topic, perc_contribution = row[0]\n",
1643
+ " topic_distribution = row\n",
1644
+ " sent_topics['Dominant_Topic'][i] = int(dominant_topic)\n",
1645
+ " sent_topics['Perc_Contribution'][i] = round(perc_contribution, 4)\n",
1646
+ " sent_topics['Topic_Distribution'][i] = topic_distribution\n",
1647
+ "\n",
1648
+ " # Create the DataFrame\n",
1649
+ " sent_topics_df = pd.DataFrame(sent_topics)\n",
1650
+ " sent_topics_df['Text'] = data\n",
1651
+ "\n",
1652
+ " return sent_topics_df"
1653
+ ]
1654
+ },
1655
+ {
1656
+ "cell_type": "code",
1657
+ "execution_count": null,
1658
+ "id": "24d3ff60-035e-4133-9ffd-88cce5cdccb1",
1659
+ "metadata": {},
1660
+ "outputs": [
1661
+ {
1662
+ "ename": "",
1663
+ "evalue": "",
1664
+ "output_type": "error",
1665
+ "traceback": [
1666
+ "\u001b[1;31mFailed to start the Kernel. \n",
1667
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1668
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1669
+ ]
1670
+ }
1671
+ ],
1672
+ "source": [
1673
+ "df_topic_sents_keywords = format_topics_sentences(ldamodel=final_model, corpus=docs_vecs, data=cleaned.Headline_Details)"
1674
+ ]
1675
+ },
1676
+ {
1677
+ "cell_type": "code",
1678
+ "execution_count": null,
1679
+ "id": "c88b088b",
1680
+ "metadata": {},
1681
+ "outputs": [
1682
+ {
1683
+ "ename": "",
1684
+ "evalue": "",
1685
+ "output_type": "error",
1686
+ "traceback": [
1687
+ "\u001b[1;31mFailed to start the Kernel. \n",
1688
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1689
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1690
+ ]
1691
+ }
1692
+ ],
1693
+ "source": [
1694
+ "# Format\n",
1695
+ "df_dominant_topic = df_topic_sents_keywords.reset_index()\n",
1696
+ "df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Topic_Distribution', 'Text']\n",
1697
+ "\n",
1698
+ "# Show\n",
1699
+ "df_dominant_topic.head(10)"
1700
+ ]
1701
+ },
1702
+ {
1703
+ "cell_type": "markdown",
1704
+ "id": "560da382-aa86-4df1-8b85-56b057a27cd4",
1705
+ "metadata": {},
1706
+ "source": [
1707
+ "# Result Analysis"
1708
+ ]
1709
+ },
1710
+ {
1711
+ "cell_type": "code",
1712
+ "execution_count": null,
1713
+ "id": "4fe6b40b-6922-4de3-8d9e-dac7474b6303",
1714
+ "metadata": {},
1715
+ "outputs": [
1716
+ {
1717
+ "ename": "",
1718
+ "evalue": "",
1719
+ "output_type": "error",
1720
+ "traceback": [
1721
+ "\u001b[1;31mFailed to start the Kernel. \n",
1722
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1723
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1724
+ ]
1725
+ }
1726
+ ],
1727
+ "source": [
1728
+ "df_dominant_topic[\"Dominant_Topic\"].value_counts()"
1729
+ ]
1730
+ },
1731
+ {
1732
+ "cell_type": "code",
1733
+ "execution_count": null,
1734
+ "id": "b9917340-31cf-48af-871f-b481128fdf22",
1735
+ "metadata": {},
1736
+ "outputs": [
1737
+ {
1738
+ "ename": "",
1739
+ "evalue": "",
1740
+ "output_type": "error",
1741
+ "traceback": [
1742
+ "\u001b[1;31mFailed to start the Kernel. \n",
1743
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1744
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1745
+ ]
1746
+ }
1747
+ ],
1748
+ "source": [
1749
+ "import matplotlib.pyplot as plt\n",
1750
+ "\n",
1751
+ "# Get value counts of each topic\n",
1752
+ "topic_counts = df_dominant_topic[\"Dominant_Topic\"].value_counts()\n",
1753
+ "\n",
1754
+ "# Create a bar plot\n",
1755
+ "plt.figure(figsize=(8, 6))\n",
1756
+ "topic_counts.plot(kind=\"bar\", color=\"skyblue\")\n",
1757
+ "\n",
1758
+ "# Add labels to the bars\n",
1759
+ "for i, count in enumerate(topic_counts):\n",
1760
+ " plt.text(i, count, str(count), ha=\"center\", va=\"bottom\")\n",
1761
+ "\n",
1762
+ "# Add labels and title\n",
1763
+ "plt.xlabel(\"Topics\")\n",
1764
+ "plt.ylabel(\"Number of News\")\n",
1765
+ "plt.title(\"Topic Distribution\")\n",
1766
+ "\n",
1767
+ "# Show the plot\n",
1768
+ "plt.xticks(rotation=45) # Rotate x-axis labels for better readability\n",
1769
+ "plt.tight_layout()\n",
1770
+ "plt.show()\n"
1771
+ ]
1772
+ },
1773
+ {
1774
+ "cell_type": "code",
1775
+ "execution_count": null,
1776
+ "id": "fffa1e57-f975-4469-a42b-19d76c60fb66",
1777
+ "metadata": {},
1778
+ "outputs": [
1779
+ {
1780
+ "ename": "",
1781
+ "evalue": "",
1782
+ "output_type": "error",
1783
+ "traceback": [
1784
+ "\u001b[1;31mFailed to start the Kernel. \n",
1785
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1786
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1787
+ ]
1788
+ }
1789
+ ],
1790
+ "source": [
1791
+ "df_dominant_topic.sort_values(by='Topic_Perc_Contrib', ascending=False).head(20)"
1792
+ ]
1793
+ },
1794
+ {
1795
+ "cell_type": "code",
1796
+ "execution_count": null,
1797
+ "id": "8510f506-141f-4382-b668-251df1afc95f",
1798
+ "metadata": {},
1799
+ "outputs": [
1800
+ {
1801
+ "ename": "",
1802
+ "evalue": "",
1803
+ "output_type": "error",
1804
+ "traceback": [
1805
+ "\u001b[1;31mFailed to start the Kernel. \n",
1806
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1807
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1808
+ ]
1809
+ }
1810
+ ],
1811
+ "source": [
1812
+ "# Sample 100 rows, can change the random_state for different samples\n",
1813
+ "sampled_data = df_dominant_topic.sample(n=100, random_state=42) \n",
1814
+ "sampled_df = pd.DataFrame(sampled_data).reset_index()\n",
1815
+ "sampled_df.to_csv('data/sample_severe.csv')"
1816
+ ]
1817
+ },
1818
+ {
1819
+ "cell_type": "code",
1820
+ "execution_count": null,
1821
+ "id": "16388596-a1d6-4509-acac-6dd57220554a",
1822
+ "metadata": {},
1823
+ "outputs": [
1824
+ {
1825
+ "ename": "",
1826
+ "evalue": "",
1827
+ "output_type": "error",
1828
+ "traceback": [
1829
+ "\u001b[1;31mFailed to start the Kernel. \n",
1830
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
1831
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1832
+ ]
1833
+ }
1834
+ ],
1835
+ "source": []
1836
+ }
1837
+ ],
1838
+ "metadata": {
1839
+ "kernelspec": {
1840
+ "display_name": "Python 3",
1841
+ "language": "python",
1842
+ "name": "python3"
1843
+ },
1844
+ "language_info": {
1845
+ "codemirror_mode": {
1846
+ "name": "ipython",
1847
+ "version": 3
1848
+ },
1849
+ "file_extension": ".py",
1850
+ "mimetype": "text/x-python",
1851
+ "name": "python",
1852
+ "nbconvert_exporter": "python",
1853
+ "pygments_lexer": "ipython3",
1854
+ "version": "3.12.4"
1855
+ }
1856
+ },
1857
+ "nbformat": 4,
1858
+ "nbformat_minor": 5
1859
+ }
notebooks/10_LLM_evaluation.ipynb ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "a64f4c1c",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "ename": "",
11
+ "evalue": "",
12
+ "output_type": "error",
13
+ "traceback": [
14
+ "\u001b[1;31mFailed to start the Kernel. \n",
15
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
16
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
17
+ ]
18
+ }
19
+ ],
20
+ "source": [
21
+ "%load_ext autoreload\n",
22
+ "%autoreload 2\n",
23
+ "\n",
24
+ "import os\n",
25
+ "import sys\n",
26
+ "from pathlib import Path\n",
27
+ "\n",
28
+ "workding_dir = str(Path.cwd().parent)\n",
29
+ "os.chdir(workding_dir)\n",
30
+ "sys.path.append(workding_dir)\n",
31
+ "print(\"workding dir:\", workding_dir)\n",
32
+ "\n",
33
+ "from dotenv import find_dotenv, load_dotenv\n",
34
+ "\n",
35
+ "found_dotenv = find_dotenv(\".env\")\n",
36
+ "\n",
37
+ "if len(found_dotenv) == 0:\n",
38
+ " found_dotenv = find_dotenv(\".env.example\")\n",
39
+ "print(f\"loading env vars from: {found_dotenv}\")\n",
40
+ "load_dotenv(found_dotenv, override=True)"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": null,
46
+ "id": "52701550-0bda-4fa4-959f-2b42eb42e140",
47
+ "metadata": {},
48
+ "outputs": [
49
+ {
50
+ "ename": "",
51
+ "evalue": "",
52
+ "output_type": "error",
53
+ "traceback": [
54
+ "\u001b[1;31mFailed to start the Kernel. \n",
55
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
56
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
57
+ ]
58
+ }
59
+ ],
60
+ "source": [
61
+ "import pandas as pd"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": null,
67
+ "id": "8cb2dd41-79e1-45bf-a0eb-ff9abc0baa8b",
68
+ "metadata": {},
69
+ "outputs": [
70
+ {
71
+ "ename": "",
72
+ "evalue": "",
73
+ "output_type": "error",
74
+ "traceback": [
75
+ "\u001b[1;31mFailed to start the Kernel. \n",
76
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
77
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
78
+ ]
79
+ }
80
+ ],
81
+ "source": [
82
+ "df = pd.read_csv(\"cleaned_data_with_categories.csv\")"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": null,
88
+ "id": "c4e79890-de47-4774-9445-142cff46cf86",
89
+ "metadata": {},
90
+ "outputs": [
91
+ {
92
+ "ename": "",
93
+ "evalue": "",
94
+ "output_type": "error",
95
+ "traceback": [
96
+ "\u001b[1;31mFailed to start the Kernel. \n",
97
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
98
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
99
+ ]
100
+ }
101
+ ],
102
+ "source": [
103
+ "df"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "id": "df3cdbd0-a8b4-4b6e-b5bf-5fdbb7ce93a3",
110
+ "metadata": {},
111
+ "outputs": [
112
+ {
113
+ "ename": "",
114
+ "evalue": "",
115
+ "output_type": "error",
116
+ "traceback": [
117
+ "\u001b[1;31mFailed to start the Kernel. \n",
118
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
119
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
120
+ ]
121
+ }
122
+ ],
123
+ "source": [
124
+ "df_sorted = df.groupby(\"Category\").count().sort_values(by='id', ascending = False)\n",
125
+ "df_sorted"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": null,
131
+ "id": "d3ddee2c-946c-4ee5-b0ba-31ac6d201261",
132
+ "metadata": {},
133
+ "outputs": [
134
+ {
135
+ "ename": "",
136
+ "evalue": "",
137
+ "output_type": "error",
138
+ "traceback": [
139
+ "\u001b[1;31mFailed to start the Kernel. \n",
140
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
141
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
142
+ ]
143
+ }
144
+ ],
145
+ "source": [
146
+ "# Function to determine the value for the new column\n",
147
+ "def categorize(value):\n",
148
+ " if '/' in str(value) or ',' in str(value):\n",
149
+ " return 'Miscellaneous Events'\n",
150
+ " else:\n",
151
+ " return value"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": null,
157
+ "id": "b470266e-00c1-4d4c-8738-28b4fde2dcc6",
158
+ "metadata": {},
159
+ "outputs": [
160
+ {
161
+ "ename": "",
162
+ "evalue": "",
163
+ "output_type": "error",
164
+ "traceback": [
165
+ "\u001b[1;31mFailed to start the Kernel. \n",
166
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
167
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
168
+ ]
169
+ }
170
+ ],
171
+ "source": [
172
+ "df['GPT Generated Result'] = df['Category'].apply(categorize)"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": null,
178
+ "id": "4d5d7300-a663-4394-979e-cf7c5d4f88d5",
179
+ "metadata": {},
180
+ "outputs": [
181
+ {
182
+ "ename": "",
183
+ "evalue": "",
184
+ "output_type": "error",
185
+ "traceback": [
186
+ "\u001b[1;31mFailed to start the Kernel. \n",
187
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
188
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
189
+ ]
190
+ }
191
+ ],
192
+ "source": [
193
+ "df_sorted1 = df.groupby(\"GPT Generated Result\").count().sort_values(by='id', ascending = False)\n",
194
+ "df_sorted1"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": null,
200
+ "id": "6dbe0ac2-4ac6-4d75-9fa7-c856b9370269",
201
+ "metadata": {},
202
+ "outputs": [
203
+ {
204
+ "ename": "",
205
+ "evalue": "",
206
+ "output_type": "error",
207
+ "traceback": [
208
+ "\u001b[1;31mFailed to start the Kernel. \n",
209
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
210
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
211
+ ]
212
+ }
213
+ ],
214
+ "source": [
215
+ "df.to_csv('result.csv')"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": null,
221
+ "id": "5dfe7870-c29c-4942-8301-f5e8b1bd9994",
222
+ "metadata": {},
223
+ "outputs": [
224
+ {
225
+ "ename": "",
226
+ "evalue": "",
227
+ "output_type": "error",
228
+ "traceback": [
229
+ "\u001b[1;31mFailed to start the Kernel. \n",
230
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
231
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
232
+ ]
233
+ }
234
+ ],
235
+ "source": []
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "execution_count": null,
240
+ "id": "dc6007d3-c6bd-4bc3-8ccc-099440354ce8",
241
+ "metadata": {},
242
+ "outputs": [
243
+ {
244
+ "ename": "",
245
+ "evalue": "",
246
+ "output_type": "error",
247
+ "traceback": [
248
+ "\u001b[1;31mFailed to start the Kernel. \n",
249
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
250
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
251
+ ]
252
+ }
253
+ ],
254
+ "source": [
255
+ "eva = pd.read_csv('evaluation_result.csv')\n",
256
+ "eva"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "execution_count": null,
262
+ "id": "a98d47b7-7ab1-4947-a10a-a36fa303dcc1",
263
+ "metadata": {},
264
+ "outputs": [
265
+ {
266
+ "ename": "",
267
+ "evalue": "",
268
+ "output_type": "error",
269
+ "traceback": [
270
+ "\u001b[1;31mFailed to start the Kernel. \n",
271
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
272
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
273
+ ]
274
+ }
275
+ ],
276
+ "source": [
277
+ "eva['Result_GPT_True_Count'] = eva['Result_GPT'].astype(int) # Convert boolean values to integers\n",
278
+ "result = eva.groupby(\"GPT Generated Result\")['Result_GPT_True_Count'].sum()\n",
279
+ "\n",
280
+ "result_gpt = result.sort_values(ascending=False)\n",
281
+ "result_gpt"
282
+ ]
283
+ },
284
+ {
285
+ "cell_type": "code",
286
+ "execution_count": null,
287
+ "id": "ec0defea-c4ad-4f97-9704-23ef83f73ff7",
288
+ "metadata": {},
289
+ "outputs": [
290
+ {
291
+ "ename": "",
292
+ "evalue": "",
293
+ "output_type": "error",
294
+ "traceback": [
295
+ "\u001b[1;31mFailed to start the Kernel. \n",
296
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
297
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
298
+ ]
299
+ }
300
+ ],
301
+ "source": [
302
+ "test = eva.groupby(\"Category_GoldenResult\").count().sort_values(by='id', ascending = False)\n",
303
+ "test"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "execution_count": null,
309
+ "id": "6bc57f6b-c554-4634-8a98-45d82546d6f8",
310
+ "metadata": {},
311
+ "outputs": [
312
+ {
313
+ "ename": "",
314
+ "evalue": "",
315
+ "output_type": "error",
316
+ "traceback": [
317
+ "\u001b[1;31mFailed to start the Kernel. \n",
318
+ "\u001b[1;31mUnable to start Kernel 'maritime (Python 3.12.4)' due to a timeout waiting for the ports to get used. \n",
319
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
320
+ ]
321
+ }
322
+ ],
323
+ "source": [
324
+ "eva['Result_Golden_True_Count'] = eva['Result_Golden'].astype(int) # Convert boolean values to integers\n",
325
+ "result = eva.groupby(\"Category_GoldenResult\")['Result_Golden_True_Count'].sum()\n",
326
+ "\n",
327
+ "# If you want to sort the result by the count in descending order:\n",
328
+ "result_golden = result.sort_values(ascending=False)\n",
329
+ "\n",
330
+ "result_golden"
331
+ ]
332
+ }
333
+ ],
334
+ "metadata": {
335
+ "kernelspec": {
336
+ "display_name": "Python 3",
337
+ "language": "python",
338
+ "name": "python3"
339
+ },
340
+ "language_info": {
341
+ "codemirror_mode": {
342
+ "name": "ipython",
343
+ "version": 3
344
+ },
345
+ "file_extension": ".py",
346
+ "mimetype": "text/x-python",
347
+ "name": "python",
348
+ "nbconvert_exporter": "python",
349
+ "pygments_lexer": "ipython3",
350
+ "version": "3.12.4"
351
+ }
352
+ },
353
+ "nbformat": 4,
354
+ "nbformat_minor": 5
355
+ }
notebooks/11_Named_Entity_Recognition.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/12_Pipeline.ipynb ADDED
@@ -0,0 +1,598 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents\n",
13
+ "loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env\n"
14
+ ]
15
+ },
16
+ {
17
+ "data": {
18
+ "text/plain": [
19
+ "True"
20
+ ]
21
+ },
22
+ "execution_count": 1,
23
+ "metadata": {},
24
+ "output_type": "execute_result"
25
+ }
26
+ ],
27
+ "source": [
28
+ "%load_ext autoreload\n",
29
+ "%autoreload 2\n",
30
+ "\n",
31
+ "import os\n",
32
+ "import sys\n",
33
+ "from pathlib import Path\n",
34
+ "\n",
35
+ "workding_dir = str(Path.cwd().parent)\n",
36
+ "os.chdir(workding_dir)\n",
37
+ "sys.path.append(workding_dir)\n",
38
+ "print(\"workding dir:\", workding_dir)\n",
39
+ "\n",
40
+ "from dotenv import find_dotenv, load_dotenv\n",
41
+ "\n",
42
+ "found_dotenv = find_dotenv(\".env\")\n",
43
+ "\n",
44
+ "if len(found_dotenv) == 0:\n",
45
+ " found_dotenv = find_dotenv(\".env.example\")\n",
46
+ "print(f\"loading env vars from: {found_dotenv}\")\n",
47
+ "load_dotenv(found_dotenv, override=True)"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "markdown",
52
+ "metadata": {},
53
+ "source": [
54
+ "### This notebook demonstrates the methodology pipeline. Please use our newly designed database (new Excel file attached)."
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "markdown",
59
+ "metadata": {},
60
+ "source": [
61
+ "#### When a user submits a new news article link, the following code extracts the headline, publication date, and content"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 2,
67
+ "metadata": {},
68
+ "outputs": [
69
+ {
70
+ "name": "stdout",
71
+ "output_type": "stream",
72
+ "text": [
73
+ "Headline: Singapore Airlines stops using Iranian airspace as 'precautionary measure' amid Middle East tensions\n",
74
+ "Publication Date: 2024-04-14 08:58:00\n",
75
+ "Content: Advertisement Singapore Singapore Airlines stops using Iranian airspace as 'precautionary measure' amid Middle East tensions Singapore Airlines says it is closely monitoring the situation in the Middle East. (File photo: REUTERS/Edgar Su) New: You can now listen to articles. This audio is generated by an AI tool. 14 Apr 2024 08:58AM (Updated: 14 Apr 2024 07:26PM) Bookmark Bookmark Share WhatsApp Telegram Facebook Twitter Email LinkedIn SINGAPORE: Singapore Airlines (SIA) said on Sunday (Apr 14) \n"
76
+ ]
77
+ }
78
+ ],
79
+ "source": [
80
+ "import requests\n",
81
+ "from bs4 import BeautifulSoup\n",
82
+ "from datetime import datetime\n",
83
+ "\n",
84
+ "\n",
85
+ "def get_article_details(article_url):\n",
86
+ " response = requests.get(article_url)\n",
87
+ " soup = BeautifulSoup(response.content, \"html.parser\")\n",
88
+ "\n",
89
+ " # Extract headline\n",
90
+ " headline_tag = soup.find(\"h1\")\n",
91
+ " headline = (\n",
92
+ " headline_tag.get_text(strip=True) if headline_tag else \"No headline found\"\n",
93
+ " )\n",
94
+ "\n",
95
+ " # Attempt to extract publication date with error handling\n",
96
+ " date_container = soup.find(\"div\", class_=\"article-publish\")\n",
97
+ " if date_container:\n",
98
+ " # Extract the text and handle cleaning it up\n",
99
+ " date_text = date_container.get_text(strip=True)\n",
100
+ " # Extract the first date assuming it's the publication date (before \"Updated:\")\n",
101
+ " publication_date_text = date_text.split(\"(Updated:\")[0].strip()\n",
102
+ " try:\n",
103
+ " publication_date = datetime.strptime(\n",
104
+ " publication_date_text, \"%d %b %Y %I:%M%p\"\n",
105
+ " ).strftime(\"%Y-%m-%d %H:%M:%S\")\n",
106
+ " except ValueError:\n",
107
+ " publication_date = \"No publication date found\"\n",
108
+ " else:\n",
109
+ " publication_date = \"No publication date found\"\n",
110
+ "\n",
111
+ " # Extract main content of the article\n",
112
+ " article_body = soup.find(\"article\")\n",
113
+ " if not article_body:\n",
114
+ " article_body = soup\n",
115
+ " article_text = (\n",
116
+ " article_body.get_text(separator=\" \", strip=True)\n",
117
+ " if article_body\n",
118
+ " else \"No article content found\"\n",
119
+ " )\n",
120
+ "\n",
121
+ " return headline, publication_date, article_text\n",
122
+ "\n",
123
+ "\n",
124
+ "# Example usage\n",
125
+ "article_url = \"https://www.channelnewsasia.com/singapore/singapore-airlines-stops-using-iran-airspace-israel-hamas-war-middle-east-escalation-4264011\"\n",
126
+ "headline, publication_date, article_content = get_article_details(article_url)\n",
127
+ "print(\"Headline:\", headline)\n",
128
+ "print(\"Publication Date:\", publication_date)\n",
129
+ "print(\"Content:\", article_content[:500]) # Print the first 500 characters to check"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "markdown",
134
+ "metadata": {},
135
+ "source": [
136
+ "#### Here is the code for summarizing the article (please use your own OpenAI API key)"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 3,
142
+ "metadata": {},
143
+ "outputs": [
144
+ {
145
+ "name": "stdout",
146
+ "output_type": "stream",
147
+ "text": [
148
+ "Summary: Singapore Airlines has stopped using Iranian airspace as a precautionary measure due to the escalating tensions in the Middle East. This move was mirrored by other airlines, including Lufthansa and Austrian Airlines, who also suspended flights to and from Tehran. The situation has also led to economic consequences, as flights\n"
149
+ ]
150
+ }
151
+ ],
152
+ "source": [
153
+ "import openai\n",
154
+ "\n",
155
+ "\n",
156
+ "def summarize_article(article_content):\n",
157
+ " try:\n",
158
+ " # Generating the prompt for GPT-3\n",
159
+ " prompt_text = (\n",
160
+ " \"Summarize the following article in about 70 words, focusing on \"\n",
161
+ " \"what happened, where it happened, and the consequences (economic loss, environmental impact, etc.):\\n\\n\"\n",
162
+ " f\"{article_content}\"\n",
163
+ " )\n",
164
+ "\n",
165
+ " # Call to OpenAI's Completion API\n",
166
+ " response = openai.Completion.create(\n",
167
+ " engine=\"gpt-3.5-turbo-instruct\",\n",
168
+ " prompt=prompt_text,\n",
169
+ " temperature=0.5,\n",
170
+ " max_tokens=60, # Adjust as needed to fit the summary length\n",
171
+ " top_p=1,\n",
172
+ " frequency_penalty=0,\n",
173
+ " presence_penalty=0,\n",
174
+ " )\n",
175
+ "\n",
176
+ " # Extracting the text from the response\n",
177
+ " summary = response.choices[0].text.strip()\n",
178
+ " return summary\n",
179
+ " except Exception as e:\n",
180
+ " print(f\"An error occurred: {e}\")\n",
181
+ " return \"\"\n",
182
+ "\n",
183
+ "\n",
184
+ "summary = summarize_article(article_content)\n",
185
+ "print(\"Summary:\", summary)"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": 4,
191
+ "metadata": {},
192
+ "outputs": [],
193
+ "source": [
194
+ "# unique_categories = df['Category'].unique()\n",
195
+ "# print(unique_categories)"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": 5,
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": [
204
+ "import openai\n",
205
+ "\n",
206
+ "\n",
207
+ "def classify_article(article_content):\n",
208
+ " prompt = f\"\"\"Read the following article and classify its content into one of these categories: 'Aviation Advisory',\n",
209
+ "'Bombing',\n",
210
+ "'Cargo Disruption',\n",
211
+ "'Warehouse Theft',\n",
212
+ "'Chemical Spill',\n",
213
+ "'Injury',\n",
214
+ "'Earthquake',\n",
215
+ "'Flooding',\n",
216
+ "'Ground Transportation Advisory',\n",
217
+ "'Hazmat Response',\n",
218
+ "'Ice Storm',\n",
219
+ "'Individuals in Focus',\n",
220
+ "'Industrial Action',\n",
221
+ "'Maritime Accident',\n",
222
+ "'Maritime Accident.Ground Transportation Advisory',\n",
223
+ "'Maritime Advisory',\n",
224
+ "'Mine Workers Strike',\n",
225
+ "'Miscellaneous Events',\n",
226
+ "'Miscellaneous Strikes',\n",
227
+ "'Network Disruption',\n",
228
+ "'Non-industrial Fire',\n",
229
+ "'Police Operations',\n",
230
+ "'Port Closure',\n",
231
+ "'Port Congestion',\n",
232
+ "'Port Disruption',\n",
233
+ "'Power Outage',\n",
234
+ "'Production Halt',\n",
235
+ "'Protest',\n",
236
+ "'Public Safety ,\n",
237
+ "'Public Transportation Disruption',\n",
238
+ "'Roadway Closuren',\n",
239
+ "'Severe Winds',\n",
240
+ "'Storm',\n",
241
+ "'Tornado',\n",
242
+ "'Train Delays,\n",
243
+ "'Travel Warning',\n",
244
+ "'Tropical Cyclone Storm',\n",
245
+ "'Typhoon',\n",
246
+ "'Vehicle Accident',\n",
247
+ "'Weather Advisory',\n",
248
+ "'Workplace Accident'\n",
249
+ "Summary: {article_content}\n",
250
+ "Category:\"\"\"\n",
251
+ "\n",
252
+ " response = openai.Completion.create(\n",
253
+ " engine=\"gpt-3.5-turbo-instruct\", # Adjust according to the latest available and appropriate model\n",
254
+ " prompt=prompt,\n",
255
+ " temperature=0.7,\n",
256
+ " max_tokens=60, # Adjust based on your needs\n",
257
+ " top_p=1.0,\n",
258
+ " frequency_penalty=0,\n",
259
+ " presence_penalty=0,\n",
260
+ " stop=[\"\\n\"], # Stop generating further when a newline character is encountered\n",
261
+ " )\n",
262
+ " category = response.choices[0].text.strip()\n",
263
+ " return category"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": 6,
269
+ "metadata": {},
270
+ "outputs": [],
271
+ "source": [
272
+ "import requests\n",
273
+ "from bs4 import BeautifulSoup\n",
274
+ "\n",
275
+ "\n",
276
+ "def fetch_article_content(url):\n",
277
+ " response = requests.get(url)\n",
278
+ " soup = BeautifulSoup(response.content, \"html.parser\")\n",
279
+ " article_text = \" \".join([p.text for p in soup.find_all(\"p\")])\n",
280
+ " return article_text"
281
+ ]
282
+ },
283
+ {
284
+ "cell_type": "code",
285
+ "execution_count": 7,
286
+ "metadata": {},
287
+ "outputs": [
288
+ {
289
+ "name": "stdout",
290
+ "output_type": "stream",
291
+ "text": [
292
+ "Category: Aviation Advisory\n"
293
+ ]
294
+ }
295
+ ],
296
+ "source": [
297
+ "import openai\n",
298
+ "\n",
299
+ "\n",
300
+ "def classify_article(url):\n",
301
+ " # Fetch article content\n",
302
+ " article_content = fetch_article_content(url)\n",
303
+ "\n",
304
+ " # Construct the classification prompt\n",
305
+ " prompt = f\"\"\"Read the following article and classify its content into one of these categories:\n",
306
+ " 'Aviation Advisory', 'Bombing',\n",
307
+ "'Cargo Disruption',\n",
308
+ "'Chemical Spill',\n",
309
+ "'Death',\n",
310
+ "'Earthquake',\n",
311
+ "'Flooding',\n",
312
+ "'Ground Transportation Advisory',\n",
313
+ "'Hazmat Response',\n",
314
+ "'Ice Storm',\n",
315
+ "'Individuals in Focus',\n",
316
+ "'Industrial Action',\n",
317
+ "'Maritime Accident',\n",
318
+ "'Maritime Accident.Ground Transportation Advisory',\n",
319
+ "'Maritime Advisory',\n",
320
+ "'Mine Workers Strike',\n",
321
+ "'Miscellaneous Events',\n",
322
+ "'Miscellaneous Strikes',\n",
323
+ "'Network Disruption',\n",
324
+ "'Non-industrial Fire',\n",
325
+ "'Police Operations',\n",
326
+ "'Port Closure',\n",
327
+ "'Port Congestion',\n",
328
+ "'Port Disruption',\n",
329
+ "'Power Outage',\n",
330
+ "'Production Halt',\n",
331
+ "'Protest / Riot',\n",
332
+ "'Public Safety / Security',\n",
333
+ "'Public Transportation Disruption',\n",
334
+ "'Roadway Closure / Disruption',\n",
335
+ "'Severe Winds',\n",
336
+ "'Storm',\n",
337
+ "'Tornado',\n",
338
+ "'Train Delays / Disruption',\n",
339
+ "'Travel Warning',\n",
340
+ "'Tropical Cyclone / Storm',\n",
341
+ "'Typhoon',\n",
342
+ "'Vehicle Accident',\n",
343
+ "'Weather Advisory',\n",
344
+ "'Workplace Accident', ...\n",
345
+ " \n",
346
+ " Article:\n",
347
+ " {article_content}\n",
348
+ " \n",
349
+ " Category:\"\"\"\n",
350
+ "\n",
351
+ " # Classify using OpenAI GPT-3\n",
352
+ " response = openai.Completion.create(\n",
353
+ " engine=\"gpt-3.5-turbo-instruct\", # Ensure using a correct and non-deprecated model\n",
354
+ " prompt=prompt,\n",
355
+ " temperature=0.7,\n",
356
+ " max_tokens=60,\n",
357
+ " top_p=1.0,\n",
358
+ " frequency_penalty=0,\n",
359
+ " presence_penalty=0,\n",
360
+ " stop=[\"\\n\"], # Stop generating further when a newline character is encountered\n",
361
+ " )\n",
362
+ " category = response.choices[0].text.strip()\n",
363
+ " return category\n",
364
+ "\n",
365
+ "\n",
366
+ "# Example usage\n",
367
+ "url = \"https://www.channelnewsasia.com/singapore/singapore-airlines-stops-using-iran-airspace-israel-hamas-war-middle-east-escalation-4264011\"\n",
368
+ "category = classify_article(url)\n",
369
+ "print(\"Category:\", category)"
370
+ ]
371
+ },
372
+ {
373
+ "cell_type": "markdown",
374
+ "metadata": {},
375
+ "source": [
376
+ "#### Now you have the classification result, which is quite accurate :))"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": 8,
382
+ "metadata": {},
383
+ "outputs": [
384
+ {
385
+ "name": "stdout",
386
+ "output_type": "stream",
387
+ "text": [
388
+ "Singapore Airlines stops using Iranian airspace as 'precautionary measure' amid Middle East tensions\n"
389
+ ]
390
+ }
391
+ ],
392
+ "source": [
393
+ "print(headline)"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "markdown",
398
+ "metadata": {},
399
+ "source": [
400
+ "#### Update our database!"
401
+ ]
402
+ },
403
+ {
404
+ "cell_type": "code",
405
+ "execution_count": 9,
406
+ "metadata": {},
407
+ "outputs": [
408
+ {
409
+ "name": "stdout",
410
+ "output_type": "stream",
411
+ "text": [
412
+ "Database updated successfully with ID 1.\n"
413
+ ]
414
+ }
415
+ ],
416
+ "source": [
417
+ "import pandas as pd\n",
418
+ "from datetime import datetime\n",
419
+ "\n",
420
+ "\n",
421
+ "def update_database(file_path, url):\n",
422
+ " # Fetch details from the article\n",
423
+ " headline, publication_date, article_content = get_article_details(article_url)\n",
424
+ " summary = summarize_article(article_content)\n",
425
+ " category = classify_article(url)\n",
426
+ "\n",
427
+ " new_data = {\n",
428
+ " \"Headline\": headline,\n",
429
+ " \"Summary\": summary,\n",
430
+ " \"Category\": category,\n",
431
+ " \"Datetime\": publication_date,\n",
432
+ " \"URL\": article_url,\n",
433
+ " }\n",
434
+ "\n",
435
+ " # Load the existing data from the CSV file\n",
436
+ " try:\n",
437
+ " df = pd.read_csv(file_path)\n",
438
+ " except FileNotFoundError:\n",
439
+ " # If the file does not exist, create a new DataFrame\n",
440
+ " df = pd.DataFrame(\n",
441
+ " columns=[\"id\", \"Headline\", \"Summary\", \"Category\", \"Datetime\", \"URL\"]\n",
442
+ " )\n",
443
+ " new_id = 1 # Start with ID 1 if no file exists\n",
444
+ " else:\n",
445
+ " # If IDs exist, increment from the last used ID\n",
446
+ " new_id = df[\"id\"].max() + 1 if not df.empty else 1\n",
447
+ "\n",
448
+ " # Prepare the new data entry\n",
449
+ " new_entry = pd.DataFrame(\n",
450
+ " {\n",
451
+ " \"id\": [new_id],\n",
452
+ " \"Headline\": [new_data[\"Headline\"]],\n",
453
+ " \"Summary\": [new_data[\"Summary\"]],\n",
454
+ " \"Category\": [new_data[\"Category\"]],\n",
455
+ " \"Datetime\": [new_data[\"Datetime\"]],\n",
456
+ " \"URL\": [new_data[\"URL\"]],\n",
457
+ " }\n",
458
+ " )\n",
459
+ "\n",
460
+ " # Append the new data entry to the DataFrame using concat\n",
461
+ " df = pd.concat([df, new_entry], ignore_index=True)\n",
462
+ "\n",
463
+ " # Save the updated DataFrame back to CSV\n",
464
+ " df.to_csv(file_path, index=False)\n",
465
+ " print(f\"Database updated successfully with ID {new_id}.\")\n",
466
+ "\n",
467
+ "\n",
468
+ "# Example usage\n",
469
+ "url = \"https://www.channelnewsasia.com/singapore/singapore-airlines-stops-using-iran-airspace-israel-hamas-war-middle-east-escalation-4264011\"\n",
470
+ "file_path = \"cleaned_data1.csv\"\n",
471
+ "update_database(file_path, url)"
472
+ ]
473
+ },
474
+ {
475
+ "cell_type": "code",
476
+ "execution_count": 10,
477
+ "metadata": {},
478
+ "outputs": [
479
+ {
480
+ "name": "stdout",
481
+ "output_type": "stream",
482
+ "text": [
483
+ " id Headline \\\n",
484
+ "0 1 Singapore Airlines stops using Iranian airspac... \n",
485
+ "\n",
486
+ " Summary Category \\\n",
487
+ "0 Singapore Airlines has stopped using Iranian a... Aviation Advisory \n",
488
+ "\n",
489
+ " Datetime URL \n",
490
+ "0 2024-04-14 08:58:00 https://www.channelnewsasia.com/singapore/sing... \n"
491
+ ]
492
+ }
493
+ ],
494
+ "source": [
495
+ "import pandas as pd\n",
496
+ "\n",
497
+ "\n",
498
+ "def rank_related_articles(file_path, category):\n",
499
+ " # Load the existing data from the CSV file\n",
500
+ " try:\n",
501
+ " df = pd.read_csv(file_path)\n",
502
+ " except FileNotFoundError:\n",
503
+ " print(\"Database file not found.\")\n",
504
+ " return\n",
505
+ "\n",
506
+ " # Filter articles by the specified category\n",
507
+ " filtered_df = df[df[\"Category\"] == category]\n",
508
+ "\n",
509
+ " # Convert 'Datetime' from string to datetime objects for accurate sorting\n",
510
+ " filtered_df[\"Datetime\"] = pd.to_datetime(filtered_df[\"Datetime\"])\n",
511
+ "\n",
512
+ " # Sort articles by 'Datetime' in descending order to get the most recent articles first\n",
513
+ " sorted_df = filtered_df.sort_values(by=\"Datetime\", ascending=False)\n",
514
+ "\n",
515
+ " # Display the sorted DataFrame\n",
516
+ " print(sorted_df[[\"id\", \"Headline\", \"Summary\", \"Category\", \"Datetime\", \"URL\"]])\n",
517
+ " return sorted_df\n",
518
+ "\n",
519
+ "\n",
520
+ "# Example usage\n",
521
+ "file_path = \"cleaned_data1.csv\"\n",
522
+ "category = \"Aviation Advisory\"\n",
523
+ "ranked_articles = rank_related_articles(file_path, category)"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "code",
528
+ "execution_count": 13,
529
+ "metadata": {},
530
+ "outputs": [
531
+ {
532
+ "name": "stdout",
533
+ "output_type": "stream",
534
+ "text": [
535
+ "+----+------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------+\n",
536
+ "| id | Headline | Summary | Category | Datetime | URL |\n",
537
+ "+----+------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------+\n",
538
+ "| 1 | Singapore Airlines stops using Iranian airspace as 'precautionary measure' amid Middle East tensions | Singapore Airlines has stopped using Iranian airspace as a precautionary measure amid the escalating tensions in the Middle East. This decision was made after Iran launched over 200 drones and missiles at Israel, following an Israeli strike on an Iranian building in Syria. Other airlines, such as Lufthansa and Austrian | Aviation Advisory | 2024-04-14 08:58:00 | https://www.channelnewsasia.com/singapore/singapore-airlines-stops-using-iran-airspace-israel-hamas-war-middle-east-escalation-4264011 |\n",
539
+ "+----+------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------+\n"
540
+ ]
541
+ }
542
+ ],
543
+ "source": [
544
+ "import pandas as pd\n",
545
+ "from tabulate import tabulate\n",
546
+ "\n",
547
+ "\n",
548
+ "def print_ranked_articles_tabulate(file_path, category):\n",
549
+ " try:\n",
550
+ " df = pd.read_csv(file_path)\n",
551
+ " df[\"Datetime\"] = pd.to_datetime(df[\"Datetime\"])\n",
552
+ " filtered_df = df[df[\"Category\"] == category]\n",
553
+ " sorted_df = filtered_df.sort_values(by=\"Datetime\", ascending=False)\n",
554
+ "\n",
555
+ " # Print DataFrame using tabulate\n",
556
+ " print(tabulate(sorted_df, headers=\"keys\", tablefmt=\"pretty\", showindex=False))\n",
557
+ " except FileNotFoundError:\n",
558
+ " print(\"Database file not found.\")\n",
559
+ "\n",
560
+ "\n",
561
+ "# Example usage\n",
562
+ "file_path = \"cleaned_data1.csv\"\n",
563
+ "category = \"Aviation Advisory\"\n",
564
+ "print_ranked_articles_tabulate(file_path, category)"
565
+ ]
566
+ },
567
+ {
568
+ "cell_type": "code",
569
+ "execution_count": 14,
570
+ "metadata": {},
571
+ "outputs": [],
572
+ "source": [
573
+ "!rm cleaned_data1.csv"
574
+ ]
575
+ }
576
+ ],
577
+ "metadata": {
578
+ "kernelspec": {
579
+ "display_name": "Python 3",
580
+ "language": "python",
581
+ "name": "python3"
582
+ },
583
+ "language_info": {
584
+ "codemirror_mode": {
585
+ "name": "ipython",
586
+ "version": 3
587
+ },
588
+ "file_extension": ".py",
589
+ "mimetype": "text/x-python",
590
+ "name": "python",
591
+ "nbconvert_exporter": "python",
592
+ "pygments_lexer": "ipython3",
593
+ "version": "3.12.4"
594
+ }
595
+ },
596
+ "nbformat": 4,
597
+ "nbformat_minor": 2
598
+ }
requirements.txt CHANGED
@@ -1 +1,25 @@
1
- huggingface_hub==0.22.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ huggingface_hub==0.22.2
2
+ pandas
3
+ jupyter
4
+ ipywidgets
5
+ nltk
6
+ openpyxl
7
+ scikit-learn
8
+ matplotlib
9
+ python-dotenv
10
+ openai==0.28
11
+ spacy
12
+ wordcloud
13
+ contractions
14
+ gensim
15
+ scipy==1.12.0
16
+ seaborn
17
+ pyarrow
18
+ pyLDAvis
19
+ tabulate
20
+ black
21
+ black[jupyter]
22
+ lxml_html_clean
23
+ newspaper3k
24
+ gnews
25
+