diff --git "a/notebooks/05c_newsScraper_clearning.ipynb" "b/notebooks/05c_newsScraper_clearning.ipynb" new file mode 100644--- /dev/null +++ "b/notebooks/05c_newsScraper_clearning.ipynb" @@ -0,0 +1,1931 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3ddf645f", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "42801c6a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "workding dir: d:\\code\\projects\\global-incidents\n", + "loading env vars from: d:\\code\\projects\\global-incidents\\.env.example\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "import sys\n", + "from pathlib import Path\n", + "\n", + "if \"workding_dir\" not in globals():\n", + " workding_dir = str(Path.cwd().parent)\n", + "\n", + "os.chdir(workding_dir)\n", + "sys.path.append(workding_dir)\n", + "print(\"workding dir:\", workding_dir)\n", + "\n", + "from dotenv import find_dotenv, load_dotenv\n", + "\n", + "found_dotenv = find_dotenv(\".env\")\n", + "\n", + "if len(found_dotenv) == 0:\n", + " found_dotenv = find_dotenv(\".env.example\")\n", + "print(f\"loading env vars from: {found_dotenv}\")\n", + "load_dotenv(found_dotenv, override=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ddf1e32e-7751-43db-9b5a-22cb08e35c6c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"data/scrapped_data2.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a9c6ed07-a7d0-4aaa-b571-038919c75e05", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | id | \n", + "Headline | \n", + "Details | \n", + "Severity | \n", + "Category | \n", + "Region | \n", + "Datetime | \n", + "Year | \n", + "lat | \n", + "lon | \n", + "maritime_label | \n", + "found_ports | \n", + "contains_port_info | \n", + "if_labeled | \n", + "Month | \n", + "Week | \n", + "Headline_Details | \n", + "url | \n", + "title | \n", + "content | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "1 | \n", + "Grasberg Mine- Grasberg mine workers extend st... | \n", + "Media sources indicate that workers at the Gra... | \n", + "Moderate | \n", + "Mine Workers Strike | \n", + "Indonesia | \n", + "28/5/17 17:08 | \n", + "2017.0 | \n", + "-4.05608 | \n", + "137.11302 | \n", + "False | \n", + "['freeport'] | \n", + "1.0 | \n", + "False | \n", + "5.0 | \n", + "21.0 | \n", + "Grasberg Mine- Grasberg mine workers extend st... | \n", + "https://news.google.com/rss/articles/CBMiZ2h0d... | \n", + "Freeport Indonesia mine workers extend strike ... | \n", + "Trucks are seen on a road in the Grasberg copp... | \n", + "
1 | \n", + "2 | \n", + "Indonesia: Undersea internet cables damaged by... | \n", + "News sources are stating that recent typhoons ... | \n", + "Minor | \n", + "Travel Warning | \n", + "Indonesia | \n", + "4/9/17 14:30 | \n", + "2017.0 | \n", + "NaN | \n", + "NaN | \n", + "False | \n", + "['hong kong'] | \n", + "1.0 | \n", + "False | \n", + "4.0 | \n", + "14.0 | \n", + "Indonesia: Undersea internet cables damaged by... | \n", + "no url found | \n", + "no title found | \n", + "no content found | \n", + "
2 | \n", + "3 | \n", + "Shanghai port congestion impacts terminals in ... | \n", + "The persisting port congestion at Shanghai’s Y... | \n", + "Minor | \n", + "Port Congestion | \n", + "China | \n", + "27/4/17 9:16 | \n", + "2017.0 | \n", + "29.52000 | \n", + "121.33190 | \n", + "True | \n", + "['ningbo', 'qingdao', 'shanghai'] | \n", + "1.0 | \n", + "False | \n", + "4.0 | \n", + "17.0 | \n", + "Shanghai port congestion impacts terminals in ... | \n", + "https://news.google.com/rss/articles/CBMiVWh0d... | \n", + "Typhoon Muifa to shut China ports for second t... | \n", + "By Sam Whelan 13/09/2022\\n\\nAnother typhoon ha... | \n", + "
3 | \n", + "4 | \n", + "UPDATE - Indonesia: Explosion at KP Terminal i... | \n", + "Updated local media sources from Jakarta indic... | \n", + "Extreme | \n", + "Bombing, Police Operations | \n", + "Indonesia | \n", + "24/5/17 15:15 | \n", + "2017.0 | \n", + "-6.22465 | \n", + "106.86700 | \n", + "True | \n", + "['jakarta'] | \n", + "1.0 | \n", + "False | \n", + "5.0 | \n", + "21.0 | \n", + "UPDATE - Indonesia: Explosion at KP Terminal i... | \n", + "no url found | \n", + "no title found | \n", + "no content found | \n", + "
4 | \n", + "5 | \n", + "UPDATE - Indonesia: Police confirm two explosi... | \n", + "According to local police in Jakarta, two expl... | \n", + "Extreme | \n", + "Bombing, Police Operations | \n", + "Indonesia | \n", + "24/5/17 16:20 | \n", + "2017.0 | \n", + "NaN | \n", + "NaN | \n", + "True | \n", + "['jakarta'] | \n", + "1.0 | \n", + "True | \n", + "5.0 | \n", + "21.0 | \n", + "UPDATE - Indonesia: Police confirm two explosi... | \n", + "https://news.google.com/rss/articles/CBMiZWh0d... | \n", + "Jakarta Police Receive 2 More Reports on Coldp... | \n", + "TEMPO.CO, Jakarta - South Jakarta Metro Police... | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
5705 | \n", + "5776 | \n", + "Winter storm may bring inclement weather condi... | \n", + "Intelligence received by Everstream Analytics ... | \n", + "Moderate | \n", + "Ice Storm | \n", + "United States | \n", + "16/12/20 2:47 | \n", + "2020.0 | \n", + "38.90072 | \n", + "-77.05440 | \n", + "False | \n", + "['new york'] | \n", + "1.0 | \n", + "False | \n", + "12.0 | \n", + "51.0 | \n", + "Winter storm may bring inclement weather condi... | \n", + "https://news.google.com/rss/articles/CBMia2h0d... | \n", + "Big storm to dump heavy rain and snow along Ea... | \n", + "A sprawling winter storm known as a nor’easter... | \n", + "
5706 | \n", + "5777 | \n", + "Winter weather expected to continue to impact ... | \n", + "Meteorological sources indicate that a series ... | \n", + "Minor | \n", + "Roadway Closure / Disruption, Ground Transport... | \n", + "United States | \n", + "31/12/20 18:15 | \n", + "2020.0 | \n", + "41.30357 | \n", + "-72.90561 | \n", + "False | \n", + "['new york', 'virginia'] | \n", + "1.0 | \n", + "True | \n", + "12.0 | \n", + "53.0 | \n", + "Winter weather expected to continue to impact ... | \n", + "https://news.google.com/rss/articles/CBMiZ2h0d... | \n", + "Cross-country storm brings severe weather thre... | \n", + "A powerful cross-country storm that impacted t... | \n", + "
5707 | \n", + "5778 | \n", + "Workers of Svitzer Australia plan to strike on... | \n", + "Industry sources report on December 7 that Svi... | \n", + "Moderate | \n", + "Industrial Action | \n", + "Australia | \n", + "7/12/20 6:16 | \n", + "2020.0 | \n", + "-38.35169 | \n", + "145.25050 | \n", + "False | \n", + "['geelong', 'melbourne'] | \n", + "1.0 | \n", + "False | \n", + "7.0 | \n", + "28.0 | \n", + "Workers of Svitzer Australia plan to strike on... | \n", + "no url found | \n", + "no title found | \n", + "no content found | \n", + "
5708 | \n", + "5779 | \n", + "Workers stage 24-hour stoppage at DP World Ter... | \n", + "Industry sources indicate on December 14 that ... | \n", + "Moderate | \n", + "Port Strike | \n", + "Australia | \n", + "14/12/20 16:52 | \n", + "2020.0 | \n", + "NaN | \n", + "NaN | \n", + "True | \n", + "['fremantle'] | \n", + "1.0 | \n", + "True | \n", + "12.0 | \n", + "51.0 | \n", + "Workers stage 24-hour stoppage at DP World Ter... | \n", + "https://news.google.com/rss/articles/CBMiOWh0d... | \n", + "Union shuts down strikes as DP World threatens... | \n", + "The Maritime Union of Australia (MUA) has curt... | \n", + "
5709 | \n", + "5780 | \n", + "Workers with Ertsoverslagbedrijf Europoort C.V... | \n", + "On November 17, Dutch media sources reported t... | \n", + "Minor | \n", + "Port Strike | \n", + "Netherlands | \n", + "18/11/20 17:55 | \n", + "2020.0 | \n", + "51.96121 | \n", + "4.10550 | \n", + "False | \n", + "['rotterdam'] | \n", + "1.0 | \n", + "True | \n", + "11.0 | \n", + "47.0 | \n", + "Workers with Ertsoverslagbedrijf Europoort C.V... | \n", + "no url found | \n", + "no title found | \n", + "no content found | \n", + "
5710 rows × 20 columns
\n", + "\n", + " | id | \n", + "Headline | \n", + "Details | \n", + "Severity | \n", + "Category | \n", + "Region | \n", + "Datetime | \n", + "Year | \n", + "lat | \n", + "lon | \n", + "maritime_label | \n", + "found_ports | \n", + "contains_port_info | \n", + "if_labeled | \n", + "Month | \n", + "Week | \n", + "Headline_Details | \n", + "url | \n", + "title | \n", + "content | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "1 | \n", + "Grasberg Mine- Grasberg mine workers extend st... | \n", + "Media sources indicate that workers at the Gra... | \n", + "Moderate | \n", + "Mine Workers Strike | \n", + "Indonesia | \n", + "28/5/17 17:08 | \n", + "2017.0 | \n", + "-4.05608 | \n", + "137.11302 | \n", + "False | \n", + "['freeport'] | \n", + "1.0 | \n", + "False | \n", + "5.0 | \n", + "21.0 | \n", + "Grasberg Mine- Grasberg mine workers extend st... | \n", + "https://news.google.com/rss/articles/CBMiZ2h0d... | \n", + "Freeport Indonesia mine workers extend strike ... | \n", + "Trucks are seen on a road in the Grasberg copp... | \n", + "
2 | \n", + "3 | \n", + "Shanghai port congestion impacts terminals in ... | \n", + "The persisting port congestion at Shanghai’s Y... | \n", + "Minor | \n", + "Port Congestion | \n", + "China | \n", + "27/4/17 9:16 | \n", + "2017.0 | \n", + "29.52000 | \n", + "121.33190 | \n", + "True | \n", + "['ningbo', 'qingdao', 'shanghai'] | \n", + "1.0 | \n", + "False | \n", + "4.0 | \n", + "17.0 | \n", + "Shanghai port congestion impacts terminals in ... | \n", + "https://news.google.com/rss/articles/CBMiVWh0d... | \n", + "Typhoon Muifa to shut China ports for second t... | \n", + "By Sam Whelan 13/09/2022\\n\\nAnother typhoon ha... | \n", + "
4 | \n", + "5 | \n", + "UPDATE - Indonesia: Police confirm two explosi... | \n", + "According to local police in Jakarta, two expl... | \n", + "Extreme | \n", + "Bombing, Police Operations | \n", + "Indonesia | \n", + "24/5/17 16:20 | \n", + "2017.0 | \n", + "NaN | \n", + "NaN | \n", + "True | \n", + "['jakarta'] | \n", + "1.0 | \n", + "True | \n", + "5.0 | \n", + "21.0 | \n", + "UPDATE - Indonesia: Police confirm two explosi... | \n", + "https://news.google.com/rss/articles/CBMiZWh0d... | \n", + "Jakarta Police Receive 2 More Reports on Coldp... | \n", + "TEMPO.CO, Jakarta - South Jakarta Metro Police... | \n", + "
5 | \n", + "6 | \n", + "UPDATE - Indonesia: Severe winds damage infras... | \n", + "Severe winds have downed billboards and trees ... | \n", + "Moderate | \n", + "Roadway Closure / Disruption, Flooding, Severe... | \n", + "Indonesia | \n", + "19/4/17 9:10 | \n", + "2017.0 | \n", + "-6.91264 | \n", + "107.65700 | \n", + "False | \n", + "['jakarta'] | \n", + "1.0 | \n", + "True | \n", + "4.0 | \n", + "16.0 | \n", + "UPDATE - Indonesia: Severe winds damage infras... | \n", + "https://news.google.com/rss/articles/CBMiSWh0d... | \n", + "Indonesia hit by some of strongest winds recorded | \n", + "A man stands near damaged houses following a t... | \n", + "
13 | \n", + "14 | \n", + "2 miles E of Chesterfield - A tornado has touc... | \n", + "Government sources are reporting a tornado has... | \n", + "Minor | \n", + "Tornado | \n", + "United States | \n", + "17/9/18 19:55 | \n", + "2018.0 | \n", + "37.51000 | \n", + "-77.61000 | \n", + "False | \n", + "['virginia'] | \n", + "1.0 | \n", + "True | \n", + "9.0 | \n", + "38.0 | \n", + "2 miles E of Chesterfield - A tornado has touc... | \n", + "https://news.google.com/rss/articles/CBMigAFod... | \n", + "UPDATE: Number of homes without power down to ... | \n", + "More than 90,000 homes and businesses across t... | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
5699 | \n", + "5770 | \n", + "Wildfire started by lightning strike near Cart... | \n", + "Local media sources are reporting a wildfire o... | \n", + "Minor | \n", + "Wildfire | \n", + "Spain | \n", + "27/11/20 11:19 | \n", + "2020.0 | \n", + "37.59520 | \n", + "-1.10078 | \n", + "False | \n", + "['cartagena'] | \n", + "1.0 | \n", + "False | \n", + "11.0 | \n", + "48.0 | \n", + "Wildfire started by lightning strike near Cart... | \n", + "https://news.google.com/rss/articles/CBMibmh0d... | \n", + "Neanderthals Were As Smart As Modern Humans an... | \n", + "Representational image (gorodenkoff, Getty Ima... | \n", + "
5700 | \n", + "5771 | \n", + "Wind Event to Impact Southern CA This Week; Da... | \n", + "Meteorologists are predicting a new round of S... | \n", + "Moderate | \n", + "Weather Advisory, Hazmat Response, Network Dis... | \n", + "United States | \n", + "1/12/20 0:08 | \n", + "2020.0 | \n", + "NaN | \n", + "NaN | \n", + "False | \n", + "['los angeles'] | \n", + "1.0 | \n", + "True | \n", + "1.0 | \n", + "2.0 | \n", + "Wind Event to Impact Southern CA This Week; Da... | \n", + "https://news.google.com/rss/articles/CBMihgFod... | \n", + "Hurricane Hilary live updates: Newsom declares... | \n", + "Staff reports\\n\\nPalm Springs Desert Sun\\n\\nLo... | \n", + "
5701 | \n", + "5772 | \n", + "Wind stoppages expected for the Port of Durban... | \n", + "Shipping sources indicate on November 11 that ... | \n", + "Minor | \n", + "Port Disruption | \n", + "South Africa | \n", + "11/11/20 16:03 | \n", + "2020.0 | \n", + "NaN | \n", + "NaN | \n", + "False | \n", + "['durban'] | \n", + "1.0 | \n", + "True | \n", + "11.0 | \n", + "46.0 | \n", + "Wind stoppages expected for the Port of Durban... | \n", + "https://news.google.com/rss/articles/CBMiigFod... | \n", + "Severe thunderstorms expected in most parts of... | \n", + "DURBAN - Severe thunderstorms are expected in ... | \n", + "
5702 | \n", + "5773 | \n", + "Wind warnings issue for Melbourne coast for Oc... | \n", + "Media sources reported that wind warnings for ... | \n", + "Moderate | \n", + "Weather Advisory | \n", + "Australia | \n", + "15/10/20 6:19 | \n", + "2020.0 | \n", + "-37.84796 | \n", + "144.94660 | \n", + "False | \n", + "['melbourne'] | \n", + "1.0 | \n", + "False | \n", + "10.0 | \n", + "42.0 | \n", + "Wind warnings issue for Melbourne coast for Oc... | \n", + "https://news.google.com/rss/articles/CBMieGh0d... | \n", + "NRL Draw 2023: All the scores, results and sch... | \n", + "The 2023 NRL season has finally come to an end... | \n", + "
5705 | \n", + "5776 | \n", + "Winter storm may bring inclement weather condi... | \n", + "Intelligence received by Everstream Analytics ... | \n", + "Moderate | \n", + "Ice Storm | \n", + "United States | \n", + "16/12/20 2:47 | \n", + "2020.0 | \n", + "38.90072 | \n", + "-77.05440 | \n", + "False | \n", + "['new york'] | \n", + "1.0 | \n", + "False | \n", + "12.0 | \n", + "51.0 | \n", + "Winter storm may bring inclement weather condi... | \n", + "https://news.google.com/rss/articles/CBMia2h0d... | \n", + "Big storm to dump heavy rain and snow along Ea... | \n", + "A sprawling winter storm known as a nor’easter... | \n", + "
2786 rows × 20 columns
\n", + "