{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "workding dir: /Users/inflaton/code/engd/papers/maritime/global-incidents\n", "loading env vars from: /Users/inflaton/code/engd/papers/maritime/global-incidents/.env\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "%load_ext autoreload\n",
 "%autoreload 2\n",
 "\n",
 "import os\n",
 "import sys\n",
 "from pathlib import Path\n",
 "\n",
 "# Work from the parent of the directory the kernel started in. Resolve it only\n",
 "# on the first run: os.chdir() changes process-wide state, so re-running this\n",
 "# cell unguarded would climb one more directory level every time.\n",
 "if \"workding_dir\" not in globals():\n",
 "    workding_dir = str(Path.cwd().parent)\n",
 "    os.chdir(workding_dir)\n",
 "# Avoid stacking duplicate sys.path entries on re-runs\n",
 "if workding_dir not in sys.path:\n",
 "    sys.path.append(workding_dir)\n",
 "print(\"workding dir:\", workding_dir)\n",
 "\n",
 "from dotenv import find_dotenv, load_dotenv\n",
 "\n",
 "# Prefer a .env file; fall back to .env.example when none is found\n",
 "found_dotenv = find_dotenv(\".env\")\n",
 "\n",
 "if len(found_dotenv) == 0:\n",
 "    found_dotenv = find_dotenv(\".env.example\")\n",
 "print(f\"loading env vars from: {found_dotenv}\")\n",
 "load_dotenv(found_dotenv, override=True)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### This notebook demonstrates the methodology pipeline. Please use our newly designed database (new Excel file attached)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### When a user supplies a new news-article link, the following code extracts the headline, publication date and content." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Headline: Singapore Airlines stops using Iranian airspace as 'precautionary measure' amid Middle East tensions\n", "Publication Date: 2024-04-14 08:58:00\n", "Content: Advertisement Singapore Singapore Airlines stops using Iranian airspace as 'precautionary measure' amid Middle East tensions Singapore Airlines says it is closely monitoring the situation in the Middle East. (File photo: REUTERS/Edgar Su) New: You can now listen to articles. 
This audio is generated by an AI tool. 14 Apr 2024 08:58AM (Updated: 14 Apr 2024 07:26PM) Bookmark Bookmark Share WhatsApp Telegram Facebook Twitter Email LinkedIn SINGAPORE: Singapore Airlines (SIA) said on Sunday (Apr 14) \n" ] } ], "source": [
 "import requests\n",
 "from bs4 import BeautifulSoup\n",
 "from datetime import datetime\n",
 "\n",
 "\n",
 "def get_article_details(article_url, timeout=10):\n",
 "    \"\"\"Download a news article and extract its headline, publication date and text.\n",
 "\n",
 "    Args:\n",
 "        article_url: URL of the article page to fetch.\n",
 "        timeout: Seconds to wait for the server before giving up.\n",
 "\n",
 "    Returns:\n",
 "        Tuple ``(headline, publication_date, article_text)`` of strings.\n",
 "        ``publication_date`` is formatted as ``%Y-%m-%d %H:%M:%S``. Each element\n",
 "        is replaced by a \"No ... found\" placeholder when it cannot be extracted.\n",
 "\n",
 "    Raises:\n",
 "        requests.RequestException: On timeout, connection failure or an HTTP\n",
 "            error status.\n",
 "    \"\"\"\n",
 "    # A missing timeout lets requests block indefinitely, and without the status\n",
 "    # check an error page would be parsed and stored as if it were the article.\n",
 "    response = requests.get(article_url, timeout=timeout)\n",
 "    response.raise_for_status()\n",
 "    soup = BeautifulSoup(response.content, \"html.parser\")\n",
 "\n",
 "    # Extract headline\n",
 "    headline_tag = soup.find(\"h1\")\n",
 "    headline = (\n",
 "        headline_tag.get_text(strip=True) if headline_tag else \"No headline found\"\n",
 "    )\n",
 "\n",
 "    # Extract publication date; any failure leaves the placeholder in place\n",
 "    publication_date = \"No publication date found\"\n",
 "    date_container = soup.find(\"div\", class_=\"article-publish\")\n",
 "    if date_container:\n",
 "        date_text = date_container.get_text(strip=True)\n",
 "        # Keep only the text before \"(Updated:\", assumed to be the publication time\n",
 "        publication_date_text = date_text.split(\"(Updated:\")[0].strip()\n",
 "        try:\n",
 "            publication_date = datetime.strptime(\n",
 "                publication_date_text, \"%d %b %Y %I:%M%p\"\n",
 "            ).strftime(\"%Y-%m-%d %H:%M:%S\")\n",
 "        except ValueError:\n",
 "            pass  # text does not match the \"%d %b %Y %I:%M%p\" layout\n",
 "\n",
 "    # Extract main content, falling back to the whole page when there is no <article>\n",
 "    article_body = soup.find(\"article\")\n",
 "    if article_body is None:\n",
 "        article_body = soup\n",
 "    article_text = article_body.get_text(separator=\" \", strip=True)\n",
 "    # Apply the placeholder when the extracted text is empty; testing the tag\n",
 "    # itself would never trigger it because the fallback ``soup`` always exists.\n",
 "    if not article_text:\n",
 "        article_text = \"No article content found\"\n",
 "\n",
 "    return headline, publication_date, article_text\n",
 "\n",
 "\n",
 "# Example usage\n",
 "article_url = \"https://www.channelnewsasia.com/singapore/singapore-airlines-stops-using-iran-airspace-israel-hamas-war-middle-east-escalation-4264011\"\n",
 "headline, publication_date, article_content = 
get_article_details(article_url)\n",
 "print(\"Headline:\", headline)\n",
 "print(\"Publication Date:\", publication_date)\n",
 "print(\"Content:\", article_content[:500])  # Print the first 500 characters to check"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Summarize the article (please use your own OpenAI API key)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Summary: Singapore Airlines has stopped using Iranian airspace as a precautionary measure due to the escalating tensions in the Middle East. This move was mirrored by other airlines, including Lufthansa and Austrian Airlines, who also suspended flights to and from Tehran. The situation has also led to economic consequences, as flights\n" ] } ], "source": [
 "import openai\n",
 "\n",
 "\n",
 "def summarize_article(article_content, max_tokens=150):\n",
 "    \"\"\"Summarise an article in roughly 70 words with the OpenAI Completion API.\n",
 "\n",
 "    Args:\n",
 "        article_content: Plain text of the article.\n",
 "        max_tokens: Upper bound on the length of the generated summary. The\n",
 "            prompt asks for about 70 words; a limit of 60 tokens is too small\n",
 "            for that and cuts the summary off mid-sentence.\n",
 "\n",
 "    Returns:\n",
 "        The summary text, or an empty string when there is nothing to\n",
 "        summarise or the API call fails (the error is printed).\n",
 "    \"\"\"\n",
 "    # Nothing to summarise: skip the API call\n",
 "    if not article_content or not article_content.strip():\n",
 "        return \"\"\n",
 "\n",
 "    try:\n",
 "        # Generating the prompt for GPT-3\n",
 "        prompt_text = (\n",
 "            \"Summarize the following article in about 70 words, focusing on \"\n",
 "            \"what happened, where it happened, and the consequences (economic loss, environmental impact, etc.):\\n\\n\"\n",
 "            f\"{article_content}\"\n",
 "        )\n",
 "\n",
 "        # Call to OpenAI's Completion API\n",
 "        # NOTE(review): openai.Completion looks like the pre-1.0 SDK interface -\n",
 "        # confirm the pinned openai version before upgrading the package.\n",
 "        response = openai.Completion.create(\n",
 "            engine=\"gpt-3.5-turbo-instruct\",\n",
 "            prompt=prompt_text,\n",
 "            temperature=0.5,\n",
 "            max_tokens=max_tokens,\n",
 "            top_p=1,\n",
 "            frequency_penalty=0,\n",
 "            presence_penalty=0,\n",
 "        )\n",
 "\n",
 "        # Extracting the text from the response\n",
 "        summary = response.choices[0].text.strip()\n",
 "        return summary\n",
 "    except Exception as e:\n",
 "        # Best effort: report the failure and let the caller continue\n",
 "        print(f\"An error occurred: {e}\")\n",
 "        return \"\"\n",
 "\n",
 "\n",
 "summary = summarize_article(article_content)\n",
 "print(\"Summary:\", summary)"
 ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# unique_categories = 
df['Category'].unique()\n", "# print(unique_categories)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
 "import openai\n",
 "\n",
 "\n",
 "def classify_article(article_content):\n",
 "    \"\"\"Classify article text into one of the incident categories listed in the prompt.\n",
 "\n",
 "    Args:\n",
 "        article_content: Text (or summary) of the article to classify.\n",
 "\n",
 "    Returns:\n",
 "        The model's answer up to the first newline, stripped of whitespace.\n",
 "\n",
 "    NOTE: a later cell defines ``classify_article(url)``; once that cell has run\n",
 "    it replaces this text-based version under the same name.\n",
 "    \"\"\"\n",
 "    # Every category is a cleanly quoted label: a malformed entry (unterminated\n",
 "    # quote, stray character) ends up verbatim in the prompt and can be echoed back.\n",
 "    prompt = f\"\"\"Read the following article and classify its content into one of these categories: 'Aviation Advisory',\n",
 "'Bombing',\n",
 "'Cargo Disruption',\n",
 "'Warehouse Theft',\n",
 "'Chemical Spill',\n",
 "'Injury',\n",
 "'Earthquake',\n",
 "'Flooding',\n",
 "'Ground Transportation Advisory',\n",
 "'Hazmat Response',\n",
 "'Ice Storm',\n",
 "'Individuals in Focus',\n",
 "'Industrial Action',\n",
 "'Maritime Accident',\n",
 "'Maritime Accident.Ground Transportation Advisory',\n",
 "'Maritime Advisory',\n",
 "'Mine Workers Strike',\n",
 "'Miscellaneous Events',\n",
 "'Miscellaneous Strikes',\n",
 "'Network Disruption',\n",
 "'Non-industrial Fire',\n",
 "'Police Operations',\n",
 "'Port Closure',\n",
 "'Port Congestion',\n",
 "'Port Disruption',\n",
 "'Power Outage',\n",
 "'Production Halt',\n",
 "'Protest',\n",
 "'Public Safety',\n",
 "'Public Transportation Disruption',\n",
 "'Roadway Closure',\n",
 "'Severe Winds',\n",
 "'Storm',\n",
 "'Tornado',\n",
 "'Train Delays',\n",
 "'Travel Warning',\n",
 "'Tropical Cyclone Storm',\n",
 "'Typhoon',\n",
 "'Vehicle Accident',\n",
 "'Weather Advisory',\n",
 "'Workplace Accident'\n",
 "Summary: {article_content}\n",
 "Category:\"\"\"\n",
 "\n",
 "    response = openai.Completion.create(\n",
 "        engine=\"gpt-3.5-turbo-instruct\",  # Adjust according to the latest available and appropriate model\n",
 "        prompt=prompt,\n",
 "        temperature=0.7,\n",
 "        max_tokens=60,  # Adjust based on your needs\n",
 "        top_p=1.0,\n",
 "        frequency_penalty=0,\n",
 "        presence_penalty=0,\n",
 "        stop=[\"\\n\"],  # Stop generating further when a newline character is encountered\n",
 "    )\n",
 "    category = response.choices[0].text.strip()\n",
 "    return category"
 ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 
import BeautifulSoup\n",
 "\n",
 "\n",
 "def fetch_article_content(url, timeout=10):\n",
 "    \"\"\"Return the text of all ``<p>`` elements of the page at ``url``, joined by spaces.\n",
 "\n",
 "    Args:\n",
 "        url: Address of the article page.\n",
 "        timeout: Seconds to wait for the server before giving up.\n",
 "\n",
 "    Raises:\n",
 "        requests.RequestException: On timeout, connection failure or an HTTP\n",
 "            error status.\n",
 "    \"\"\"\n",
 "    response = requests.get(url, timeout=timeout)\n",
 "    response.raise_for_status()  # do not classify the body of an error page\n",
 "    soup = BeautifulSoup(response.content, \"html.parser\")\n",
 "    article_text = \" \".join(p.text for p in soup.find_all(\"p\"))\n",
 "    return article_text"
 ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Category: Aviation Advisory\n" ] } ], "source": [
 "import openai\n",
 "\n",
 "\n",
 "def classify_article(url):\n",
 "    \"\"\"Fetch the article at ``url`` and classify it into one incident category.\n",
 "\n",
 "    Args:\n",
 "        url: Address of the article to classify.\n",
 "\n",
 "    Returns:\n",
 "        The model's answer up to the first newline, stripped of whitespace.\n",
 "    \"\"\"\n",
 "    # Fetch article content\n",
 "    article_content = fetch_article_content(url)\n",
 "\n",
 "    # Construct the classification prompt. The list is closed (no trailing \"...\")\n",
 "    # so the model is not invited to answer with a label outside of it.\n",
 "    prompt = f\"\"\"Read the following article and classify its content into one of these categories:\n",
 "    'Aviation Advisory', 'Bombing',\n",
 "'Cargo Disruption',\n",
 "'Chemical Spill',\n",
 "'Death',\n",
 "'Earthquake',\n",
 "'Flooding',\n",
 "'Ground Transportation Advisory',\n",
 "'Hazmat Response',\n",
 "'Ice Storm',\n",
 "'Individuals in Focus',\n",
 "'Industrial Action',\n",
 "'Maritime Accident',\n",
 "'Maritime Accident.Ground Transportation Advisory',\n",
 "'Maritime Advisory',\n",
 "'Mine Workers Strike',\n",
 "'Miscellaneous Events',\n",
 "'Miscellaneous Strikes',\n",
 "'Network Disruption',\n",
 "'Non-industrial Fire',\n",
 "'Police Operations',\n",
 "'Port Closure',\n",
 "'Port Congestion',\n",
 "'Port Disruption',\n",
 "'Power Outage',\n",
 "'Production Halt',\n",
 "'Protest / Riot',\n",
 "'Public Safety / Security',\n",
 "'Public Transportation Disruption',\n",
 "'Roadway Closure / Disruption',\n",
 "'Severe Winds',\n",
 "'Storm',\n",
 "'Tornado',\n",
 "'Train Delays / Disruption',\n",
 "'Travel Warning',\n",
 "'Tropical Cyclone / Storm',\n",
 "'Typhoon',\n",
 "'Vehicle Accident',\n",
 "'Weather Advisory',\n",
 "'Workplace Accident'\n",
 "    \n",
 "    Article:\n",
 "    {article_content}\n",
 "    \n",
 "    Category:\"\"\"\n",
 "\n",
 "    # Classify using OpenAI GPT-3\n",
 "    response = openai.Completion.create(\n",
 "        engine=\"gpt-3.5-turbo-instruct\",  # Ensure using a correct and non-deprecated model\n",
 "        prompt=prompt,\n",
 "        # Deterministic labels: the category is stored in the CSV and later used\n",
 "        # for exact-match filtering, so it should not vary between runs.\n",
 "        temperature=0,\n",
 "        max_tokens=60,\n",
 "        top_p=1.0,\n",
 "        frequency_penalty=0,\n",
 "        presence_penalty=0,\n",
 "        stop=[\"\\n\"],  # Stop generating further when a newline character is encountered\n",
 "    )\n",
 "    category = response.choices[0].text.strip()\n",
 "    return category\n",
 "\n",
 "\n",
 "# Example usage\n",
 "url = \"https://www.channelnewsasia.com/singapore/singapore-airlines-stops-using-iran-airspace-israel-hamas-war-middle-east-escalation-4264011\"\n",
 "category = classify_article(url)\n",
 "print(\"Category:\", category)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Now you have the classification result, which is quite accurate :)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Singapore Airlines stops using Iranian airspace as 'precautionary measure' amid Middle East tensions\n" ] } ], "source": [ "print(headline)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Update our database!"
] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Database updated successfully with ID 1.\n" ] } ], "source": [ "import pandas as pd\n", "from datetime import datetime\n", "\n", "\n", "def update_database(file_path, url):\n", " # Fetch details from the article\n", " headline, publication_date, article_content = get_article_details(article_url)\n", " summary = summarize_article(article_content)\n", " category = classify_article(url)\n", "\n", " new_data = {\n", " \"Headline\": headline,\n", " \"Summary\": summary,\n", " \"Category\": category,\n", " \"Datetime\": publication_date,\n", " \"URL\": article_url,\n", " }\n", "\n", " # Load the existing data from the CSV file\n", " try:\n", " df = pd.read_csv(file_path)\n", " except FileNotFoundError:\n", " # If the file does not exist, create a new DataFrame\n", " df = pd.DataFrame(\n", " columns=[\"id\", \"Headline\", \"Summary\", \"Category\", \"Datetime\", \"URL\"]\n", " )\n", " new_id = 1 # Start with ID 1 if no file exists\n", " else:\n", " # If IDs exist, increment from the last used ID\n", " new_id = df[\"id\"].max() + 1 if not df.empty else 1\n", "\n", " # Prepare the new data entry\n", " new_entry = pd.DataFrame(\n", " {\n", " \"id\": [new_id],\n", " \"Headline\": [new_data[\"Headline\"]],\n", " \"Summary\": [new_data[\"Summary\"]],\n", " \"Category\": [new_data[\"Category\"]],\n", " \"Datetime\": [new_data[\"Datetime\"]],\n", " \"URL\": [new_data[\"URL\"]],\n", " }\n", " )\n", "\n", " # Append the new data entry to the DataFrame using concat\n", " df = pd.concat([df, new_entry], ignore_index=True)\n", "\n", " # Save the updated DataFrame back to CSV\n", " df.to_csv(file_path, index=False)\n", " print(f\"Database updated successfully with ID {new_id}.\")\n", "\n", "\n", "# Example usage\n", "url = 
\"https://www.channelnewsasia.com/singapore/singapore-airlines-stops-using-iran-airspace-israel-hamas-war-middle-east-escalation-4264011\"\n",
 "file_path = \"cleaned_data1.csv\"\n",
 "update_database(file_path, url)"
 ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " id Headline \\\n", "0 1 Singapore Airlines stops using Iranian airspac... \n", "\n", " Summary Category \\\n", "0 Singapore Airlines has stopped using Iranian a... Aviation Advisory \n", "\n", " Datetime URL \n", "0 2024-04-14 08:58:00 https://www.channelnewsasia.com/singapore/sing... \n" ] } ], "source": [
 "import pandas as pd\n",
 "\n",
 "\n",
 "def rank_related_articles(file_path, category):\n",
 "    \"\"\"Return the articles of one category, most recent first.\n",
 "\n",
 "    Args:\n",
 "        file_path: CSV database to read.\n",
 "        category: Category label to filter on (exact string match).\n",
 "\n",
 "    Returns:\n",
 "        DataFrame of the matching rows sorted by ``Datetime`` descending (also\n",
 "        printed), or ``None`` when the database file does not exist.\n",
 "    \"\"\"\n",
 "    # Load the existing data from the CSV file\n",
 "    try:\n",
 "        df = pd.read_csv(file_path)\n",
 "    except FileNotFoundError:\n",
 "        print(\"Database file not found.\")\n",
 "        return None\n",
 "\n",
 "    # .assign() returns a new frame, so nothing is written into a slice of ``df``\n",
 "    # (which triggers SettingWithCopyWarning). errors=\"coerce\" turns unparseable\n",
 "    # values such as the \"No publication date found\" placeholder into NaT instead\n",
 "    # of raising; sort_values keeps NaT rows at the end.\n",
 "    sorted_df = (\n",
 "        df[df[\"Category\"] == category]\n",
 "        .assign(\n",
 "            Datetime=lambda frame: pd.to_datetime(frame[\"Datetime\"], errors=\"coerce\")\n",
 "        )\n",
 "        .sort_values(by=\"Datetime\", ascending=False)\n",
 "    )\n",
 "\n",
 "    # Display the sorted DataFrame\n",
 "    print(sorted_df[[\"id\", \"Headline\", \"Summary\", \"Category\", \"Datetime\", \"URL\"]])\n",
 "    return sorted_df\n",
 "\n",
 "\n",
 "# Example usage\n",
 "file_path = \"cleaned_data1.csv\"\n",
 "category = \"Aviation Advisory\"\n",
 "ranked_articles = rank_related_articles(file_path, category)"
 ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
"+----+------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------+\n", "| id | Headline | Summary | Category | Datetime | URL |\n", "+----+------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------+\n", "| 1 | Singapore Airlines stops using Iranian airspace as 'precautionary measure' amid Middle East tensions | Singapore Airlines has stopped using Iranian airspace as a precautionary measure amid the escalating tensions in the Middle East. This decision was made after Iran launched over 200 drones and missiles at Israel, following an Israeli strike on an Iranian building in Syria. 
Other airlines, such as Lufthansa and Austrian | Aviation Advisory | 2024-04-14 08:58:00 | https://www.channelnewsasia.com/singapore/singapore-airlines-stops-using-iran-airspace-israel-hamas-war-middle-east-escalation-4264011 |\n", "+----+------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------+\n" ] } ], "source": [ "import pandas as pd\n", "from tabulate import tabulate\n", "\n", "\n", "def print_ranked_articles_tabulate(file_path, category):\n", " try:\n", " df = pd.read_csv(file_path)\n", " df[\"Datetime\"] = pd.to_datetime(df[\"Datetime\"])\n", " filtered_df = df[df[\"Category\"] == category]\n", " sorted_df = filtered_df.sort_values(by=\"Datetime\", ascending=False)\n", "\n", " # Print DataFrame using tabulate\n", " print(tabulate(sorted_df, headers=\"keys\", tablefmt=\"pretty\", showindex=False))\n", " except FileNotFoundError:\n", " print(\"Database file not found.\")\n", "\n", "\n", "# Example usage\n", "file_path = \"cleaned_data1.csv\"\n", "category = \"Aviation Advisory\"\n", "print_ranked_articles_tabulate(file_path, category)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "!rm cleaned_data1.csv" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": 
"python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }