{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/wangkaiyuan/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n", " warnings.warn(\n" ] } ], "source": [ "import pandas as pd\n", "import requests\n", "from bs4 import BeautifulSoup" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
url
0https://www.foxnews.com/lifestyle/jack-carrs-e...
1https://www.foxnews.com/entertainment/bruce-wi...
2https://www.foxnews.com/politics/blinken-meets...
3https://www.foxnews.com/entertainment/emily-bl...
4https://www.foxnews.com/media/the-view-co-host...
\n", "
" ], "text/plain": [ " url\n", "0 https://www.foxnews.com/lifestyle/jack-carrs-e...\n", "1 https://www.foxnews.com/entertainment/bruce-wi...\n", "2 https://www.foxnews.com/politics/blinken-meets...\n", "3 https://www.foxnews.com/entertainment/emily-bl...\n", "4 https://www.foxnews.com/media/the-view-co-host..." ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load csv file and process the data\n", "urls_df = pd.read_csv('url_only_data.csv')\n", "urls_df.head()\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# define the function to fetch the title of the news article\n", "def fetch_title(url):\n", " try:\n", " response = requests.get(url)\n", " if response.status_code != 200:\n", " return f\"Error: {response.status_code}\"\n", " soup = BeautifulSoup(response.text, \"html.parser\")\n", " # Try to find the headline based on a common class used on Fox News pages\n", " title = soup.find(\"h1\", class_=\"headline speakable\")\n", " return title.text.strip() if title else \"Title not found\"\n", " except Exception as e:\n", " return f\"Error: {e}\"\n", "\n", "def fetch_title_altered(url):\n", " try:\n", " response = requests.get(url)\n", " if response.status_code != 200:\n", " return f\"Error: {response.status_code}\"\n", " soup = BeautifulSoup(response.text, \"html.parser\")\n", " # Try to find the headline based on a common class used on Fox News pages\n", " title = soup.find(\"h1\")\n", " return title.text.strip() if title else \"Title not found\"\n", " except Exception as e:\n", " return f\"Error: {e}\"" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# remove the '.print' from the urls\n", "urls_df['url'] = urls_df['url'].str.replace('.print', '', regex=False)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[5], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# fetch the title of the news article\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m urls_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtitle\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43murls_df\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43murl\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfetch_title\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/pandas/core/series.py:4917\u001b[0m, in \u001b[0;36mSeries.apply\u001b[0;34m(self, func, convert_dtype, args, by_row, **kwargs)\u001b[0m\n\u001b[1;32m 4789\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mapply\u001b[39m(\n\u001b[1;32m 4790\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 4791\u001b[0m func: AggFuncType,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 4796\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 4797\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m Series:\n\u001b[1;32m 4798\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 4799\u001b[0m \u001b[38;5;124;03m Invoke function on values of Series.\u001b[39;00m\n\u001b[1;32m 4800\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 4915\u001b[0m \u001b[38;5;124;03m dtype: float64\u001b[39;00m\n\u001b[1;32m 4916\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 4917\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSeriesApply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 4918\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4919\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4920\u001b[0m \u001b[43m \u001b[49m\u001b[43mconvert_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4921\u001b[0m \u001b[43m \u001b[49m\u001b[43mby_row\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mby_row\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4922\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4923\u001b[0m \u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4924\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/pandas/core/apply.py:1427\u001b[0m, in \u001b[0;36mSeriesApply.apply\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1424\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_compat()\n\u001b[1;32m 1426\u001b[0m \u001b[38;5;66;03m# self.func is Callable\u001b[39;00m\n\u001b[0;32m-> 1427\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/pandas/core/apply.py:1507\u001b[0m, in \u001b[0;36mSeriesApply.apply_standard\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1501\u001b[0m \u001b[38;5;66;03m# row-wise access\u001b[39;00m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# apply doesn't have a `na_action` keyword and for backward compat reasons\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m \u001b[38;5;66;03m# we need to give `na_action=\"ignore\"` for categorical data.\u001b[39;00m\n\u001b[1;32m 1504\u001b[0m \u001b[38;5;66;03m# TODO: remove the `na_action=\"ignore\"` when that default has been changed in\u001b[39;00m\n\u001b[1;32m 1505\u001b[0m \u001b[38;5;66;03m# Categorical (GH51645).\u001b[39;00m\n\u001b[1;32m 1506\u001b[0m action \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj\u001b[38;5;241m.\u001b[39mdtype, CategoricalDtype) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1507\u001b[0m mapped \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_map_values\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1508\u001b[0m \u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcurried\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\n\u001b[1;32m 1509\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1511\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(mapped) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(mapped[\u001b[38;5;241m0\u001b[39m], ABCSeries):\n\u001b[1;32m 1512\u001b[0m \u001b[38;5;66;03m# GH#43986 Need to do list(mapped) in order to get treated as nested\u001b[39;00m\n\u001b[1;32m 1513\u001b[0m \u001b[38;5;66;03m# See also GH#25959 regarding EA support\u001b[39;00m\n\u001b[1;32m 1514\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\u001b[38;5;241m.\u001b[39m_constructor_expanddim(\u001b[38;5;28mlist\u001b[39m(mapped), index\u001b[38;5;241m=\u001b[39mobj\u001b[38;5;241m.\u001b[39mindex)\n", "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/pandas/core/base.py:921\u001b[0m, in \u001b[0;36mIndexOpsMixin._map_values\u001b[0;34m(self, mapper, na_action, convert)\u001b[0m\n\u001b[1;32m 918\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(arr, ExtensionArray):\n\u001b[1;32m 919\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mmap(mapper, na_action\u001b[38;5;241m=\u001b[39mna_action)\n\u001b[0;32m--> 921\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43malgorithms\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mna_action\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/pandas/core/algorithms.py:1743\u001b[0m, in \u001b[0;36mmap_array\u001b[0;34m(arr, mapper, na_action, convert)\u001b[0m\n\u001b[1;32m 1741\u001b[0m values \u001b[38;5;241m=\u001b[39m arr\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mobject\u001b[39m, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m na_action \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1743\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_infer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mmap_infer_mask(\n\u001b[1;32m 1746\u001b[0m values, mapper, mask\u001b[38;5;241m=\u001b[39misna(values)\u001b[38;5;241m.\u001b[39mview(np\u001b[38;5;241m.\u001b[39muint8), convert\u001b[38;5;241m=\u001b[39mconvert\n\u001b[1;32m 1747\u001b[0m )\n", "File \u001b[0;32mlib.pyx:2972\u001b[0m, in \u001b[0;36mpandas._libs.lib.map_infer\u001b[0;34m()\u001b[0m\n", "Cell \u001b[0;32mIn[3], line 7\u001b[0m, in \u001b[0;36mfetch_title\u001b[0;34m(url)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 7\u001b[0m soup \u001b[38;5;241m=\u001b[39m \u001b[43mBeautifulSoup\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhtml.parser\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# Try to find the headline based on a common class used on Fox News pages\u001b[39;00m\n\u001b[1;32m 9\u001b[0m title \u001b[38;5;241m=\u001b[39m soup\u001b[38;5;241m.\u001b[39mfind(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mh1\u001b[39m\u001b[38;5;124m\"\u001b[39m, class_\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mheadline speakable\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/bs4/__init__.py:335\u001b[0m, in \u001b[0;36mBeautifulSoup.__init__\u001b[0;34m(self, markup, features, builder, parse_only, from_encoding, exclude_encodings, element_classes, **kwargs)\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuilder\u001b[38;5;241m.\u001b[39minitialize_soup(\u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 334\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 335\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_feed\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 336\u001b[0m success \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 337\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n", "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/bs4/__init__.py:478\u001b[0m, in \u001b[0;36mBeautifulSoup._feed\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 475\u001b[0m \u001b[38;5;66;03m# Convert the document to Unicode.\u001b[39;00m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuilder\u001b[38;5;241m.\u001b[39mreset()\n\u001b[0;32m--> 478\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbuilder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeed\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmarkup\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 479\u001b[0m \u001b[38;5;66;03m# Close out any unfinished strings and close all the open tags.\u001b[39;00m\n\u001b[1;32m 480\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mendData()\n", "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/bs4/builder/_htmlparser.py:380\u001b[0m, in \u001b[0;36mHTMLParserTreeBuilder.feed\u001b[0;34m(self, markup)\u001b[0m\n\u001b[1;32m 378\u001b[0m parser\u001b[38;5;241m.\u001b[39msoup \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msoup\n\u001b[1;32m 379\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 380\u001b[0m \u001b[43mparser\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmarkup\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 381\u001b[0m parser\u001b[38;5;241m.\u001b[39mclose()\n\u001b[1;32m 382\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mAssertionError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 383\u001b[0m \u001b[38;5;66;03m# html.parser raises AssertionError in rare cases to\u001b[39;00m\n\u001b[1;32m 384\u001b[0m \u001b[38;5;66;03m# indicate a fatal problem with the markup, especially\u001b[39;00m\n\u001b[1;32m 385\u001b[0m \u001b[38;5;66;03m# when there's an error in the doctype declaration.\u001b[39;00m\n", "File \u001b[0;32m/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/html/parser.py:110\u001b[0m, in \u001b[0;36mHTMLParser.feed\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"Feed data to the parser.\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \n\u001b[1;32m 106\u001b[0m \u001b[38;5;124;03mCall this as often as you want, with as little or as much text\u001b[39;00m\n\u001b[1;32m 107\u001b[0m \u001b[38;5;124;03mas you want (may include '\\n').\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrawdata \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrawdata \u001b[38;5;241m+\u001b[39m data\n\u001b[0;32m--> 110\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgoahead\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/html/parser.py:172\u001b[0m, in \u001b[0;36mHTMLParser.goahead\u001b[0;34m(self, end)\u001b[0m\n\u001b[1;32m 170\u001b[0m k \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparse_starttag(i)\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m startswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m 172\u001b[0m k \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse_endtag\u001b[49m\u001b[43m(\u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m startswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m 392\u001b[0m match \u001b[38;5;241m=\u001b[39m \u001b[43mendtagfind\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmatch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrawdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# \u001b[39;00m\n\u001b[1;32m 393\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m match:\n\u001b[1;32m 394\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcdata_elem \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "# fetch the title of the news article\n", "urls_df['title'] = urls_df['url'].apply(fetch_title)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/y8/__mdhnk12l9d1zxvj_wms9h00000gn/T/ipykernel_38707/2702622145.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " not_found['title'] = not_found['url'].apply(fetch_title_altered)\n" ] } ], "source": [ "# fetch the title of the news article that was not found\n", "not_found = urls_df[urls_df['title'] == 'Title not found']\n", "not_found['title'] = not_found['url'].apply(fetch_title_altered)\n", "\n" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "urls_df.update(not_found)" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "# remove duplicates titles\n", "urls_df.drop_duplicates(subset='title', keep='first', inplace=True)" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "# convert title to string\n", "urls_df['title'] = urls_df['title'].astype(str)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# remove the \" \"\" \" from the titles\n", "urls_df['title'] = urls_df['title'].str.strip('\"')" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "# save the data to a new csv file\n", "urls_df.to_csv('fetched_headlines.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "# Split the data into training and testing sets\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import classification_report\n", "from sklearn.metrics import accuracy_score\n" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "# Convert the labels to binary values (0 for ’FoxNews’, 1 for ’NBC’)\n", "urls_df['label'] = urls_df['url'].apply(lambda x: 0 if 'foxnews.com' in x else 1 if 'nbcnews.com' in x else None)" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [], "source": [ "# split the data into training and testing sets\n", "X_train, X_test, y_train, y_test = train_test_split(urls_df['title'], urls_df['label'], test_size=0.2, random_state=42)\n" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "# Convert the text data to TF-IDF features\n", "vectorizer = TfidfVectorizer(stop_words='english', max_features=100)\n", "X_train_tfidf = vectorizer.fit_transform(X_train)\n", "X_test_tfidf = vectorizer.transform(X_test)" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "LogisticRegression()" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Train a Logistic Regression model\n", "model = LogisticRegression(max_iter=100)\n", "model.fit(X_train_tfidf, y_train)" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [], "source": [ "y_pred = model.predict(X_test_tfidf)" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.7084\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.72 0.80 0.76 427\n", " 1 0.70 0.59 0.64 331\n", "\n", " accuracy 0.71 758\n", " macro avg 0.71 0.70 0.70 758\n", "weighted avg 0.71 0.71 0.70 758\n", "\n" ] } ], "source": [ "# 7. Evaluate the model\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred)\n", ")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('fetched_headlines.csv')\n", "df.head" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urltitlelabeloutlet
0https://www.foxnews.com/lifestyle/jack-carrs-e...Jack Carr recalls Gen. Eisenhower's D-Day memo...0FoxNews
1https://www.foxnews.com/entertainment/bruce-wi...Bruce Willis, Demi Moore avoided doing one thi...0FoxNews
2https://www.foxnews.com/politics/blinken-meets...Blinken meets Qatar PM, says Israeli actions a...0FoxNews
3https://www.foxnews.com/entertainment/emily-bl...Emily Blunt says her ‘toes curl’ when people t...0FoxNews
4https://www.foxnews.com/media/the-view-co-host...'The View' co-host, CNN commentator Ana Navarr...0FoxNews
\n", "
" ], "text/plain": [ " url \\\n", "0 https://www.foxnews.com/lifestyle/jack-carrs-e... \n", "1 https://www.foxnews.com/entertainment/bruce-wi... \n", "2 https://www.foxnews.com/politics/blinken-meets... \n", "3 https://www.foxnews.com/entertainment/emily-bl... \n", "4 https://www.foxnews.com/media/the-view-co-host... \n", "\n", " title label outlet \n", "0 Jack Carr recalls Gen. Eisenhower's D-Day memo... 0 FoxNews \n", "1 Bruce Willis, Demi Moore avoided doing one thi... 0 FoxNews \n", "2 Blinken meets Qatar PM, says Israeli actions a... 0 FoxNews \n", "3 Emily Blunt says her ‘toes curl’ when people t... 0 FoxNews \n", "4 'The View' co-host, CNN commentator Ana Navarr... 0 FoxNews " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['outlet'] = df['url'].apply(lambda x: 'FoxNews' if 'foxnews.com' in x else 'NBC')\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleoutletlabel
0Jack Carr recalls Gen. Eisenhower's D-Day memo...FoxNews1
1Bruce Willis, Demi Moore avoided doing one thi...FoxNews1
2Blinken meets Qatar PM, says Israeli actions a...FoxNews1
3Emily Blunt says her ‘toes curl’ when people t...FoxNews1
4'The View' co-host, CNN commentator Ana Navarr...FoxNews1
\n", "
" ], "text/plain": [ " title outlet label\n", "0 Jack Carr recalls Gen. Eisenhower's D-Day memo... FoxNews 1\n", "1 Bruce Willis, Demi Moore avoided doing one thi... FoxNews 1\n", "2 Blinken meets Qatar PM, says Israeli actions a... FoxNews 1\n", "3 Emily Blunt says her ‘toes curl’ when people t... FoxNews 1\n", "4 'The View' co-host, CNN commentator Ana Navarr... FoxNews 1" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Swap label and outlet position and update label values\n", "df['label'] = df['outlet'].apply(lambda x: 1 if x == 'FoxNews' else 0)\n", "df = df[[ 'title', 'outlet', 'label']]\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "df.to_csv('train_data.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([], dtype=object)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['title'].apply(type).unique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 2 }