KaiyuanW committed on
Commit
aee006d
·
verified ·
1 Parent(s): a288599

Upload 2 files

Browse files
Files changed (2) hide show
  1. datacleaning.ipynb +1066 -0
  2. train_data.csv +0 -0
datacleaning.ipynb ADDED
@@ -0,0 +1,1066 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/Users/wangkaiyuan/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n",
13
+ " warnings.warn(\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "import pandas as pd\n",
19
+ "import requests\n",
20
+ "from bs4 import BeautifulSoup"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 2,
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "data": {
30
+ "text/html": [
31
+ "<div>\n",
32
+ "<style scoped>\n",
33
+ " .dataframe tbody tr th:only-of-type {\n",
34
+ " vertical-align: middle;\n",
35
+ " }\n",
36
+ "\n",
37
+ " .dataframe tbody tr th {\n",
38
+ " vertical-align: top;\n",
39
+ " }\n",
40
+ "\n",
41
+ " .dataframe thead th {\n",
42
+ " text-align: right;\n",
43
+ " }\n",
44
+ "</style>\n",
45
+ "<table border=\"1\" class=\"dataframe\">\n",
46
+ " <thead>\n",
47
+ " <tr style=\"text-align: right;\">\n",
48
+ " <th></th>\n",
49
+ " <th>url</th>\n",
50
+ " </tr>\n",
51
+ " </thead>\n",
52
+ " <tbody>\n",
53
+ " <tr>\n",
54
+ " <th>0</th>\n",
55
+ " <td>https://www.foxnews.com/lifestyle/jack-carrs-e...</td>\n",
56
+ " </tr>\n",
57
+ " <tr>\n",
58
+ " <th>1</th>\n",
59
+ " <td>https://www.foxnews.com/entertainment/bruce-wi...</td>\n",
60
+ " </tr>\n",
61
+ " <tr>\n",
62
+ " <th>2</th>\n",
63
+ " <td>https://www.foxnews.com/politics/blinken-meets...</td>\n",
64
+ " </tr>\n",
65
+ " <tr>\n",
66
+ " <th>3</th>\n",
67
+ " <td>https://www.foxnews.com/entertainment/emily-bl...</td>\n",
68
+ " </tr>\n",
69
+ " <tr>\n",
70
+ " <th>4</th>\n",
71
+ " <td>https://www.foxnews.com/media/the-view-co-host...</td>\n",
72
+ " </tr>\n",
73
+ " </tbody>\n",
74
+ "</table>\n",
75
+ "</div>"
76
+ ],
77
+ "text/plain": [
78
+ " url\n",
79
+ "0 https://www.foxnews.com/lifestyle/jack-carrs-e...\n",
80
+ "1 https://www.foxnews.com/entertainment/bruce-wi...\n",
81
+ "2 https://www.foxnews.com/politics/blinken-meets...\n",
82
+ "3 https://www.foxnews.com/entertainment/emily-bl...\n",
83
+ "4 https://www.foxnews.com/media/the-view-co-host..."
84
+ ]
85
+ },
86
+ "execution_count": 2,
87
+ "metadata": {},
88
+ "output_type": "execute_result"
89
+ }
90
+ ],
91
+ "source": [
92
+ "# load the csv file of urls and preview the first rows\n",
93
+ "urls_df = pd.read_csv('url_only_data.csv')\n",
94
+ "urls_df.head()\n"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 3,
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "# define the function to fetch the title of the news article\n",
104
+ "def fetch_title(url):\n",
105
+ " try:\n",
106
+ " response = requests.get(url)\n",
107
+ " if response.status_code != 200:\n",
108
+ " return f\"Error: {response.status_code}\"\n",
109
+ " soup = BeautifulSoup(response.text, \"html.parser\")\n",
110
+ " # Try to find the headline based on a common class used on Fox News pages\n",
111
+ " title = soup.find(\"h1\", class_=\"headline speakable\")\n",
112
+ " return title.text.strip() if title else \"Title not found\"\n",
113
+ " except Exception as e:\n",
114
+ " return f\"Error: {e}\"\n",
115
+ "\n",
116
+ "def fetch_title_altered(url):\n",
117
+ " try:\n",
118
+ " response = requests.get(url)\n",
119
+ " if response.status_code != 200:\n",
120
+ " return f\"Error: {response.status_code}\"\n",
121
+ " soup = BeautifulSoup(response.text, \"html.parser\")\n",
122
+ " # Fall back to the first <h1> on the page, regardless of its class\n",
123
+ " title = soup.find(\"h1\")\n",
124
+ " return title.text.strip() if title else \"Title not found\"\n",
125
+ " except Exception as e:\n",
126
+ " return f\"Error: {e}\""
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 4,
132
+ "metadata": {},
133
+ "outputs": [],
134
+ "source": [
135
+ "# remove the '.print' from the urls\n",
136
+ "urls_df['url'] = urls_df['url'].str.replace('.print', '', regex=False)"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 5,
142
+ "metadata": {},
143
+ "outputs": [
144
+ {
145
+ "ename": "KeyboardInterrupt",
146
+ "evalue": "",
147
+ "output_type": "error",
148
+ "traceback": [
149
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
150
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
151
+ "Cell \u001b[0;32mIn[5], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# fetch the title of the news article\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m urls_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtitle\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43murls_df\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43murl\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfetch_title\u001b[49m\u001b[43m)\u001b[49m\n",
152
+ "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/pandas/core/series.py:4917\u001b[0m, in \u001b[0;36mSeries.apply\u001b[0;34m(self, func, convert_dtype, args, by_row, **kwargs)\u001b[0m\n\u001b[1;32m 4789\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mapply\u001b[39m(\n\u001b[1;32m 4790\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 4791\u001b[0m func: AggFuncType,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 4796\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 4797\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m Series:\n\u001b[1;32m 4798\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 4799\u001b[0m \u001b[38;5;124;03m Invoke function on values of Series.\u001b[39;00m\n\u001b[1;32m 4800\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 4915\u001b[0m \u001b[38;5;124;03m dtype: float64\u001b[39;00m\n\u001b[1;32m 4916\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 4917\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSeriesApply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 4918\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4919\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4920\u001b[0m \u001b[43m \u001b[49m\u001b[43mconvert_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4921\u001b[0m \u001b[43m \u001b[49m\u001b[43mby_row\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mby_row\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4922\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4923\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4924\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
153
+ "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/pandas/core/apply.py:1427\u001b[0m, in \u001b[0;36mSeriesApply.apply\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1424\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_compat()\n\u001b[1;32m 1426\u001b[0m \u001b[38;5;66;03m# self.func is Callable\u001b[39;00m\n\u001b[0;32m-> 1427\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
154
+ "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/pandas/core/apply.py:1507\u001b[0m, in \u001b[0;36mSeriesApply.apply_standard\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1501\u001b[0m \u001b[38;5;66;03m# row-wise access\u001b[39;00m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# apply doesn't have a `na_action` keyword and for backward compat reasons\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m \u001b[38;5;66;03m# we need to give `na_action=\"ignore\"` for categorical data.\u001b[39;00m\n\u001b[1;32m 1504\u001b[0m \u001b[38;5;66;03m# TODO: remove the `na_action=\"ignore\"` when that default has been changed in\u001b[39;00m\n\u001b[1;32m 1505\u001b[0m \u001b[38;5;66;03m# Categorical (GH51645).\u001b[39;00m\n\u001b[1;32m 1506\u001b[0m action \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj\u001b[38;5;241m.\u001b[39mdtype, CategoricalDtype) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1507\u001b[0m mapped \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_map_values\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1508\u001b[0m \u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcurried\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\n\u001b[1;32m 1509\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1511\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(mapped) \u001b[38;5;129;01mand\u001b[39;00m 
\u001b[38;5;28misinstance\u001b[39m(mapped[\u001b[38;5;241m0\u001b[39m], ABCSeries):\n\u001b[1;32m 1512\u001b[0m \u001b[38;5;66;03m# GH#43986 Need to do list(mapped) in order to get treated as nested\u001b[39;00m\n\u001b[1;32m 1513\u001b[0m \u001b[38;5;66;03m# See also GH#25959 regarding EA support\u001b[39;00m\n\u001b[1;32m 1514\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\u001b[38;5;241m.\u001b[39m_constructor_expanddim(\u001b[38;5;28mlist\u001b[39m(mapped), index\u001b[38;5;241m=\u001b[39mobj\u001b[38;5;241m.\u001b[39mindex)\n",
155
+ "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/pandas/core/base.py:921\u001b[0m, in \u001b[0;36mIndexOpsMixin._map_values\u001b[0;34m(self, mapper, na_action, convert)\u001b[0m\n\u001b[1;32m 918\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(arr, ExtensionArray):\n\u001b[1;32m 919\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mmap(mapper, na_action\u001b[38;5;241m=\u001b[39mna_action)\n\u001b[0;32m--> 921\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43malgorithms\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mna_action\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n",
156
+ "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/pandas/core/algorithms.py:1743\u001b[0m, in \u001b[0;36mmap_array\u001b[0;34m(arr, mapper, na_action, convert)\u001b[0m\n\u001b[1;32m 1741\u001b[0m values \u001b[38;5;241m=\u001b[39m arr\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mobject\u001b[39m, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m na_action \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1743\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_infer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mmap_infer_mask(\n\u001b[1;32m 1746\u001b[0m values, mapper, mask\u001b[38;5;241m=\u001b[39misna(values)\u001b[38;5;241m.\u001b[39mview(np\u001b[38;5;241m.\u001b[39muint8), convert\u001b[38;5;241m=\u001b[39mconvert\n\u001b[1;32m 1747\u001b[0m )\n",
157
+ "File \u001b[0;32mlib.pyx:2972\u001b[0m, in \u001b[0;36mpandas._libs.lib.map_infer\u001b[0;34m()\u001b[0m\n",
158
+ "Cell \u001b[0;32mIn[3], line 7\u001b[0m, in \u001b[0;36mfetch_title\u001b[0;34m(url)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 7\u001b[0m soup \u001b[38;5;241m=\u001b[39m \u001b[43mBeautifulSoup\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhtml.parser\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# Try to find the headline based on a common class used on Fox News pages\u001b[39;00m\n\u001b[1;32m 9\u001b[0m title \u001b[38;5;241m=\u001b[39m soup\u001b[38;5;241m.\u001b[39mfind(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mh1\u001b[39m\u001b[38;5;124m\"\u001b[39m, class_\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mheadline speakable\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
159
+ "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/bs4/__init__.py:335\u001b[0m, in \u001b[0;36mBeautifulSoup.__init__\u001b[0;34m(self, markup, features, builder, parse_only, from_encoding, exclude_encodings, element_classes, **kwargs)\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuilder\u001b[38;5;241m.\u001b[39minitialize_soup(\u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 334\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 335\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_feed\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 336\u001b[0m success \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 337\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n",
160
+ "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/bs4/__init__.py:478\u001b[0m, in \u001b[0;36mBeautifulSoup._feed\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 475\u001b[0m \u001b[38;5;66;03m# Convert the document to Unicode.\u001b[39;00m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuilder\u001b[38;5;241m.\u001b[39mreset()\n\u001b[0;32m--> 478\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbuilder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeed\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmarkup\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 479\u001b[0m \u001b[38;5;66;03m# Close out any unfinished strings and close all the open tags.\u001b[39;00m\n\u001b[1;32m 480\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mendData()\n",
161
+ "File \u001b[0;32m~/Desktop/UPenn/Fall 2024/CIS 5190/CIS5190finalproj/.venv/lib/python3.9/site-packages/bs4/builder/_htmlparser.py:380\u001b[0m, in \u001b[0;36mHTMLParserTreeBuilder.feed\u001b[0;34m(self, markup)\u001b[0m\n\u001b[1;32m 378\u001b[0m parser\u001b[38;5;241m.\u001b[39msoup \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msoup\n\u001b[1;32m 379\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 380\u001b[0m \u001b[43mparser\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmarkup\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 381\u001b[0m parser\u001b[38;5;241m.\u001b[39mclose()\n\u001b[1;32m 382\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mAssertionError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 383\u001b[0m \u001b[38;5;66;03m# html.parser raises AssertionError in rare cases to\u001b[39;00m\n\u001b[1;32m 384\u001b[0m \u001b[38;5;66;03m# indicate a fatal problem with the markup, especially\u001b[39;00m\n\u001b[1;32m 385\u001b[0m \u001b[38;5;66;03m# when there's an error in the doctype declaration.\u001b[39;00m\n",
162
+ "File \u001b[0;32m/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/html/parser.py:110\u001b[0m, in \u001b[0;36mHTMLParser.feed\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"Feed data to the parser.\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \n\u001b[1;32m 106\u001b[0m \u001b[38;5;124;03mCall this as often as you want, with as little or as much text\u001b[39;00m\n\u001b[1;32m 107\u001b[0m \u001b[38;5;124;03mas you want (may include '\\n').\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrawdata \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrawdata \u001b[38;5;241m+\u001b[39m data\n\u001b[0;32m--> 110\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgoahead\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n",
163
+ "File \u001b[0;32m/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/html/parser.py:172\u001b[0m, in \u001b[0;36mHTMLParser.goahead\u001b[0;34m(self, end)\u001b[0m\n\u001b[1;32m 170\u001b[0m k \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparse_starttag(i)\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m startswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m</\u001b[39m\u001b[38;5;124m\"\u001b[39m, i):\n\u001b[0;32m--> 172\u001b[0m k \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse_endtag\u001b[49m\u001b[43m(\u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m startswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m<!--\u001b[39m\u001b[38;5;124m\"\u001b[39m, i):\n\u001b[1;32m 174\u001b[0m k \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparse_comment(i)\n",
164
+ "File \u001b[0;32m/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/html/parser.py:392\u001b[0m, in \u001b[0;36mHTMLParser.parse_endtag\u001b[0;34m(self, i)\u001b[0m\n\u001b[1;32m 390\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 391\u001b[0m gtpos \u001b[38;5;241m=\u001b[39m match\u001b[38;5;241m.\u001b[39mend()\n\u001b[0;32m--> 392\u001b[0m match \u001b[38;5;241m=\u001b[39m \u001b[43mendtagfind\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmatch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrawdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# </ + tag + >\u001b[39;00m\n\u001b[1;32m 393\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m match:\n\u001b[1;32m 394\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcdata_elem \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
165
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
166
+ ]
167
+ }
168
+ ],
169
+ "source": [
170
+ "# fetch the title of the news article\n",
171
+ "urls_df['title'] = urls_df['url'].apply(fetch_title)"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": null,
177
+ "metadata": {},
178
+ "outputs": [
179
+ {
180
+ "name": "stderr",
181
+ "output_type": "stream",
182
+ "text": [
183
+ "/var/folders/y8/__mdhnk12l9d1zxvj_wms9h00000gn/T/ipykernel_38707/2702622145.py:3: SettingWithCopyWarning: \n",
184
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
185
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
186
+ "\n",
187
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
188
+ " not_found['title'] = not_found['url'].apply(fetch_title_altered)\n"
189
+ ]
190
+ }
191
+ ],
192
+ "source": [
193
+ "# retry the articles whose title was not found, using the less strict <h1> lookup\n",
194
+ "not_found = urls_df[urls_df['title'] == 'Title not found']\n",
195
+ "not_found['title'] = not_found['url'].apply(fetch_title_altered)\n",
196
+ "\n"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "execution_count": 72,
202
+ "metadata": {},
203
+ "outputs": [],
204
+ "source": [
205
+ "urls_df.update(not_found)"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": 75,
211
+ "metadata": {},
212
+ "outputs": [],
213
+ "source": [
214
+ "# remove duplicate titles\n",
215
+ "urls_df.drop_duplicates(subset='title', keep='first', inplace=True)"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": 84,
221
+ "metadata": {},
222
+ "outputs": [],
223
+ "source": [
224
+ "# convert title to string\n",
225
+ "urls_df['title'] = urls_df['title'].astype(str)"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "metadata": {},
232
+ "outputs": [],
233
+ "source": [
234
+ "# strip leading and trailing double quotes (\") from the titles\n",
235
+ "urls_df['title'] = urls_df['title'].str.strip('\"')"
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": 93,
241
+ "metadata": {},
242
+ "outputs": [],
243
+ "source": [
244
+ "# save the data to a new csv file\n",
245
+ "urls_df.to_csv('fetched_headlines.csv', index=False)"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 104,
251
+ "metadata": {},
252
+ "outputs": [],
253
+ "source": [
254
+ "# Import scikit-learn utilities for splitting, vectorizing, training, and evaluation\n",
255
+ "from sklearn.model_selection import train_test_split\n",
256
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
257
+ "from sklearn.linear_model import LogisticRegression\n",
258
+ "from sklearn.metrics import classification_report\n",
259
+ "from sklearn.metrics import accuracy_score\n"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 91,
265
+ "metadata": {},
266
+ "outputs": [],
267
+ "source": [
268
+ "# Derive binary labels from the url (0 for 'FoxNews', 1 for 'NBC', None otherwise)\n",
269
+ "urls_df['label'] = urls_df['url'].apply(lambda x: 0 if 'foxnews.com' in x else 1 if 'nbcnews.com' in x else None)"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": 97,
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": [
278
+ "# split the data into training and testing sets\n",
279
+ "X_train, X_test, y_train, y_test = train_test_split(urls_df['title'], urls_df['label'], test_size=0.2, random_state=42)\n"
280
+ ]
281
+ },
282
+ {
283
+ "cell_type": "code",
284
+ "execution_count": 98,
285
+ "metadata": {},
286
+ "outputs": [],
287
+ "source": [
288
+ "# Convert the text data to TF-IDF features\n",
289
+ "vectorizer = TfidfVectorizer(stop_words='english', max_features=100)\n",
290
+ "X_train_tfidf = vectorizer.fit_transform(X_train)\n",
291
+ "X_test_tfidf = vectorizer.transform(X_test)"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": 99,
297
+ "metadata": {},
298
+ "outputs": [
299
+ {
300
+ "data": {
301
+ "text/html": [
302
+ "<style>#sk-container-id-1 {\n",
303
+ " /* Definition of color scheme common for light and dark mode */\n",
304
+ " --sklearn-color-text: black;\n",
305
+ " --sklearn-color-line: gray;\n",
306
+ " /* Definition of color scheme for unfitted estimators */\n",
307
+ " --sklearn-color-unfitted-level-0: #fff5e6;\n",
308
+ " --sklearn-color-unfitted-level-1: #f6e4d2;\n",
309
+ " --sklearn-color-unfitted-level-2: #ffe0b3;\n",
310
+ " --sklearn-color-unfitted-level-3: chocolate;\n",
311
+ " /* Definition of color scheme for fitted estimators */\n",
312
+ " --sklearn-color-fitted-level-0: #f0f8ff;\n",
313
+ " --sklearn-color-fitted-level-1: #d4ebff;\n",
314
+ " --sklearn-color-fitted-level-2: #b3dbfd;\n",
315
+ " --sklearn-color-fitted-level-3: cornflowerblue;\n",
316
+ "\n",
317
+ " /* Specific color for light theme */\n",
318
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
319
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
320
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
321
+ " --sklearn-color-icon: #696969;\n",
322
+ "\n",
323
+ " @media (prefers-color-scheme: dark) {\n",
324
+ " /* Redefinition of color scheme for dark theme */\n",
325
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
326
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
327
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
328
+ " --sklearn-color-icon: #878787;\n",
329
+ " }\n",
330
+ "}\n",
331
+ "\n",
332
+ "#sk-container-id-1 {\n",
333
+ " color: var(--sklearn-color-text);\n",
334
+ "}\n",
335
+ "\n",
336
+ "#sk-container-id-1 pre {\n",
337
+ " padding: 0;\n",
338
+ "}\n",
339
+ "\n",
340
+ "#sk-container-id-1 input.sk-hidden--visually {\n",
341
+ " border: 0;\n",
342
+ " clip: rect(1px 1px 1px 1px);\n",
343
+ " clip: rect(1px, 1px, 1px, 1px);\n",
344
+ " height: 1px;\n",
345
+ " margin: -1px;\n",
346
+ " overflow: hidden;\n",
347
+ " padding: 0;\n",
348
+ " position: absolute;\n",
349
+ " width: 1px;\n",
350
+ "}\n",
351
+ "\n",
352
+ "#sk-container-id-1 div.sk-dashed-wrapped {\n",
353
+ " border: 1px dashed var(--sklearn-color-line);\n",
354
+ " margin: 0 0.4em 0.5em 0.4em;\n",
355
+ " box-sizing: border-box;\n",
356
+ " padding-bottom: 0.4em;\n",
357
+ " background-color: var(--sklearn-color-background);\n",
358
+ "}\n",
359
+ "\n",
360
+ "#sk-container-id-1 div.sk-container {\n",
361
+ " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
362
+ " but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
363
+ " so we also need the `!important` here to be able to override the\n",
364
+ " default hidden behavior on the sphinx rendered scikit-learn.org.\n",
365
+ " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
366
+ " display: inline-block !important;\n",
367
+ " position: relative;\n",
368
+ "}\n",
369
+ "\n",
370
+ "#sk-container-id-1 div.sk-text-repr-fallback {\n",
371
+ " display: none;\n",
372
+ "}\n",
373
+ "\n",
374
+ "div.sk-parallel-item,\n",
375
+ "div.sk-serial,\n",
376
+ "div.sk-item {\n",
377
+ " /* draw centered vertical line to link estimators */\n",
378
+ " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
379
+ " background-size: 2px 100%;\n",
380
+ " background-repeat: no-repeat;\n",
381
+ " background-position: center center;\n",
382
+ "}\n",
383
+ "\n",
384
+ "/* Parallel-specific style estimator block */\n",
385
+ "\n",
386
+ "#sk-container-id-1 div.sk-parallel-item::after {\n",
387
+ " content: \"\";\n",
388
+ " width: 100%;\n",
389
+ " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
390
+ " flex-grow: 1;\n",
391
+ "}\n",
392
+ "\n",
393
+ "#sk-container-id-1 div.sk-parallel {\n",
394
+ " display: flex;\n",
395
+ " align-items: stretch;\n",
396
+ " justify-content: center;\n",
397
+ " background-color: var(--sklearn-color-background);\n",
398
+ " position: relative;\n",
399
+ "}\n",
400
+ "\n",
401
+ "#sk-container-id-1 div.sk-parallel-item {\n",
402
+ " display: flex;\n",
403
+ " flex-direction: column;\n",
404
+ "}\n",
405
+ "\n",
406
+ "#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
407
+ " align-self: flex-end;\n",
408
+ " width: 50%;\n",
409
+ "}\n",
410
+ "\n",
411
+ "#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
412
+ " align-self: flex-start;\n",
413
+ " width: 50%;\n",
414
+ "}\n",
415
+ "\n",
416
+ "#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
417
+ " width: 0;\n",
418
+ "}\n",
419
+ "\n",
420
+ "/* Serial-specific style estimator block */\n",
421
+ "\n",
422
+ "#sk-container-id-1 div.sk-serial {\n",
423
+ " display: flex;\n",
424
+ " flex-direction: column;\n",
425
+ " align-items: center;\n",
426
+ " background-color: var(--sklearn-color-background);\n",
427
+ " padding-right: 1em;\n",
428
+ " padding-left: 1em;\n",
429
+ "}\n",
430
+ "\n",
431
+ "\n",
432
+ "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
433
+ "clickable and can be expanded/collapsed.\n",
434
+ "- Pipeline and ColumnTransformer use this feature and define the default style\n",
435
+ "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
436
+ "*/\n",
437
+ "\n",
438
+ "/* Pipeline and ColumnTransformer style (default) */\n",
439
+ "\n",
440
+ "#sk-container-id-1 div.sk-toggleable {\n",
441
+ " /* Default theme specific background. It is overwritten whether we have a\n",
442
+ " specific estimator or a Pipeline/ColumnTransformer */\n",
443
+ " background-color: var(--sklearn-color-background);\n",
444
+ "}\n",
445
+ "\n",
446
+ "/* Toggleable label */\n",
447
+ "#sk-container-id-1 label.sk-toggleable__label {\n",
448
+ " cursor: pointer;\n",
449
+ " display: block;\n",
450
+ " width: 100%;\n",
451
+ " margin-bottom: 0;\n",
452
+ " padding: 0.5em;\n",
453
+ " box-sizing: border-box;\n",
454
+ " text-align: center;\n",
455
+ "}\n",
456
+ "\n",
457
+ "#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
458
+ " /* Arrow on the left of the label */\n",
459
+ " content: \"▸\";\n",
460
+ " float: left;\n",
461
+ " margin-right: 0.25em;\n",
462
+ " color: var(--sklearn-color-icon);\n",
463
+ "}\n",
464
+ "\n",
465
+ "#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
466
+ " color: var(--sklearn-color-text);\n",
467
+ "}\n",
468
+ "\n",
469
+ "/* Toggleable content - dropdown */\n",
470
+ "\n",
471
+ "#sk-container-id-1 div.sk-toggleable__content {\n",
472
+ " max-height: 0;\n",
473
+ " max-width: 0;\n",
474
+ " overflow: hidden;\n",
475
+ " text-align: left;\n",
476
+ " /* unfitted */\n",
477
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
478
+ "}\n",
479
+ "\n",
480
+ "#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
481
+ " /* fitted */\n",
482
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
483
+ "}\n",
484
+ "\n",
485
+ "#sk-container-id-1 div.sk-toggleable__content pre {\n",
486
+ " margin: 0.2em;\n",
487
+ " border-radius: 0.25em;\n",
488
+ " color: var(--sklearn-color-text);\n",
489
+ " /* unfitted */\n",
490
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
491
+ "}\n",
492
+ "\n",
493
+ "#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
494
+ " /* fitted */\n",
495
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
496
+ "}\n",
497
+ "\n",
498
+ "#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
499
+ " /* Expand drop-down */\n",
500
+ " max-height: 200px;\n",
501
+ " max-width: 100%;\n",
502
+ " overflow: auto;\n",
503
+ "}\n",
504
+ "\n",
505
+ "#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
506
+ " content: \"▾\";\n",
507
+ "}\n",
508
+ "\n",
509
+ "/* Pipeline/ColumnTransformer-specific style */\n",
510
+ "\n",
511
+ "#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
512
+ " color: var(--sklearn-color-text);\n",
513
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
514
+ "}\n",
515
+ "\n",
516
+ "#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
517
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
518
+ "}\n",
519
+ "\n",
520
+ "/* Estimator-specific style */\n",
521
+ "\n",
522
+ "/* Colorize estimator box */\n",
523
+ "#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
524
+ " /* unfitted */\n",
525
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
526
+ "}\n",
527
+ "\n",
528
+ "#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
529
+ " /* fitted */\n",
530
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
531
+ "}\n",
532
+ "\n",
533
+ "#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
534
+ "#sk-container-id-1 div.sk-label label {\n",
535
+ " /* The background is the default theme color */\n",
536
+ " color: var(--sklearn-color-text-on-default-background);\n",
537
+ "}\n",
538
+ "\n",
539
+ "/* On hover, darken the color of the background */\n",
540
+ "#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
541
+ " color: var(--sklearn-color-text);\n",
542
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
543
+ "}\n",
544
+ "\n",
545
+ "/* Label box, darken color on hover, fitted */\n",
546
+ "#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
547
+ " color: var(--sklearn-color-text);\n",
548
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
549
+ "}\n",
550
+ "\n",
551
+ "/* Estimator label */\n",
552
+ "\n",
553
+ "#sk-container-id-1 div.sk-label label {\n",
554
+ " font-family: monospace;\n",
555
+ " font-weight: bold;\n",
556
+ " display: inline-block;\n",
557
+ " line-height: 1.2em;\n",
558
+ "}\n",
559
+ "\n",
560
+ "#sk-container-id-1 div.sk-label-container {\n",
561
+ " text-align: center;\n",
562
+ "}\n",
563
+ "\n",
564
+ "/* Estimator-specific */\n",
565
+ "#sk-container-id-1 div.sk-estimator {\n",
566
+ " font-family: monospace;\n",
567
+ " border: 1px dotted var(--sklearn-color-border-box);\n",
568
+ " border-radius: 0.25em;\n",
569
+ " box-sizing: border-box;\n",
570
+ " margin-bottom: 0.5em;\n",
571
+ " /* unfitted */\n",
572
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
573
+ "}\n",
574
+ "\n",
575
+ "#sk-container-id-1 div.sk-estimator.fitted {\n",
576
+ " /* fitted */\n",
577
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
578
+ "}\n",
579
+ "\n",
580
+ "/* on hover */\n",
581
+ "#sk-container-id-1 div.sk-estimator:hover {\n",
582
+ " /* unfitted */\n",
583
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
584
+ "}\n",
585
+ "\n",
586
+ "#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
587
+ " /* fitted */\n",
588
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
589
+ "}\n",
590
+ "\n",
591
+ "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
592
+ "\n",
593
+ "/* Common style for \"i\" and \"?\" */\n",
594
+ "\n",
595
+ ".sk-estimator-doc-link,\n",
596
+ "a:link.sk-estimator-doc-link,\n",
597
+ "a:visited.sk-estimator-doc-link {\n",
598
+ " float: right;\n",
599
+ " font-size: smaller;\n",
600
+ " line-height: 1em;\n",
601
+ " font-family: monospace;\n",
602
+ " background-color: var(--sklearn-color-background);\n",
603
+ " border-radius: 1em;\n",
604
+ " height: 1em;\n",
605
+ " width: 1em;\n",
606
+ " text-decoration: none !important;\n",
607
+ " margin-left: 1ex;\n",
608
+ " /* unfitted */\n",
609
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
610
+ " color: var(--sklearn-color-unfitted-level-1);\n",
611
+ "}\n",
612
+ "\n",
613
+ ".sk-estimator-doc-link.fitted,\n",
614
+ "a:link.sk-estimator-doc-link.fitted,\n",
615
+ "a:visited.sk-estimator-doc-link.fitted {\n",
616
+ " /* fitted */\n",
617
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
618
+ " color: var(--sklearn-color-fitted-level-1);\n",
619
+ "}\n",
620
+ "\n",
621
+ "/* On hover */\n",
622
+ "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
623
+ ".sk-estimator-doc-link:hover,\n",
624
+ "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
625
+ ".sk-estimator-doc-link:hover {\n",
626
+ " /* unfitted */\n",
627
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
628
+ " color: var(--sklearn-color-background);\n",
629
+ " text-decoration: none;\n",
630
+ "}\n",
631
+ "\n",
632
+ "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
633
+ ".sk-estimator-doc-link.fitted:hover,\n",
634
+ "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
635
+ ".sk-estimator-doc-link.fitted:hover {\n",
636
+ " /* fitted */\n",
637
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
638
+ " color: var(--sklearn-color-background);\n",
639
+ " text-decoration: none;\n",
640
+ "}\n",
641
+ "\n",
642
+ "/* Span, style for the box shown on hovering the info icon */\n",
643
+ ".sk-estimator-doc-link span {\n",
644
+ " display: none;\n",
645
+ " z-index: 9999;\n",
646
+ " position: relative;\n",
647
+ " font-weight: normal;\n",
648
+ " right: .2ex;\n",
649
+ " padding: .5ex;\n",
650
+ " margin: .5ex;\n",
651
+ " width: min-content;\n",
652
+ " min-width: 20ex;\n",
653
+ " max-width: 50ex;\n",
654
+ " color: var(--sklearn-color-text);\n",
655
+ " box-shadow: 2pt 2pt 4pt #999;\n",
656
+ " /* unfitted */\n",
657
+ " background: var(--sklearn-color-unfitted-level-0);\n",
658
+ " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
659
+ "}\n",
660
+ "\n",
661
+ ".sk-estimator-doc-link.fitted span {\n",
662
+ " /* fitted */\n",
663
+ " background: var(--sklearn-color-fitted-level-0);\n",
664
+ " border: var(--sklearn-color-fitted-level-3);\n",
665
+ "}\n",
666
+ "\n",
667
+ ".sk-estimator-doc-link:hover span {\n",
668
+ " display: block;\n",
669
+ "}\n",
670
+ "\n",
671
+ "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
672
+ "\n",
673
+ "#sk-container-id-1 a.estimator_doc_link {\n",
674
+ " float: right;\n",
675
+ " font-size: 1rem;\n",
676
+ " line-height: 1em;\n",
677
+ " font-family: monospace;\n",
678
+ " background-color: var(--sklearn-color-background);\n",
679
+ " border-radius: 1rem;\n",
680
+ " height: 1rem;\n",
681
+ " width: 1rem;\n",
682
+ " text-decoration: none;\n",
683
+ " /* unfitted */\n",
684
+ " color: var(--sklearn-color-unfitted-level-1);\n",
685
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
686
+ "}\n",
687
+ "\n",
688
+ "#sk-container-id-1 a.estimator_doc_link.fitted {\n",
689
+ " /* fitted */\n",
690
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
691
+ " color: var(--sklearn-color-fitted-level-1);\n",
692
+ "}\n",
693
+ "\n",
694
+ "/* On hover */\n",
695
+ "#sk-container-id-1 a.estimator_doc_link:hover {\n",
696
+ " /* unfitted */\n",
697
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
698
+ " color: var(--sklearn-color-background);\n",
699
+ " text-decoration: none;\n",
700
+ "}\n",
701
+ "\n",
702
+ "#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
703
+ " /* fitted */\n",
704
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
705
+ "}\n",
706
+ "</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LogisticRegression()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;LogisticRegression<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>LogisticRegression()</pre></div> </div></div></div></div>"
707
+ ],
708
+ "text/plain": [
709
+ "LogisticRegression()"
710
+ ]
711
+ },
712
+ "execution_count": 99,
713
+ "metadata": {},
714
+ "output_type": "execute_result"
715
+ }
716
+ ],
717
+ "source": [
718
+ "# Train a Logistic Regression model\n",
719
+ "model = LogisticRegression(max_iter=100)\n",
720
+ "model.fit(X_train_tfidf, y_train)"
721
+ ]
722
+ },
723
+ {
724
+ "cell_type": "code",
725
+ "execution_count": 100,
726
+ "metadata": {},
727
+ "outputs": [],
728
+ "source": [
729
+ "y_pred = model.predict(X_test_tfidf)"
730
+ ]
731
+ },
732
+ {
733
+ "cell_type": "code",
734
+ "execution_count": 105,
735
+ "metadata": {},
736
+ "outputs": [
737
+ {
738
+ "name": "stdout",
739
+ "output_type": "stream",
740
+ "text": [
741
+ "Accuracy: 0.7084\n",
742
+ "Classification Report:\n",
743
+ " precision recall f1-score support\n",
744
+ "\n",
745
+ " 0 0.72 0.80 0.76 427\n",
746
+ " 1 0.70 0.59 0.64 331\n",
747
+ "\n",
748
+ " accuracy 0.71 758\n",
749
+ " macro avg 0.71 0.70 0.70 758\n",
750
+ "weighted avg 0.71 0.71 0.70 758\n",
751
+ "\n"
752
+ ]
753
+ }
754
+ ],
755
+ "source": [
756
+ "# 7. Evaluate the model\n",
757
+ "accuracy = accuracy_score(y_test, y_pred)\n",
758
+ "print(f\"Accuracy: {accuracy:.4f}\")\n",
759
+ "print(\"Classification Report:\\n\", classification_report(y_test, y_pred)\n",
760
+ ")"
761
+ ]
762
+ },
763
+ {
764
+ "cell_type": "code",
765
+ "execution_count": 7,
766
+ "metadata": {},
767
+ "outputs": [
768
+ {
769
+ "data": {
770
+ "text/plain": [
771
+ "<bound method NDFrame.head of url \\\n",
772
+ "0 https://www.foxnews.com/lifestyle/jack-carrs-e... \n",
773
+ "1 https://www.foxnews.com/entertainment/bruce-wi... \n",
774
+ "2 https://www.foxnews.com/politics/blinken-meets... \n",
775
+ "3 https://www.foxnews.com/entertainment/emily-bl... \n",
776
+ "4 https://www.foxnews.com/media/the-view-co-host... \n",
777
+ "... ... \n",
778
+ "3784 https://www.nbcnews.com/politics/2024-election... \n",
779
+ "3785 https://www.nbcnews.com/select/shopping/best-a... \n",
780
+ "3786 https://www.nbcnews.com/select/shopping/best-v... \n",
781
+ "3787 https://www.nbcnews.com/politics/2024-election... \n",
782
+ "3788 https://www.nbcnews.com/select/shopping/white-... \n",
783
+ "\n",
784
+ " title label \n",
785
+ "0 Jack Carr recalls Gen. Eisenhower's D-Day memo... 0 \n",
786
+ "1 Bruce Willis, Demi Moore avoided doing one thi... 0 \n",
787
+ "2 Blinken meets Qatar PM, says Israeli actions a... 0 \n",
788
+ "3 Emily Blunt says her ‘toes curl’ when people t... 0 \n",
789
+ "4 'The View' co-host, CNN commentator Ana Navarr... 0 \n",
790
+ "... ... ... \n",
791
+ "3784 Trump's lawyers seek post-Election Day delay f... 1 \n",
792
+ "3785 How to treat acne scars and hyperpigmentation,... 1 \n",
793
+ "3786 7 best vegetarian and vegan meal delivery serv... 1 \n",
794
+ "3787 Trump says presidential civilian award is 'bet... 1 \n",
795
+ "3788 19 best white elephant and Secret Santa gift i... 1 \n",
796
+ "\n",
797
+ "[3789 rows x 3 columns]>"
798
+ ]
799
+ },
800
+ "execution_count": 7,
801
+ "metadata": {},
802
+ "output_type": "execute_result"
803
+ }
804
+ ],
805
+ "source": [
806
+ "df = pd.read_csv('fetched_headlines.csv')\n",
807
+ "df.head"
808
+ ]
809
+ },
810
+ {
811
+ "cell_type": "code",
812
+ "execution_count": 8,
813
+ "metadata": {},
814
+ "outputs": [
815
+ {
816
+ "data": {
817
+ "text/html": [
818
+ "<div>\n",
819
+ "<style scoped>\n",
820
+ " .dataframe tbody tr th:only-of-type {\n",
821
+ " vertical-align: middle;\n",
822
+ " }\n",
823
+ "\n",
824
+ " .dataframe tbody tr th {\n",
825
+ " vertical-align: top;\n",
826
+ " }\n",
827
+ "\n",
828
+ " .dataframe thead th {\n",
829
+ " text-align: right;\n",
830
+ " }\n",
831
+ "</style>\n",
832
+ "<table border=\"1\" class=\"dataframe\">\n",
833
+ " <thead>\n",
834
+ " <tr style=\"text-align: right;\">\n",
835
+ " <th></th>\n",
836
+ " <th>url</th>\n",
837
+ " <th>title</th>\n",
838
+ " <th>label</th>\n",
839
+ " <th>outlet</th>\n",
840
+ " </tr>\n",
841
+ " </thead>\n",
842
+ " <tbody>\n",
843
+ " <tr>\n",
844
+ " <th>0</th>\n",
845
+ " <td>https://www.foxnews.com/lifestyle/jack-carrs-e...</td>\n",
846
+ " <td>Jack Carr recalls Gen. Eisenhower's D-Day memo...</td>\n",
847
+ " <td>0</td>\n",
848
+ " <td>FoxNews</td>\n",
849
+ " </tr>\n",
850
+ " <tr>\n",
851
+ " <th>1</th>\n",
852
+ " <td>https://www.foxnews.com/entertainment/bruce-wi...</td>\n",
853
+ " <td>Bruce Willis, Demi Moore avoided doing one thi...</td>\n",
854
+ " <td>0</td>\n",
855
+ " <td>FoxNews</td>\n",
856
+ " </tr>\n",
857
+ " <tr>\n",
858
+ " <th>2</th>\n",
859
+ " <td>https://www.foxnews.com/politics/blinken-meets...</td>\n",
860
+ " <td>Blinken meets Qatar PM, says Israeli actions a...</td>\n",
861
+ " <td>0</td>\n",
862
+ " <td>FoxNews</td>\n",
863
+ " </tr>\n",
864
+ " <tr>\n",
865
+ " <th>3</th>\n",
866
+ " <td>https://www.foxnews.com/entertainment/emily-bl...</td>\n",
867
+ " <td>Emily Blunt says her ‘toes curl’ when people t...</td>\n",
868
+ " <td>0</td>\n",
869
+ " <td>FoxNews</td>\n",
870
+ " </tr>\n",
871
+ " <tr>\n",
872
+ " <th>4</th>\n",
873
+ " <td>https://www.foxnews.com/media/the-view-co-host...</td>\n",
874
+ " <td>'The View' co-host, CNN commentator Ana Navarr...</td>\n",
875
+ " <td>0</td>\n",
876
+ " <td>FoxNews</td>\n",
877
+ " </tr>\n",
878
+ " </tbody>\n",
879
+ "</table>\n",
880
+ "</div>"
881
+ ],
882
+ "text/plain": [
883
+ " url \\\n",
884
+ "0 https://www.foxnews.com/lifestyle/jack-carrs-e... \n",
885
+ "1 https://www.foxnews.com/entertainment/bruce-wi... \n",
886
+ "2 https://www.foxnews.com/politics/blinken-meets... \n",
887
+ "3 https://www.foxnews.com/entertainment/emily-bl... \n",
888
+ "4 https://www.foxnews.com/media/the-view-co-host... \n",
889
+ "\n",
890
+ " title label outlet \n",
891
+ "0 Jack Carr recalls Gen. Eisenhower's D-Day memo... 0 FoxNews \n",
892
+ "1 Bruce Willis, Demi Moore avoided doing one thi... 0 FoxNews \n",
893
+ "2 Blinken meets Qatar PM, says Israeli actions a... 0 FoxNews \n",
894
+ "3 Emily Blunt says her ‘toes curl’ when people t... 0 FoxNews \n",
895
+ "4 'The View' co-host, CNN commentator Ana Navarr... 0 FoxNews "
896
+ ]
897
+ },
898
+ "execution_count": 8,
899
+ "metadata": {},
900
+ "output_type": "execute_result"
901
+ }
902
+ ],
903
+ "source": [
904
+ "df['outlet'] = df['url'].apply(lambda x: 'FoxNews' if 'foxnews.com' in x else 'NBC')\n"
905
+ ]
906
+ },
907
+ {
908
+ "cell_type": "code",
909
+ "execution_count": 10,
910
+ "metadata": {},
911
+ "outputs": [
912
+ {
913
+ "data": {
914
+ "text/html": [
915
+ "<div>\n",
916
+ "<style scoped>\n",
917
+ " .dataframe tbody tr th:only-of-type {\n",
918
+ " vertical-align: middle;\n",
919
+ " }\n",
920
+ "\n",
921
+ " .dataframe tbody tr th {\n",
922
+ " vertical-align: top;\n",
923
+ " }\n",
924
+ "\n",
925
+ " .dataframe thead th {\n",
926
+ " text-align: right;\n",
927
+ " }\n",
928
+ "</style>\n",
929
+ "<table border=\"1\" class=\"dataframe\">\n",
930
+ " <thead>\n",
931
+ " <tr style=\"text-align: right;\">\n",
932
+ " <th></th>\n",
933
+ " <th>title</th>\n",
934
+ " <th>outlet</th>\n",
935
+ " <th>label</th>\n",
936
+ " </tr>\n",
937
+ " </thead>\n",
938
+ " <tbody>\n",
939
+ " <tr>\n",
940
+ " <th>0</th>\n",
941
+ " <td>Jack Carr recalls Gen. Eisenhower's D-Day memo...</td>\n",
942
+ " <td>FoxNews</td>\n",
943
+ " <td>1</td>\n",
944
+ " </tr>\n",
945
+ " <tr>\n",
946
+ " <th>1</th>\n",
947
+ " <td>Bruce Willis, Demi Moore avoided doing one thi...</td>\n",
948
+ " <td>FoxNews</td>\n",
949
+ " <td>1</td>\n",
950
+ " </tr>\n",
951
+ " <tr>\n",
952
+ " <th>2</th>\n",
953
+ " <td>Blinken meets Qatar PM, says Israeli actions a...</td>\n",
954
+ " <td>FoxNews</td>\n",
955
+ " <td>1</td>\n",
956
+ " </tr>\n",
957
+ " <tr>\n",
958
+ " <th>3</th>\n",
959
+ " <td>Emily Blunt says her ‘toes curl’ when people t...</td>\n",
960
+ " <td>FoxNews</td>\n",
961
+ " <td>1</td>\n",
962
+ " </tr>\n",
963
+ " <tr>\n",
964
+ " <th>4</th>\n",
965
+ " <td>'The View' co-host, CNN commentator Ana Navarr...</td>\n",
966
+ " <td>FoxNews</td>\n",
967
+ " <td>1</td>\n",
968
+ " </tr>\n",
969
+ " </tbody>\n",
970
+ "</table>\n",
971
+ "</div>"
972
+ ],
973
+ "text/plain": [
974
+ " title outlet label\n",
975
+ "0 Jack Carr recalls Gen. Eisenhower's D-Day memo... FoxNews 1\n",
976
+ "1 Bruce Willis, Demi Moore avoided doing one thi... FoxNews 1\n",
977
+ "2 Blinken meets Qatar PM, says Israeli actions a... FoxNews 1\n",
978
+ "3 Emily Blunt says her ‘toes curl’ when people t... FoxNews 1\n",
979
+ "4 'The View' co-host, CNN commentator Ana Navarr... FoxNews 1"
980
+ ]
981
+ },
982
+ "execution_count": 10,
983
+ "metadata": {},
984
+ "output_type": "execute_result"
985
+ }
986
+ ],
987
+ "source": [
988
+ "# Recompute label from outlet (1 = FoxNews, 0 = otherwise), then keep only title, outlet, label (url is dropped)\n",
989
+ "df['label'] = df['outlet'].apply(lambda x: 1 if x == 'FoxNews' else 0)\n",
990
+ "df = df[[ 'title', 'outlet', 'label']]\n",
991
+ "df.head()"
992
+ ]
993
+ },
994
+ {
995
+ "cell_type": "code",
996
+ "execution_count": 11,
997
+ "metadata": {},
998
+ "outputs": [],
999
+ "source": [
1000
+ "df.to_csv('train_data.csv', index=False)"
1001
+ ]
1002
+ },
1003
+ {
1004
+ "cell_type": "code",
1005
+ "execution_count": 12,
1006
+ "metadata": {},
1007
+ "outputs": [
1008
+ {
1009
+ "data": {
1010
+ "text/plain": [
1011
+ "array([<class 'str'>], dtype=object)"
1012
+ ]
1013
+ },
1014
+ "execution_count": 12,
1015
+ "metadata": {},
1016
+ "output_type": "execute_result"
1017
+ }
1018
+ ],
1019
+ "source": [
1020
+ "df['title'].apply(type).unique()"
1021
+ ]
1022
+ },
1023
+ {
1024
+ "cell_type": "code",
1025
+ "execution_count": null,
1026
+ "metadata": {},
1027
+ "outputs": [],
1028
+ "source": []
1029
+ },
1030
+ {
1031
+ "cell_type": "code",
1032
+ "execution_count": null,
1033
+ "metadata": {},
1034
+ "outputs": [],
1035
+ "source": []
1036
+ },
1037
+ {
1038
+ "cell_type": "code",
1039
+ "execution_count": null,
1040
+ "metadata": {},
1041
+ "outputs": [],
1042
+ "source": []
1043
+ }
1044
+ ],
1045
+ "metadata": {
1046
+ "kernelspec": {
1047
+ "display_name": ".venv",
1048
+ "language": "python",
1049
+ "name": "python3"
1050
+ },
1051
+ "language_info": {
1052
+ "codemirror_mode": {
1053
+ "name": "ipython",
1054
+ "version": 3
1055
+ },
1056
+ "file_extension": ".py",
1057
+ "mimetype": "text/x-python",
1058
+ "name": "python",
1059
+ "nbconvert_exporter": "python",
1060
+ "pygments_lexer": "ipython3",
1061
+ "version": "3.9.6"
1062
+ }
1063
+ },
1064
+ "nbformat": 4,
1065
+ "nbformat_minor": 2
1066
+ }
train_data.csv ADDED
The diff for this file is too large to render. See raw diff