Jiahuita committed on
Commit
ade1685
·
verified ·
1 Parent(s): 58c342a

Upload 3 files

Files changed (4)
  1. .gitattributes +1 -0
  2. combined_data.csv +3 -0
  3. lstm.ipynb +223 -0
  4. main.ipynb +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ combined_data.csv filter=lfs diff=lfs merge=lfs -text
combined_data.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a036654289f27cd973f6d8b2ac28932202021afb97b38f8b61c67c80aa88f300
+ size 28167352
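The three `+` lines above are the entire contents of the committed file: a Git LFS pointer that stands in for the ~28 MB CSV, whose actual bytes are stored out of band and addressed by their SHA-256 digest. As a minimal illustrative sketch of what the pointer encodes (the `parse_lfs_pointer` helper is hypothetical, not part of any git-lfs API):

```python
# Hypothetical helper: parse a Git LFS pointer file into its key/value fields.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

pointer = parse_lfs_pointer("combined_data.csv")  # the pointer file, not the CSV data
assert pointer["version"] == "https://git-lfs.github.com/spec/v1"
print(pointer["oid"])   # sha256:a036654289f2... (content address)
print(pointer["size"])  # 28167352 (bytes)
```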
lstm.ipynb ADDED
@@ -0,0 +1,223 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/10\n",
+ "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m383s\u001b[0m 48ms/step - accuracy: 0.7637 - loss: 0.4815 - val_accuracy: 0.8195 - val_loss: 0.3929 - learning_rate: 0.0010\n",
+ "Epoch 2/10\n",
+ "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m360s\u001b[0m 45ms/step - accuracy: 0.8561 - loss: 0.3267 - val_accuracy: 0.8256 - val_loss: 0.3854 - learning_rate: 0.0010\n",
+ "Epoch 3/10\n",
+ "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m373s\u001b[0m 47ms/step - accuracy: 0.8937 - loss: 0.2503 - val_accuracy: 0.8250 - val_loss: 0.4444 - learning_rate: 0.0010\n",
+ "Epoch 4/10\n",
+ "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m377s\u001b[0m 47ms/step - accuracy: 0.9269 - loss: 0.1794 - val_accuracy: 0.8173 - val_loss: 0.4580 - learning_rate: 0.0010\n",
+ "Epoch 5/10\n",
+ "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m385s\u001b[0m 48ms/step - accuracy: 0.9496 - loss: 0.1284 - val_accuracy: 0.8147 - val_loss: 0.5704 - learning_rate: 0.0010\n",
+ "\u001b[1m2213/2213\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m20s\u001b[0m 9ms/step - accuracy: 0.8228 - loss: 0.3848\n",
+ "Test Accuracy: 0.8214734792709351\n",
+ "\u001b[1m2213/2213\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m24s\u001b[0m 11ms/step\n",
+ "\n",
+ "Classification Report:\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.84 0.90 0.87 46733\n",
+ " 1 0.77 0.68 0.72 24052\n",
+ "\n",
+ " accuracy 0.82 70785\n",
+ " macro avg 0.81 0.79 0.79 70785\n",
+ "weighted avg 0.82 0.82 0.82 70785\n",
+ "\n",
+ "\n",
+ "Confusion Matrix:\n",
+ "[[41892 4841]\n",
+ " [ 7796 16256]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from tensorflow.keras.preprocessing.text import Tokenizer\n",
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
+ "from tensorflow.keras.models import Sequential\n",
+ "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout\n",
+ "from tensorflow.keras.utils import to_categorical\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "from sklearn.metrics import classification_report, confusion_matrix\n",
+ "from tensorflow.keras.callbacks import ReduceLROnPlateau, TensorBoard, EarlyStopping\n",
+ "\n",
+ "# load data\n",
+ "df = pd.read_csv('combined_data.csv')\n",
+ "\n",
+ "# Tokenize the text\n",
+ "tokenizer = Tokenizer()\n",
+ "tokenizer.fit_on_texts(df['title'])\n",
+ "X = tokenizer.texts_to_sequences(df['title'])\n",
+ "X = pad_sequences(X)\n",
+ "\n",
+ "# Encode the target variable\n",
+ "encoder = LabelEncoder()\n",
+ "y = encoder.fit_transform(df['source'])\n",
+ "y = to_categorical(y)\n",
+ "\n",
+ "# Split the data\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
+ "\n",
+ "# Build the LSTM model\n",
+ "model = Sequential()\n",
+ "model.add(Embedding(len(tokenizer.word_index) + 1, 128))\n",
+ "model.add(LSTM(128, return_sequences=True))\n",
+ "model.add(Dropout(0.5))\n",
+ "model.add(LSTM(64))\n",
+ "model.add(Dropout(0.5))\n",
+ "model.add(Dense(len(encoder.classes_), activation='softmax'))\n",
+ "model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\n",
+ "\n",
+ "# Learning rate scheduler\n",
+ "lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5)\n",
+ "\n",
+ "# TensorBoard callback for logging\n",
+ "tensorboard_callback = TensorBoard(log_dir='./logs', histogram_freq=1)\n",
+ "\n",
+ "# Early stopping to prevent overfitting\n",
+ "early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)\n",
+ "\n",
+ "# Train the model with callbacks\n",
+ "model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1, \n",
+ " callbacks=[lr_scheduler, tensorboard_callback, early_stopping])\n",
+ "\n",
+ "# Evaluate the model\n",
+ "loss, accuracy = model.evaluate(X_test, y_test)\n",
+ "print(f\"Test Accuracy: {accuracy}\")\n",
+ "\n",
+ "# Predictions and evaluation\n",
+ "y_pred = model.predict(X_test)\n",
+ "y_pred_classes = y_pred.argmax(axis=1)\n",
+ "y_test_classes = y_test.argmax(axis=1)\n",
+ "\n",
+ "print(\"\\nClassification Report:\")\n",
+ "print(classification_report(y_test_classes, y_pred_classes))\n",
+ "\n",
+ "print(\"\\nConfusion Matrix:\")\n",
+ "print(confusion_matrix(y_test_classes, y_pred_classes))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
+ ]
+ }
+ ],
+ "source": [
+ "# save model\n",
+ "model.save('news_classifier.h5')\n",
+ "\n",
+ "# save tokenizer\n",
+ "import pickle\n",
+ "with open('tokenizer.pickle', 'wb') as handle:\n",
+ " pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
+ " \n",
+ "# save encoder\n",
+ "with open('encoder.pickle', 'wb') as handle:\n",
+ " pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# deploy the model\n",
+ "# the user gives a title and the model predicts the source\n",
+ "# Load the model and tokenizer\n",
+ "from tensorflow.keras.models import load_model\n",
+ "import pickle\n",
+ "\n",
+ "# Load the tokenizer\n",
+ "with open('tokenizer.pickle', 'rb') as handle:\n",
+ " tokenizer = pickle.load(handle)\n",
+ "\n",
+ "# Load the encoder\n",
+ "with open('encoder.pickle', 'rb') as handle:\n",
+ " encoder = pickle.load(handle)\n",
+ "\n",
+ "\n",
+ "def predict_source(title):\n",
+ " # Load the model\n",
+ " model = load_model('news_classifier.h5')\n",
+ " # Tokenize the input\n",
+ " X = tokenizer.texts_to_sequences([title])\n",
+ " X = pad_sequences(X)\n",
+ " # Predict the source\n",
+ " y_pred = model.predict(X)\n",
+ " source = encoder.inverse_transform(y_pred.argmax(axis=1))\n",
+ " return source[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:absl:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 109ms/step\n",
+ "Predicted Source: foxnews\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Test the function\n",
+ "# user input\n",
+ "title = input(\"Enter the title: \")\n",
+ "source = predict_source(title)\n",
+ "print(f\"Predicted Source: {source}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
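The stderr warning in the second cell flags `.h5` as a legacy format. Following the warning's own recommendation, a minimal sketch of the alternative (only the extension changes):

```python
# Native Keras format, as the warning above recommends.
model.save('news_classifier.keras')

# Loading works the same way as with the HDF5 file.
from tensorflow.keras.models import load_model
model = load_model('news_classifier.keras')
```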
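Two caveats in `predict_source` as committed: it reloads the model from disk on every call, and it pads each input only to that input's own length, while training padded to the longest title in the corpus (the Embedding + LSTM stack accepts variable lengths, so this still runs). A sketch of a variant that loads the artifacts once and pins the pad length; `MAX_LEN` is a hypothetical placeholder that would be persisted as `X.shape[1]` from the training run:

```python
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_LEN = 64  # hypothetical; persist X.shape[1] from training instead

# Load artifacts once, outside the prediction function.
model = load_model('news_classifier.h5')
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
with open('encoder.pickle', 'rb') as handle:
    encoder = pickle.load(handle)

def predict_source(title: str) -> str:
    seq = tokenizer.texts_to_sequences([title])
    seq = pad_sequences(seq, maxlen=MAX_LEN)  # fixed length, matching training
    y_pred = model.predict(seq)
    return encoder.inverse_transform(y_pred.argmax(axis=1))[0]
```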
main.ipynb ADDED
The diff for this file is too large to render. See raw diff