Jiahuita committed on
Commit
ade1685
·
verified ·
1 Parent(s): 58c342a

Upload 3 files

Files changed (4)
  1. .gitattributes +1 -0
  2. combined_data.csv +3 -0
  3. lstm.ipynb +223 -0
  4. main.ipynb +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ combined_data.csv filter=lfs diff=lfs merge=lfs -text
combined_data.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a036654289f27cd973f6d8b2ac28932202021afb97b38f8b61c67c80aa88f300
+ size 28167352
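The three `+` lines above are the entire contents of the committed file: a Git LFS pointer that stands in for the ~28 MB CSV, whose actual bytes are stored out of band and addressed by their SHA-256 digest. As a minimal illustrative sketch of what the pointer encodes (the `parse_lfs_pointer` helper is hypothetical, not part of any git-lfs API):

```python
# Hypothetical helper: parse a Git LFS pointer file into its key/value fields.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

pointer = parse_lfs_pointer("combined_data.csv")  # the pointer file, not the CSV data
assert pointer["version"] == "https://git-lfs.github.com/spec/v1"
print(pointer["oid"])   # sha256:a036654289f2... (content address)
print(pointer["size"])  # 28167352 (bytes)
```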
lstm.ipynb ADDED
@@ -0,0 +1,223 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/10\n",
+ "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m383s\u001b[0m 48ms/step - accuracy: 0.7637 - loss: 0.4815 - val_accuracy: 0.8195 - val_loss: 0.3929 - learning_rate: 0.0010\n",
+ "Epoch 2/10\n",
+ "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m360s\u001b[0m 45ms/step - accuracy: 0.8561 - loss: 0.3267 - val_accuracy: 0.8256 - val_loss: 0.3854 - learning_rate: 0.0010\n",
+ "Epoch 3/10\n",
+ "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m373s\u001b[0m 47ms/step - accuracy: 0.8937 - loss: 0.2503 - val_accuracy: 0.8250 - val_loss: 0.4444 - learning_rate: 0.0010\n",
+ "Epoch 4/10\n",
+ "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m377s\u001b[0m 47ms/step - accuracy: 0.9269 - loss: 0.1794 - val_accuracy: 0.8173 - val_loss: 0.4580 - learning_rate: 0.0010\n",
+ "Epoch 5/10\n",
+ "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m385s\u001b[0m 48ms/step - accuracy: 0.9496 - loss: 0.1284 - val_accuracy: 0.8147 - val_loss: 0.5704 - learning_rate: 0.0010\n",
+ "\u001b[1m2213/2213\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m20s\u001b[0m 9ms/step - accuracy: 0.8228 - loss: 0.3848\n",
+ "Test Accuracy: 0.8214734792709351\n",
+ "\u001b[1m2213/2213\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m24s\u001b[0m 11ms/step\n",
+ "\n",
+ "Classification Report:\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.84 0.90 0.87 46733\n",
+ " 1 0.77 0.68 0.72 24052\n",
+ "\n",
+ " accuracy 0.82 70785\n",
+ " macro avg 0.81 0.79 0.79 70785\n",
+ "weighted avg 0.82 0.82 0.82 70785\n",
+ "\n",
+ "\n",
+ "Confusion Matrix:\n",
+ "[[41892 4841]\n",
+ " [ 7796 16256]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from tensorflow.keras.preprocessing.text import Tokenizer\n",
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
+ "from tensorflow.keras.models import Sequential\n",
+ "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout\n",
+ "from tensorflow.keras.utils import to_categorical\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "from sklearn.metrics import classification_report, confusion_matrix\n",
+ "from tensorflow.keras.callbacks import ReduceLROnPlateau, TensorBoard, EarlyStopping\n",
+ "\n",
+ "# load data\n",
+ "df = pd.read_csv('combined_data.csv')\n",
+ "\n",
+ "# Tokenize the text\n",
+ "tokenizer = Tokenizer()\n",
+ "tokenizer.fit_on_texts(df['title'])\n",
+ "X = tokenizer.texts_to_sequences(df['title'])\n",
+ "X = pad_sequences(X)\n",
+ "\n",
+ "# Encode the target variable\n",
+ "encoder = LabelEncoder()\n",
+ "y = encoder.fit_transform(df['source'])\n",
+ "y = to_categorical(y)\n",
+ "\n",
+ "# Split the data\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
+ "\n",
+ "# Build the LSTM model\n",
+ "model = Sequential()\n",
+ "model.add(Embedding(len(tokenizer.word_index) + 1, 128))\n",
+ "model.add(LSTM(128, return_sequences=True))\n",
+ "model.add(Dropout(0.5))\n",
+ "model.add(LSTM(64))\n",
+ "model.add(Dropout(0.5))\n",
+ "model.add(Dense(len(encoder.classes_), activation='softmax'))\n",
+ "model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\n",
+ "\n",
+ "# Learning rate scheduler\n",
+ "lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5)\n",
+ "\n",
+ "# TensorBoard callback for logging\n",
+ "tensorboard_callback = TensorBoard(log_dir='./logs', histogram_freq=1)\n",
+ "\n",
+ "# Early stopping to prevent overfitting\n",
+ "early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)\n",
+ "\n",
+ "# Train the model with callbacks\n",
+ "model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1, \n",
+ " callbacks=[lr_scheduler, tensorboard_callback, early_stopping])\n",
+ "\n",
+ "# Evaluate the model\n",
+ "loss, accuracy = model.evaluate(X_test, y_test)\n",
+ "print(f\"Test Accuracy: {accuracy}\")\n",
+ "\n",
+ "# Predictions and evaluation\n",
+ "y_pred = model.predict(X_test)\n",
+ "y_pred_classes = y_pred.argmax(axis=1)\n",
+ "y_test_classes = y_test.argmax(axis=1)\n",
+ "\n",
+ "print(\"\\nClassification Report:\")\n",
+ "print(classification_report(y_test_classes, y_pred_classes))\n",
+ "\n",
+ "print(\"\\nConfusion Matrix:\")\n",
+ "print(confusion_matrix(y_test_classes, y_pred_classes))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
+ ]
+ }
+ ],
+ "source": [
+ "# save model\n",
+ "model.save('news_classifier.h5')\n",
+ "\n",
+ "# save tokenizer\n",
+ "import pickle\n",
+ "with open('tokenizer.pickle', 'wb') as handle:\n",
+ " pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
+ " \n",
+ "# save encoder\n",
+ "with open('encoder.pickle', 'wb') as handle:\n",
+ " pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# deploy the model\n",
+ "# the user gives a title and the model predicts the source\n",
+ "# Load the model and tokenizer\n",
+ "from tensorflow.keras.models import load_model\n",
+ "import pickle\n",
+ "\n",
+ "# Load the tokenizer\n",
+ "with open('tokenizer.pickle', 'rb') as handle:\n",
+ " tokenizer = pickle.load(handle)\n",
+ "\n",
+ "# Load the encoder\n",
+ "with open('encoder.pickle', 'rb') as handle:\n",
+ " encoder = pickle.load(handle)\n",
+ "\n",
+ "\n",
+ "def predict_source(title):\n",
+ " # Load the model\n",
+ " model = load_model('news_classifier.h5')\n",
+ " # Tokenize the input\n",
+ " X = tokenizer.texts_to_sequences([title])\n",
+ " X = pad_sequences(X)\n",
+ " # Predict the source\n",
+ " y_pred = model.predict(X)\n",
+ " source = encoder.inverse_transform(y_pred.argmax(axis=1))\n",
+ " return source[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:absl:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 109ms/step\n",
+ "Predicted Source: foxnews\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Test the function\n",
+ "# user input\n",
+ "title = input(\"Enter the title: \")\n",
+ "source = predict_source(title)\n",
+ "print(f\"Predicted Source: {source}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
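The stderr warning in the second cell flags `.h5` as a legacy format. Following the warning's own recommendation, a minimal sketch of the alternative (only the extension changes):

```python
# Native Keras format, as the warning above recommends.
model.save('news_classifier.keras')

# Loading works the same way as with the HDF5 file.
from tensorflow.keras.models import load_model
model = load_model('news_classifier.keras')
```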
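Two caveats in `predict_source` as committed: it reloads the model from disk on every call, and it pads each input only to that input's own length, while training padded to the longest title in the corpus (the Embedding + LSTM stack accepts variable lengths, so this still runs). A sketch of a variant that loads the artifacts once and pins the pad length; `MAX_LEN` is a hypothetical placeholder that would be persisted as `X.shape[1]` from the training run:

```python
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_LEN = 64  # hypothetical; persist X.shape[1] from training instead

# Load artifacts once, outside the prediction function.
model = load_model('news_classifier.h5')
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
with open('encoder.pickle', 'rb') as handle:
    encoder = pickle.load(handle)

def predict_source(title: str) -> str:
    seq = tokenizer.texts_to_sequences([title])
    seq = pad_sequences(seq, maxlen=MAX_LEN)  # fixed length, matching training
    y_pred = model.predict(seq)
    return encoder.inverse_transform(y_pred.argmax(axis=1))[0]
```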
main.ipynb ADDED
The diff for this file is too large to render. See raw diff