{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "db772bcc", "metadata": {}, "outputs": [], "source": [ "# Data handling\n", "import pandas as pd\n", "import numpy as np \n", "\n", "\n", "# EDA (pandas-profiling, etc. )\n", "...\n", "\n", "# Feature Processing (Scikit-learn processing, etc. )\n", "from sklearn import preprocessing\n", "\n", "# Machine Learning (Scikit-learn Estimators, Catboost, LightGBM, etc. )\n", "...\n", "\n", "# Hyperparameters Fine-tuning (Scikit-learn hp search, cross-validation, etc. )\n", "...\n", "\n", "# Other packages\n", "import os\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "#display all columns and rows \n", "pd.set_option('display.max_columns', None)\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "d80b4220", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Class counts before SMOTE: No 4111\n", "Yes 1505\n", "Name: Churn, dtype: int64\n", "Class counts after SMOTE: Yes 4111\n", "No 4111\n", "Name: Churn, dtype: int64\n", "AdaBoost Classifier: 0.9019360028118717\n", "Logistic Regression Classifier: 0.8608679697080713\n", "Random Forest Classifier: 0.9311295690912422\n", "Gradient Boosting Classifier: 0.9235269779240596\n", "SVM Classifier: 0.8944493562575639\n", "Best model: Random Forest Classifier\n", "AdaBoost Classifier classification report:\n", " precision recall f1-score support\n", "\n", " No 0.90 0.76 0.82 1053\n", " Yes 0.50 0.74 0.60 352\n", "\n", " accuracy 0.75 1405\n", " macro avg 0.70 0.75 0.71 1405\n", "weighted avg 0.80 0.75 0.77 1405\n", "\n", "\n", "Logistic Regression Classifier classification report:\n", " precision recall f1-score support\n", "\n", " No 0.92 0.73 0.81 1053\n", " Yes 0.49 0.80 0.61 352\n", "\n", " accuracy 0.74 1405\n", " macro avg 0.70 0.76 0.71 1405\n", "weighted avg 0.81 0.74 0.76 1405\n", "\n", "\n", "Random Forest Classifier classification report:\n", " precision recall f1-score support\n", "\n", " No 0.86 0.84 0.85 1053\n", " Yes 0.56 0.61 0.58 352\n", "\n", " accuracy 0.78 1405\n", " macro avg 0.71 0.72 0.72 1405\n", "weighted avg 0.79 0.78 0.79 1405\n", "\n", "\n", "Gradient Boosting Classifier classification report:\n", " precision recall f1-score support\n", "\n", " No 0.89 0.80 0.84 1053\n", " Yes 0.54 0.69 0.60 352\n", "\n", " accuracy 0.77 1405\n", " macro avg 0.71 0.74 0.72 1405\n", "weighted avg 0.80 0.77 0.78 1405\n", "\n", "\n", "SVM Classifier classification report:\n", " precision recall f1-score support\n", "\n", " No 0.89 0.77 0.83 1053\n", " Yes 0.52 0.73 0.60 352\n", "\n", " accuracy 0.76 1405\n", " macro avg 0.71 0.75 0.72 1405\n", "weighted avg 0.80 0.76 0.77 1405\n", "\n", "\n" ] } ], "source": [ "# For CSV, use pandas.read_csv\n", "\n", "df = pd.read_csv(\"Telco-Customer-Churn.csv\")\n", "df.drop(['customerID'], axis=1, inplace=True)\n", "# Coerce the conversion of TotalCharges column to float\n", "df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')\n", "# Remove the duplicate rows\n", "df = df.drop_duplicates()\n", "\n", "cols_to_replace = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'MultipleLines']\n", "df[cols_to_replace] = df[cols_to_replace].replace('No internet service', 'No').replace('No phone service', 'No')\n", "\n", "\n", "from sklearn.model_selection import train_test_split\n", "\n", "# split the data into features (X) and target variable (y)\n", "X = df.drop('Churn', axis=1)\n", "y = df['Churn']\n", "\n", "# split the data into train and test sets\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", "\n", "# Identify numeric and non-numeric columns\n", "num_cols = X.select_dtypes(include=[np.number]).columns.tolist()\n", "cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()\n", "\n", "\n", "'''creating copy of the categorical features and numerical features\n", "before imputing null value to avoid modifying the orginal dataset'''\n", "\n", "X_train_cat = X_train[cat_cols].copy()\n", "X_train_num = X_train[num_cols].copy()\n", "\n", "X_test_cat = X_test[cat_cols].copy()\n", "X_test_num = X_test[num_cols].copy()\n", "\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.compose import ColumnTransformer\n", "\n", "# Creating imputer variables\n", "numerical_imputer = SimpleImputer(strategy = \"mean\")\n", "categorical_imputer = SimpleImputer(strategy = \"most_frequent\")\n", "\n", "\n", "# Define the column transformer\n", "categorical_features = cat_cols\n", "categorical_transformer = Pipeline(steps=[\n", " ('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto', sparse=False))\n", "])\n", "preprocessor = ColumnTransformer(\n", " transformers=[\n", " ('cat', categorical_transformer, categorical_features)\n", " ])\n", "\n", "# Fitting the Imputer\n", "X_train_cat_imputed = categorical_imputer.fit_transform(X_train_cat)\n", "X_train_num_imputed = numerical_imputer.fit_transform(X_train_num)\n", "\n", "X_test_cat_imputed = categorical_imputer.fit_transform(X_test_cat)\n", "X_test_num_imputed = numerical_imputer.fit_transform(X_test_num)\n", "\n", "encoder=OneHotEncoder(handle_unknown='ignore')\n", "\n", "# encoding the xtrain categories and converting to a dataframe\n", "X_train_cat_encoded = encoder.fit(X_train_cat_imputed)\n", "X_train_cat_encoded = pd.DataFrame(encoder.transform(X_train_cat_imputed).toarray(),\n", " columns=encoder.get_feature_names_out(cat_cols))\n", "\n", "# encoding the xeval categories and converting to a dataframe\n", "X_test_cat_encoded = encoder.fit(X_test_cat_imputed)\n", "X_test_cat_encoded = pd.DataFrame(encoder.transform(X_test_cat_imputed).toarray(),\n", " columns=encoder.get_feature_names_out(cat_cols))\n", "\n", "\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "scaler= StandardScaler()\n", "\n", "X_train_num_scaled = scaler.fit_transform(X_train_num_imputed)\n", "X_train_num_sc = pd.DataFrame(X_train_num_scaled, columns = num_cols)\n", "\n", "X_test_num_scaled = scaler.fit_transform(X_test_num_imputed)\n", "X_test_num_sc = pd.DataFrame(X_test_num_scaled, columns = num_cols)\n", "\n", "X_train_df = pd.concat([X_train_num_sc,X_train_cat_encoded], axis =1)\n", "X_test_df = pd.concat([X_test_num_sc,X_test_cat_encoded], axis =1)\n", "\n", "\n", "#Training over SMOTE-balanced data with roc_auc scoring \n", "\n", "\n", "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.svm import SVC\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.metrics import roc_auc_score\n", "from imblearn.over_sampling import SMOTE\n", "\n", "# initialize SMOTE\n", "sm = SMOTE(random_state=42)\n", "\n", "# fit SMOTE on the training data and resample it\n", "X_train_resampled, y_train_resampled = sm.fit_resample(X_train_df, y_train)\n", "\n", "# print class counts before and after SMOTE\n", "print(f'Class counts before SMOTE: {y_train.value_counts()}')\n", "print(f'Class counts after SMOTE: {y_train_resampled.value_counts()}')\n", "\n", "# create a dictionary of models to fit\n", "models = {\n", " 'AdaBoost Classifier': AdaBoostClassifier(),\n", " 'Logistic Regression Classifier': LogisticRegression(),\n", " 'Random Forest Classifier': RandomForestClassifier(),\n", " 'Gradient Boosting Classifier': GradientBoostingClassifier(),\n", " 'SVM Classifier': SVC(probability=True)\n", "}\n", "\n", "# iterate over the models and fit each one to the resampled training data\n", "for name, model in models.items():\n", " model.fit(X_train_resampled, y_train_resampled)\n", " \n", "# evaluate each model using cross-validation based on ROC-AUC\n", "roc_auc_scores = {}\n", "for name, model in models.items():\n", " scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='roc_auc')\n", " roc_auc_scores[name] = scores.mean()\n", " \n", "# print the ROC-AUC scores for each model\n", "for name, score in roc_auc_scores.items():\n", " print(f'{name}: {score}')\n", "\n", "# choose the model with the highest ROC-AUC score\n", "best_model_name = max(roc_auc_scores, key=roc_auc_scores.get)\n", "best_model = models[best_model_name]\n", "print(f'Best model: {best_model_name}')\n", "\n", "from sklearn.metrics import classification_report\n", "\n", "# iterate over the models and make predictions on the test data for each one\n", "for name, model in models.items():\n", " # fit the model to the resampled training data\n", " model.fit(X_train_resampled, y_train_resampled)\n", " # make predictions on the test data\n", " y_pred = model.predict(X_test_df)\n", " # generate the classification report\n", " report = classification_report(y_test, y_pred)\n", " # print the classification report\n", " print(f'{name} classification report:\\n{report}\\n')\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "4aab6799", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['SeniorCitizen',\n", " 'tenure',\n", " 'MonthlyCharges',\n", " 'TotalCharges',\n", " 'gender_Female',\n", " 'gender_Male',\n", " 'Partner_No',\n", " 'Partner_Yes',\n", " 'Dependents_No',\n", " 'Dependents_Yes',\n", " 'PhoneService_No',\n", " 'PhoneService_Yes',\n", " 'MultipleLines_No',\n", " 'MultipleLines_Yes',\n", " 'InternetService_DSL',\n", " 'InternetService_Fiber optic',\n", " 'InternetService_No',\n", " 'OnlineSecurity_No',\n", " 'OnlineSecurity_Yes',\n", " 'OnlineBackup_No',\n", " 'OnlineBackup_Yes',\n", " 'DeviceProtection_No',\n", " 'DeviceProtection_Yes',\n", " 'TechSupport_No',\n", " 'TechSupport_Yes',\n", " 'StreamingTV_No',\n", " 'StreamingTV_Yes',\n", " 'StreamingMovies_No',\n", " 'StreamingMovies_Yes',\n", " 'Contract_Month-to-month',\n", " 'Contract_One year',\n", " 'Contract_Two year',\n", " 'PaperlessBilling_No',\n", " 'PaperlessBilling_Yes',\n", " 'PaymentMethod_Bank transfer (automatic)',\n", " 'PaymentMethod_Credit card (automatic)',\n", " 'PaymentMethod_Electronic check',\n", " 'PaymentMethod_Mailed check']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train_df.columns.tolist()" ] }, { "cell_type": "code", "execution_count": 8, "id": "d53e6b9e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Column 'gender' categories: ['Female' 'Male']\n", "Column 'SeniorCitizen' categories: [0 1]\n", "Column 'Partner' categories: ['Yes' 'No']\n", "Column 'Dependents' categories: ['No' 'Yes']\n", "Column 'tenure' categories: [ 1 34 2 45 8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27\n", " 5 46 11 70 63 43 15 60 18 66 9 3 31 50 64 56 7 42 35 48 29 65 38 68\n", " 32 55 37 36 41 6 4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26 0\n", " 39]\n", "Column 'PhoneService' categories: ['No' 'Yes']\n", "Column 'MultipleLines' categories: ['No' 'Yes']\n", "Column 'InternetService' categories: ['DSL' 'Fiber optic' 'No']\n", "Column 'OnlineSecurity' categories: ['No' 'Yes']\n", "Column 'OnlineBackup' categories: ['Yes' 'No']\n", "Column 'DeviceProtection' categories: ['No' 'Yes']\n", "Column 'TechSupport' categories: ['No' 'Yes']\n", "Column 'StreamingTV' categories: ['No' 'Yes']\n", "Column 'StreamingMovies' categories: ['No' 'Yes']\n", "Column 'Contract' categories: ['Month-to-month' 'One year' 'Two year']\n", "Column 'PaperlessBilling' categories: ['Yes' 'No']\n", "Column 'PaymentMethod' categories: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'\n", " 'Credit card (automatic)']\n", "Column 'MonthlyCharges' categories: [29.85 56.95 53.85 ... 63.1 44.2 78.7 ]\n", "Column 'TotalCharges' categories: [ 29.85 1889.5 108.15 ... 346.45 306.6 6844.5 ]\n" ] } ], "source": [ "for col in X.columns:\n", " print(f\"Column '{col}' categories: {X[col].unique()}\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "b6f7708a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best model: Random Forest Classifier\n" ] } ], "source": [ "best_model_name = 'Random Forest Classifier'\n", "\n", "best_model = models[best_model_name]\n", "\n", "print(f'Best model: {best_model_name}')" ] }, { "cell_type": "code", "execution_count": 9, "id": "2adb8c7e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " No 0.85 0.86 0.86 1053\n", " Yes 0.57 0.56 0.56 352\n", "\n", " accuracy 0.78 1405\n", " macro avg 0.71 0.71 0.71 1405\n", "weighted avg 0.78 0.78 0.78 1405\n", "\n" ] } ], "source": [ "# Calculate the class weights\n", "class_weight = {\"No\": 1, \"Yes\": 10}\n", "\n", "# Initialize Logistic Regression model with class weights\n", "rf = RandomForestClassifier(class_weight=class_weight)\n", "\n", "# Fit the model to the training data\n", "rf.fit(X_train_resampled, y_train_resampled)\n", "\n", "# Predict the labels of the test set\n", "y_pred = rf.predict(X_test_df)\n", "\n", "# Generate the classification report\n", "report = classification_report(y_test, y_pred)\n", "print(report)" ] }, { "cell_type": "code", "execution_count": 4, "id": "3ca066e7", "metadata": { "scrolled": true }, "outputs": [], "source": [ "from joblib import dump\n", "import os\n", "\n", "# set the destination path to the \"export\" directory\n", "destination = \".\"\n", "\n", "# create a dictionary to store the objects and their filenames\n", "models = {\"numerical_imputer\": numerical_imputer,\n", " \"categorical_imputer\": categorical_imputer,\n", " \"encoder\": encoder,\n", " \"scaler\": scaler,\n", " \"Final_model\": best_model}\n", "\n", "# loop through the models and save them using joblib.dump()\n", "for name, model in models.items():\n", " dump(model, os.path.join(destination, f\"{name}.joblib\"))\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "2185d2f9", "metadata": {}, "outputs": [], "source": [ "#!pip freeze > requirements.txt" ] }, { "cell_type": "code", "execution_count": 6, "id": "8117c959", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO: Successfully saved requirements file in .\\requirements.txt\n" ] } ], "source": [ "!pipreqs . --force" ] }, { "cell_type": "code", "execution_count": 11, "id": "33af820b", "metadata": {}, "outputs": [], "source": [ "#!pip list --format=freeze > requirements.txt" ] }, { "cell_type": "code", "execution_count": 5, "id": "816b3fe9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "numerical_imputer saved successfully!\n", "categorical_imputer saved successfully!\n", "encoder saved successfully!\n", "scaler saved successfully!\n", "Final_model saved successfully!\n" ] } ], "source": [ "for name, model in models.items():\n", " dump(model, os.path.join(destination, f\"{name}.joblib\"))\n", " if os.path.exists(os.path.join(destination, f\"{name}.joblib\")):\n", " print(f\"{name} saved successfully!\")\n", " else:\n", " print(f\"{name} failed to save.\")\n" ] }, { "cell_type": "code", "execution_count": 90, "id": "5143eadb", "metadata": {}, "outputs": [], "source": [ "destination = \".\"\n", "numerical_imputer = joblib.load(os.path.join(destination, \"numerical_imputer.joblib\"))\n", "categorical_imputer = joblib.load(os.path.join(destination, \"categorical_imputer.joblib\"))\n", "encoder = joblib.load(os.path.join(destination, \"encoder.joblib\"))\n", "scaler = joblib.load(os.path.join(destination, \"scaler.joblib\"))\n", "best_model = joblib.load(os.path.join(destination, \"Final_model.joblib\"))\n", "\n", "loaded_models = {\"numerical_imputer\": numerical_imputer,\n", " \"categorical_imputer\": categorical_imputer,\n", " \"encoder\": encoder,\n", " \"scaler\": scaler,\n", " \"Final_model\": best_model}\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }