In [1]:
# Data handling
import pandas as pd
import numpy as np 


# EDA (pandas-profiling, etc. )
...

# Feature Processing (Scikit-learn processing, etc. )
from sklearn import preprocessing

# Machine Learning (Scikit-learn Estimators, Catboost, LightGBM, etc. )
...

# Hyperparameters Fine-tuning (Scikit-learn hp search, cross-validation, etc. )
...

# Other packages
import os
import warnings
warnings.filterwarnings('ignore')

#display all columns and rows 
pd.set_option('display.max_columns', None)


In [2]:
# For CSV, use pandas.read_csv

df = pd.read_csv("Telco-Customer-Churn.csv")
df.drop(['customerID'], axis=1, inplace=True)
# Coerce the conversion of TotalCharges column to float
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Remove the duplicate rows
df = df.drop_duplicates()

cols_to_replace = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'MultipleLines']
df[cols_to_replace] = df[cols_to_replace].replace('No internet service', 'No').replace('No phone service', 'No')


from sklearn.model_selection import train_test_split

# split the data into features (X) and target variable (y)
X = df.drop('Churn', axis=1)
y = df['Churn']

# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numeric and non-numeric columns
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()


'''creating copy of the categorical features and numerical features
before imputing null value to avoid modifying the orginal dataset'''

X_train_cat = X_train[cat_cols].copy()
X_train_num = X_train[num_cols].copy()

X_test_cat = X_test[cat_cols].copy()
X_test_num = X_test[num_cols].copy()

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Creating imputer variables
numerical_imputer = SimpleImputer(strategy = "mean")
categorical_imputer = SimpleImputer(strategy = "most_frequent")


# Define the column transformer
categorical_features = cat_cols
categorical_transformer = Pipeline(steps=[
 ('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto', sparse=False))
])
preprocessor = ColumnTransformer(
 transformers=[
 ('cat', categorical_transformer, categorical_features)
 ])

# Fitting the Imputer
X_train_cat_imputed = categorical_imputer.fit_transform(X_train_cat)
X_train_num_imputed = numerical_imputer.fit_transform(X_train_num)

X_test_cat_imputed = categorical_imputer.fit_transform(X_test_cat)
X_test_num_imputed = numerical_imputer.fit_transform(X_test_num)

encoder=OneHotEncoder(handle_unknown='ignore')

# encoding the xtrain categories and converting to a dataframe
X_train_cat_encoded = encoder.fit(X_train_cat_imputed)
X_train_cat_encoded = pd.DataFrame(encoder.transform(X_train_cat_imputed).toarray(),
 columns=encoder.get_feature_names_out(cat_cols))

# encoding the xeval categories and converting to a dataframe
X_test_cat_encoded = encoder.fit(X_test_cat_imputed)
X_test_cat_encoded = pd.DataFrame(encoder.transform(X_test_cat_imputed).toarray(),
 columns=encoder.get_feature_names_out(cat_cols))


from sklearn.preprocessing import StandardScaler

scaler= StandardScaler()

X_train_num_scaled = scaler.fit_transform(X_train_num_imputed)
X_train_num_sc = pd.DataFrame(X_train_num_scaled, columns = num_cols)

X_test_num_scaled = scaler.fit_transform(X_test_num_imputed)
X_test_num_sc = pd.DataFrame(X_test_num_scaled, columns = num_cols)

X_train_df = pd.concat([X_train_num_sc,X_train_cat_encoded], axis =1)
X_test_df = pd.concat([X_test_num_sc,X_test_cat_encoded], axis =1)


#Training over SMOTE-balanced data with roc_auc scoring 


from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE

# initialize SMOTE
sm = SMOTE(random_state=42)

# fit SMOTE on the training data and resample it
X_train_resampled, y_train_resampled = sm.fit_resample(X_train_df, y_train)

# print class counts before and after SMOTE
print(f'Class counts before SMOTE: {y_train.value_counts()}')
print(f'Class counts after SMOTE: {y_train_resampled.value_counts()}')

# create a dictionary of models to fit
models = {
 'AdaBoost Classifier': AdaBoostClassifier(),
 'Logistic Regression Classifier': LogisticRegression(),
 'Random Forest Classifier': RandomForestClassifier(),
 'Gradient Boosting Classifier': GradientBoostingClassifier(),
 'SVM Classifier': SVC(probability=True)
}

# iterate over the models and fit each one to the resampled training data
for name, model in models.items():
 model.fit(X_train_resampled, y_train_resampled)
 
# evaluate each model using cross-validation based on ROC-AUC
roc_auc_scores = {}
for name, model in models.items():
 scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='roc_auc')
 roc_auc_scores[name] = scores.mean()
 
# print the ROC-AUC scores for each model
for name, score in roc_auc_scores.items():
 print(f'{name}: {score}')

# choose the model with the highest ROC-AUC score
best_model_name = max(roc_auc_scores, key=roc_auc_scores.get)
best_model = models[best_model_name]
print(f'Best model: {best_model_name}')

from sklearn.metrics import classification_report

# iterate over the models and make predictions on the test data for each one
for name, model in models.items():
 # fit the model to the resampled training data
 model.fit(X_train_resampled, y_train_resampled)
 # make predictions on the test data
 y_pred = model.predict(X_test_df)
 # generate the classification report
 report = classification_report(y_test, y_pred)
 # print the classification report
 print(f'{name} classification report:\n{report}\n')


Class counts before SMOTE: No 4111
Yes 1505
Name: Churn, dtype: int64
Class counts after SMOTE: Yes 4111
No 4111
Name: Churn, dtype: int64
AdaBoost Classifier: 0.9019360028118717
Logistic Regression Classifier: 0.8608679697080713
Random Forest Classifier: 0.9311295690912422
Gradient Boosting Classifier: 0.9235269779240596
SVM Classifier: 0.8944493562575639
Best model: Random Forest Classifier
AdaBoost Classifier classification report:
 precision recall f1-score support

 No 0.90 0.76 0.82 1053
 Yes 0.50 0.74 0.60 352

 accuracy 0.75 1405
 macro avg 0.70 0.75 0.71 1405
weighted avg 0.80 0.75 0.77 1405


Logistic Regression Classifier classification report:
 precision recall f1-score support

 No 0.92 0.73 0.81 1053
 Yes 0.49 0.80 0.61 352

 accuracy 0.74 1405
 macro avg 0.70 0.76 0.71 1405
weighted avg 0.81 0.74 0.76 1405


Random Forest Classifier classification report:
 precision recall f1-score support

 No 0.86 0.84 0.85 1053
 Yes 0.56 0.61 0.58 352

 accuracy 0.78 1405
 macro avg 0

In [10]:
X_train_df.columns.tolist()

['SeniorCitizen',
 'tenure',
 'MonthlyCharges',
 'TotalCharges',
 'gender_Female',
 'gender_Male',
 'Partner_No',
 'Partner_Yes',
 'Dependents_No',
 'Dependents_Yes',
 'PhoneService_No',
 'PhoneService_Yes',
 'MultipleLines_No',
 'MultipleLines_Yes',
 'InternetService_DSL',
 'InternetService_Fiber optic',
 'InternetService_No',
 'OnlineSecurity_No',
 'OnlineSecurity_Yes',
 'OnlineBackup_No',
 'OnlineBackup_Yes',
 'DeviceProtection_No',
 'DeviceProtection_Yes',
 'TechSupport_No',
 'TechSupport_Yes',
 'StreamingTV_No',
 'StreamingTV_Yes',
 'StreamingMovies_No',
 'StreamingMovies_Yes',
 'Contract_Month-to-month',
 'Contract_One year',
 'Contract_Two year',
 'PaperlessBilling_No',
 'PaperlessBilling_Yes',
 'PaymentMethod_Bank transfer (automatic)',
 'PaymentMethod_Credit card (automatic)',
 'PaymentMethod_Electronic check',
 'PaymentMethod_Mailed check']

In [8]:
for col in X.columns:
 print(f"Column '{col}' categories: {X[col].unique()}")

Column 'gender' categories: ['Female' 'Male']
Column 'SeniorCitizen' categories: [0 1]
Column 'Partner' categories: ['Yes' 'No']
Column 'Dependents' categories: ['No' 'Yes']
Column 'tenure' categories: [ 1 34 2 45 8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
 5 46 11 70 63 43 15 60 18 66 9 3 31 50 64 56 7 42 35 48 29 65 38 68
 32 55 37 36 41 6 4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26 0
 39]
Column 'PhoneService' categories: ['No' 'Yes']
Column 'MultipleLines' categories: ['No' 'Yes']
Column 'InternetService' categories: ['DSL' 'Fiber optic' 'No']
Column 'OnlineSecurity' categories: ['No' 'Yes']
Column 'OnlineBackup' categories: ['Yes' 'No']
Column 'DeviceProtection' categories: ['No' 'Yes']
Column 'TechSupport' categories: ['No' 'Yes']
Column 'StreamingTV' categories: ['No' 'Yes']
Column 'StreamingMovies' categories: ['No' 'Yes']
Column 'Contract' categories: ['Month-to-month' 'One year' 'Two year']
Column 'PaperlessBilling' categories: ['Yes' 'No']
Column 'Payme

In [3]:
best_model_name = 'Random Forest Classifier'

best_model = models[best_model_name]

print(f'Best model: {best_model_name}')

Best model: Random Forest Classifier


In [9]:
# Calculate the class weights
class_weight = {"No": 1, "Yes": 10}

# Initialize Logistic Regression model with class weights
rf = RandomForestClassifier(class_weight=class_weight)

# Fit the model to the training data
rf.fit(X_train_resampled, y_train_resampled)

# Predict the labels of the test set
y_pred = rf.predict(X_test_df)

# Generate the classification report
report = classification_report(y_test, y_pred)
print(report)

 precision recall f1-score support

 No 0.85 0.86 0.86 1053
 Yes 0.57 0.56 0.56 352

 accuracy 0.78 1405
 macro avg 0.71 0.71 0.71 1405
weighted avg 0.78 0.78 0.78 1405



In [4]:
from joblib import dump
import os

# set the destination path to the "export" directory
destination = "."

# create a dictionary to store the objects and their filenames
models = {"numerical_imputer": numerical_imputer,
 "categorical_imputer": categorical_imputer,
 "encoder": encoder,
 "scaler": scaler,
 "Final_model": best_model}

# loop through the models and save them using joblib.dump()
for name, model in models.items():
 dump(model, os.path.join(destination, f"{name}.joblib"))


In [10]:
#!pip freeze > requirements.txt

In [6]:
!pipreqs . --force

INFO: Successfully saved requirements file in .\requirements.txt


In [11]:
#!pip list --format=freeze > requirements.txt

In [5]:
for name, model in models.items():
 dump(model, os.path.join(destination, f"{name}.joblib"))
 if os.path.exists(os.path.join(destination, f"{name}.joblib")):
 print(f"{name} saved successfully!")
 else:
 print(f"{name} failed to save.")


numerical_imputer saved successfully!
categorical_imputer saved successfully!
encoder saved successfully!
scaler saved successfully!
Final_model saved successfully!


In [90]:
destination = "."
numerical_imputer = joblib.load(os.path.join(destination, "numerical_imputer.joblib"))
categorical_imputer = joblib.load(os.path.join(destination, "categorical_imputer.joblib"))
encoder = joblib.load(os.path.join(destination, "encoder.joblib"))
scaler = joblib.load(os.path.join(destination, "scaler.joblib"))
best_model = joblib.load(os.path.join(destination, "Final_model.joblib"))

loaded_models = {"numerical_imputer": numerical_imputer,
 "categorical_imputer": categorical_imputer,
 "encoder": encoder,
 "scaler": scaler,
 "Final_model": best_model}
