# -*- coding: utf-8 -*-
"""DIABETES ANALYTICS.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1AJ9I7KBbC--mXyoABUgQPZHMZoRRnpo4

DIABETES ANALYSIS AND PREDICTION

Cleaning the data
"""

import pandas as pd
import numpy as np

data = pd.read_csv("diabetes.csv")
data.head()

# checking the number of records
data.shape

"""This shows that the number of patients was 768."""

data.dtypes  # checking the data types of the variables

# checking for duplicates
data_duplicate = data[data.duplicated()]

# finding the number of duplicates
data.duplicated().sum()

# checking the stats of the data
data.describe()

"""A value of 0 indicates that there is no duplicated data.

Checking the relation between SkinThickness and Insulin
"""

import matplotlib.pyplot as plt

plt.scatter(data['SkinThickness'], data['Insulin'])
plt.xscale('log')
plt.show()

"""Checking for zeroes"""

import pandas as pd

# Check for zeros in the 'BloodPressure' column
zeros_in_blood_pressure = (data['BloodPressure'] == 0).sum()
print(f"Number of zeros in 'BloodPressure': {zeros_in_blood_pressure}")

# Check for zeros in the 'Insulin' column
zeros_in_insulin = (data['Insulin'] == 0).sum()
print(f"Number of zeros in 'Insulin': {zeros_in_insulin}")

# Check for zeros in the 'BMI' column
zeros_in_bmi = (data['BMI'] == 0).sum()
print(f"Number of zeros in 'BMI': {zeros_in_bmi}")

"""These zero values are impractical, since an individual's BloodPressure, Insulin and BMI CANNOT be zero.

First model for predicting the values where BloodPressure, Insulin and BMI are 0
"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

X = data.drop(columns=['BloodPressure', 'Insulin', 'BMI'])  # Features
y = data[['BloodPressure', 'Insulin', 'BMI']]               # Target variables with missing values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a separate Random Forest model for each column
models = {}
for column in y.columns:
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train[column])
    models[column] = model
    y_pred = model.predict(X_test)

    # Evaluate the model's performance using Mean Squared Error (MSE)
    mse = mean_squared_error(y_test[column], y_pred)
    print(f"Mean Squared Error for {column}: {mse}")

"""The MSE for Insulin showed that the model was not doing well, so I tried to improve the model to predict closer values for Insulin."""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

X = data.drop(columns=['Insulin'])  # Features
y = data['Insulin']                 # Target variable with missing values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Gradient Boosting model for 'Insulin' prediction
model = GradientBoostingRegressor(n_estimators=100, random_state=42)  # hyperparameters can be adjusted as needed
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model's performance using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error for Insulin: {mse}")
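"""A single train/test split can give a noisy error estimate. Below is a minimal sketch of a k-fold cross-validation check on the same Insulin regressor, using scikit-learn's cross_val_score; the 5-fold setting is an illustrative choice, not a tuned one."""

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor

# 5-fold cross-validation on the Insulin regressor (X, y as defined in the cell above).
# The scorer returns negated MSE, so the sign is flipped to report positive MSE per fold.
cv_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
cv_mse = -cross_val_score(cv_model, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validated MSE for Insulin per fold: {cv_mse}")
print(f"Mean cross-validated MSE for Insulin: {cv_mse.mean()}")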
"""Trying to improve the Insulin predictions"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

X = data.drop(columns=['BloodPressure', 'Insulin', 'BMI'])  # Features
y = data[['BloodPressure', 'Insulin', 'BMI']]               # Target variables with missing values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a dictionary to store a model for each column
models = {
    'BloodPressure': LinearRegression(),
    'Insulin': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'BMI': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate each model
for column, model in models.items():
    model.fit(X_train, y_train[column])
    y_pred = model.predict(X_test)

    # Evaluate the model's performance using Mean Squared Error (MSE)
    mse = mean_squared_error(y_test[column], y_pred)
    print(f"Mean Squared Error for {column}: {mse}")

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# This cell focuses only on predicting 'Insulin'
X = data.drop(columns=['Insulin'])  # Features
y = data['Insulin']                 # Target variable with missing values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Gradient Boosting model for 'Insulin' prediction with hyperparameter tuning
model = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.20,
    max_depth=5,
    # min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model's performance using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error for Insulin: {mse}")

"""The high MSE led me to fill the rows where the Insulin level was zero with the median instead, because the mean and median of the distribution were almost equal, which suggests a roughly symmetric distribution and makes the median a reasonable fill value."""
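"""A minimal sketch to sanity-check that claim before filling: compare the mean and median of the Insulin column, both over all rows and over the nonzero rows only (the nonzero comparison is an extra check added here, since the zeros stand in for missing readings)."""

# Compare mean and median of Insulin with and without the zero placeholder rows
insulin_all = data['Insulin']
insulin_nonzero = data.loc[data['Insulin'] != 0, 'Insulin']

print(f"Insulin mean / median (all rows): {insulin_all.mean():.1f} / {insulin_all.median():.1f}")
print(f"Insulin mean / median (nonzero rows): {insulin_nonzero.mean():.1f} / {insulin_nonzero.median():.1f}")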
import pandas as pd

median_insulin = data['Insulin'].median()

# Replace zeros in the 'Insulin' column with the median value
data['Insulin'] = data['Insulin'].replace(0, median_insulin)

"""Building the models for BloodPressure and BMI, since those models performed well"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

X = data.drop(columns=['BloodPressure', 'BMI'])  # Features
y = data[['BloodPressure', 'BMI']]               # Target variables with missing values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a dictionary to store a model for each column
models = {
    'BloodPressure': LinearRegression(),
    'BMI': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate each model
for column, model in models.items():
    model.fit(X_train, y_train[column])
    y_pred = model.predict(X_test)

    # Evaluate the model's performance using Mean Squared Error (MSE)
    mse = mean_squared_error(y_test[column], y_pred)
    print(f"Mean Squared Error for {column}: {mse}")

"""Filling the zero columns with the predicted values"""

import pandas as pd

# Find the rows with zeros in the 'BloodPressure' and 'BMI' columns
zero_bp_indices = data[data['BloodPressure'] == 0].index
zero_bmi_indices = data[data['BMI'] == 0].index

# Create a copy of the dataset to work with
filled_data = data.copy()

# Iterate through the zero indices for 'BloodPressure' and 'BMI'
for index in zero_bp_indices:
    # Predict 'BloodPressure' using the trained model
    prediction_bp = models['BloodPressure'].predict(
        filled_data.drop(columns=['BloodPressure', 'BMI']).loc[[index]]
    )
    filled_data.at[index, 'BloodPressure'] = prediction_bp[0]

for index in zero_bmi_indices:
    # Predict 'BMI' using the trained model
    prediction_bmi = models['BMI'].predict(
        filled_data.drop(columns=['BloodPressure', 'BMI']).loc[[index]]
    )
    filled_data.at[index, 'BMI'] = prediction_bmi[0]

"""Rechecking for zeroes"""

import pandas as pd

# Check for zeros in the 'BloodPressure' column
zeros_in_blood_pressure = (filled_data['BloodPressure'] == 0).sum()
print(f"Number of zeros in 'BloodPressure': {zeros_in_blood_pressure}")

# Check for zeros in the 'Insulin' column
zeros_in_insulin = (filled_data['Insulin'] == 0).sum()
print(f"Number of zeros in 'Insulin': {zeros_in_insulin}")

# Check for zeros in the 'BMI' column
zeros_in_bmi = (filled_data['BMI'] == 0).sum()
print(f"Number of zeros in 'BMI': {zeros_in_bmi}")

"""Building the model for prediction, with a function that takes the parameters and checks whether an individual has diabetes or not"""

from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# The cleaned dataset has labels in 'Outcome' (0 for not diabetic, 1 for diabetic)
X = filled_data.drop(columns=['Outcome'])  # Features
y = filled_data['Outcome']                 # Target variable

# Create and train a RandomForestClassifier
classification_model = RandomForestClassifier(n_estimators=100, random_state=42)
classification_model.fit(X, y)

# Function to predict diabetes based on user input
def predict_diabetes():
    print("Enter the following values for prediction:")
    Pregnancies = float(input("Pregnancies: "))
    Glucose = float(input("Glucose: "))
    BloodPressure = float(input("BloodPressure: "))
    SkinThickness = float(input("SkinThickness: "))
    Insulin = float(input("Insulin: "))
    BMI = float(input("BMI: "))
    DiabetesPedigreeFunction = float(input("DiabetesPedigreeFunction: "))
    Age = float(input("Age: "))

    individual_data = pd.DataFrame({
        'Pregnancies': [Pregnancies],
        'Glucose': [Glucose],
        'BloodPressure': [BloodPressure],
        'SkinThickness': [SkinThickness],
        'Insulin': [Insulin],
        'BMI': [BMI],
        'DiabetesPedigreeFunction': [DiabetesPedigreeFunction],
        'Age': [Age]
    })

    # Use the classification model to predict diabetes
    predicted_class = classification_model.predict(individual_data)

    if predicted_class[0] == 1:
        return "The individual is predicted to have diabetes."
    else:
        return "The individual is predicted not to have diabetes."

result = predict_diabetes()
print(result)

# saving the model
import joblib

filename = 'diabetes_prediction_model.joblib'
joblib.dump(classification_model, filename)
print(f"Model saved as {filename}")
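"""A minimal sketch of reusing the saved model: load it back with joblib and score one individual programmatically instead of through input(). The feature values below are made-up illustrative numbers, not records from the dataset."""

import joblib
import pandas as pd

# Load the model back from disk
loaded_model = joblib.load('diabetes_prediction_model.joblib')

# One hypothetical individual, with columns in the same order as the training features
example = pd.DataFrame({
    'Pregnancies': [2],
    'Glucose': [120],
    'BloodPressure': [70],
    'SkinThickness': [25],
    'Insulin': [100],
    'BMI': [30.5],
    'DiabetesPedigreeFunction': [0.4],
    'Age': [35]
})

predicted_class = loaded_model.predict(example)[0]
probability_of_diabetes = loaded_model.predict_proba(example)[0][1]
print(f"Predicted class: {predicted_class}, probability of diabetes: {probability_of_diabetes:.2f}")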