# -*- coding: utf-8 -*-
"""DIABETES ANALYTICS.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1AJ9I7KBbC--mXyoABUgQPZHMZoRRnpo4

DIABETES ANALYSIS AND PREDICTION

Cleaning the data
"""

import pandas as pd
import numpy as np

data = pd.read_csv("diabetes.csv")
data.head()

# checking the number of records
data.shape

"""This shows that the number of patients was 768."""

data.dtypes  # checking the data types of the variables

# checking for duplicates
data_duplicate = data[data.duplicated()]

# finding the number of duplicates
data.duplicated().sum()

# checking the stats of the data
data.describe()

"""A value of 0 indicates that there is no duplicated data.

Checking the relation between SkinThickness and Insulin
"""

import matplotlib.pyplot as plt

plt.scatter(data['SkinThickness'], data['Insulin'])
plt.xscale('log')
plt.show()

"""Checking for zeroes"""

import pandas as pd

# Check for zeros in the 'BloodPressure' column
zeros_in_blood_pressure = (data['BloodPressure'] == 0).sum()
print(f"Number of zeros in 'BloodPressure': {zeros_in_blood_pressure}")

# Check for zeros in the 'Insulin' column
zeros_in_insulin = (data['Insulin'] == 0).sum()
print(f"Number of zeros in 'Insulin': {zeros_in_insulin}")

# Check for zeros in the 'BMI' column
zeros_in_bmi = (data['BMI'] == 0).sum()
print(f"Number of zeros in 'BMI': {zeros_in_bmi}")

"""These zero values are impractical, since an individual's BloodPressure, Insulin and BMI CANNOT be zero.

First model for predicting the values where BloodPressure, Insulin and BMI are 0
"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

X = data.drop(columns=['BloodPressure', 'Insulin', 'BMI'])  # Features
y = data[['BloodPressure', 'Insulin', 'BMI']]               # Target variables with missing values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a separate Random Forest model for each column
models = {}
for column in y.columns:
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train[column])
    models[column] = model
    y_pred = model.predict(X_test)

    # Evaluate the model's performance using Mean Squared Error (MSE)
    mse = mean_squared_error(y_test[column], y_pred)
    print(f"Mean Squared Error for {column}: {mse}")

"""The MSE for Insulin showed that the model was not doing well, so I tried to improve the model to predict closer values for Insulin."""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

X = data.drop(columns=['Insulin'])  # Features
y = data['Insulin']                 # Target variable with missing values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Gradient Boosting model for 'Insulin' prediction
model = GradientBoostingRegressor(n_estimators=100, random_state=42)  # hyperparameters can be adjusted as needed
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model's performance using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error for Insulin: {mse}")
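"""A single train/test split can give a noisy error estimate. Below is a minimal sketch of a k-fold cross-validation check on the same Insulin regressor, using scikit-learn's cross_val_score; the 5-fold setting is an illustrative choice, not a tuned one."""

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor

# 5-fold cross-validation on the Insulin regressor (X, y as defined in the cell above).
# The scorer returns negated MSE, so the sign is flipped to report positive MSE per fold.
cv_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
cv_mse = -cross_val_score(cv_model, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validated MSE for Insulin per fold: {cv_mse}")
print(f"Mean cross-validated MSE for Insulin: {cv_mse.mean()}")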
"""Trying to improve the Insulin predictions"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

X = data.drop(columns=['BloodPressure', 'Insulin', 'BMI'])  # Features
y = data[['BloodPressure', 'Insulin', 'BMI']]               # Target variables with missing values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a dictionary to store a model for each column
models = {
    'BloodPressure': LinearRegression(),
    'Insulin': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'BMI': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate each model
for column, model in models.items():
    model.fit(X_train, y_train[column])
    y_pred = model.predict(X_test)

    # Evaluate the model's performance using Mean Squared Error (MSE)
    mse = mean_squared_error(y_test[column], y_pred)
    print(f"Mean Squared Error for {column}: {mse}")

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# This cell focuses only on predicting 'Insulin'
X = data.drop(columns=['Insulin'])  # Features
y = data['Insulin']                 # Target variable with missing values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Gradient Boosting model for 'Insulin' prediction with hyperparameter tuning
model = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.20,
    max_depth=5,
    # min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model's performance using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error for Insulin: {mse}")

"""The high MSE led me to fill the rows where the Insulin level was zero with the median instead, because the mean and median of the distribution were almost equal, which suggests a roughly symmetric distribution and makes the median a reasonable fill value."""
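"""A minimal sketch to sanity-check that claim before filling: compare the mean and median of the Insulin column, both over all rows and over the nonzero rows only (the nonzero comparison is an extra check added here, since the zeros stand in for missing readings)."""

# Compare mean and median of Insulin with and without the zero placeholder rows
insulin_all = data['Insulin']
insulin_nonzero = data.loc[data['Insulin'] != 0, 'Insulin']

print(f"Insulin mean / median (all rows): {insulin_all.mean():.1f} / {insulin_all.median():.1f}")
print(f"Insulin mean / median (nonzero rows): {insulin_nonzero.mean():.1f} / {insulin_nonzero.median():.1f}")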
import pandas as pd

median_insulin = data['Insulin'].median()

# Replace zeros in the 'Insulin' column with the median value
data['Insulin'] = data['Insulin'].replace(0, median_insulin)

"""Building the models for BloodPressure and BMI, since those models performed well"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

X = data.drop(columns=['BloodPressure', 'BMI'])  # Features
y = data[['BloodPressure', 'BMI']]               # Target variables with missing values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a dictionary to store a model for each column
models = {
    'BloodPressure': LinearRegression(),
    'BMI': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate each model
for column, model in models.items():
    model.fit(X_train, y_train[column])
    y_pred = model.predict(X_test)

    # Evaluate the model's performance using Mean Squared Error (MSE)
    mse = mean_squared_error(y_test[column], y_pred)
    print(f"Mean Squared Error for {column}: {mse}")

"""Filling the zero columns with the predicted values"""

import pandas as pd

# Find the rows with zeros in the 'BloodPressure' and 'BMI' columns
zero_bp_indices = data[data['BloodPressure'] == 0].index
zero_bmi_indices = data[data['BMI'] == 0].index

# Create a copy of the dataset to work with
filled_data = data.copy()

# Iterate through the zero indices for 'BloodPressure' and 'BMI'
for index in zero_bp_indices:
    # Predict 'BloodPressure' using the trained model
    prediction_bp = models['BloodPressure'].predict(
        filled_data.drop(columns=['BloodPressure', 'BMI']).loc[[index]]
    )
    filled_data.at[index, 'BloodPressure'] = prediction_bp[0]

for index in zero_bmi_indices:
    # Predict 'BMI' using the trained model
    prediction_bmi = models['BMI'].predict(
        filled_data.drop(columns=['BloodPressure', 'BMI']).loc[[index]]
    )
    filled_data.at[index, 'BMI'] = prediction_bmi[0]

"""Rechecking for zeroes"""

import pandas as pd

# Check for zeros in the 'BloodPressure' column
zeros_in_blood_pressure = (filled_data['BloodPressure'] == 0).sum()
print(f"Number of zeros in 'BloodPressure': {zeros_in_blood_pressure}")

# Check for zeros in the 'Insulin' column
zeros_in_insulin = (filled_data['Insulin'] == 0).sum()
print(f"Number of zeros in 'Insulin': {zeros_in_insulin}")

# Check for zeros in the 'BMI' column
zeros_in_bmi = (filled_data['BMI'] == 0).sum()
print(f"Number of zeros in 'BMI': {zeros_in_bmi}")

"""Building the model for prediction, with a function that takes the parameters and checks whether an individual has diabetes or not"""

from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# The cleaned dataset has labels in 'Outcome' (0 for not diabetic, 1 for diabetic)
X = filled_data.drop(columns=['Outcome'])  # Features
y = filled_data['Outcome']                 # Target variable

# Create and train a RandomForestClassifier
classification_model = RandomForestClassifier(n_estimators=100, random_state=42)
classification_model.fit(X, y)

# Function to predict diabetes based on user input
def predict_diabetes():
    print("Enter the following values for prediction:")
    Pregnancies = float(input("Pregnancies: "))
    Glucose = float(input("Glucose: "))
    BloodPressure = float(input("BloodPressure: "))
    SkinThickness = float(input("SkinThickness: "))
    Insulin = float(input("Insulin: "))
    BMI = float(input("BMI: "))
    DiabetesPedigreeFunction = float(input("DiabetesPedigreeFunction: "))
    Age = float(input("Age: "))

    individual_data = pd.DataFrame({
        'Pregnancies': [Pregnancies],
        'Glucose': [Glucose],
        'BloodPressure': [BloodPressure],
        'SkinThickness': [SkinThickness],
        'Insulin': [Insulin],
        'BMI': [BMI],
        'DiabetesPedigreeFunction': [DiabetesPedigreeFunction],
        'Age': [Age]
    })

    # Use the classification model to predict diabetes
    predicted_class = classification_model.predict(individual_data)

    if predicted_class[0] == 1:
        return "The individual is predicted to have diabetes."
    else:
        return "The individual is predicted not to have diabetes."

result = predict_diabetes()
print(result)

# saving the model
import joblib

filename = 'diabetes_prediction_model.joblib'
joblib.dump(classification_model, filename)
print(f"Model saved as {filename}")
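"""A minimal sketch of reusing the saved model: load it back with joblib and score one individual programmatically instead of through input(). The feature values below are made-up illustrative numbers, not records from the dataset."""

import joblib
import pandas as pd

# Load the model back from disk
loaded_model = joblib.load('diabetes_prediction_model.joblib')

# One hypothetical individual, with columns in the same order as the training features
example = pd.DataFrame({
    'Pregnancies': [2],
    'Glucose': [120],
    'BloodPressure': [70],
    'SkinThickness': [25],
    'Insulin': [100],
    'BMI': [30.5],
    'DiabetesPedigreeFunction': [0.4],
    'Age': [35]
})

predicted_class = loaded_model.predict(example)[0]
probability_of_diabetes = loaded_model.predict_proba(example)[0][1]
print(f"Predicted class: {predicted_class}, probability of diabetes: {probability_of_diabetes:.2f}")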