# -*- coding: utf-8 -*-
"""DIABETES ANALYTICS.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1AJ9I7KBbC--mXyoABUgQPZHMZoRRnpo4
DIABETES ANALYSIS AND PREDICTION
Cleaning the data
"""
import pandas as pd
import numpy as np
data = pd.read_csv("diabetes.csv")
data.head()
# checking for the number of records
data.shape
"""This shows that the number of patients were 768"""
# checking the data types of the variables
data.dtypes
# checking for duplicates
data_duplicate = data[data.duplicated()]
# counting the duplicates
data.duplicated().sum()
# checking the stats of the data
data.describe()
"""a value of 0 indicates that there is no duplicated data
Checking the Relation between Skin Thickness and Insulin
"""
import matplotlib.pyplot as plt
plt.scatter(data['SkinThickness'], data['Insulin'])
plt.xscale('log')
plt.show()
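# As a quick numeric complement to the scatter plot above, the sketch below computes
# the Pearson correlation between SkinThickness and Insulin, treating the zero
# placeholder values as missing (an assumption made for this check only).
nonzero = data[(data['SkinThickness'] > 0) & (data['Insulin'] > 0)]
print("Pearson correlation (non-zero rows):", nonzero['SkinThickness'].corr(nonzero['Insulin']))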
"""Checking for zeroes"""
import pandas as pd
# Check for zeros in the 'BloodPressure' column
zeros_in_blood_pressure = (data['BloodPressure'] == 0).sum()
print(f"Number of zeros in 'BloodPressure': {zeros_in_blood_pressure}")
# Check for zeros in the 'Insulin' column
zeros_in_insulin = (data['Insulin'] == 0).sum()
print(f"Number of zeros in 'Insulin': {zeros_in_insulin}")
# Check for zeros in the 'BMI' column
zeros_in_bmi = (data['BMI'] == 0).sum()
print(f"Number of zeros in 'BMI': {zeros_in_bmi}")
"""These values of zero are impractical since the values for BloodPressure, Insulin and BMI pf an individual CAN NOT be zero
First Model for Predicting the Values where the BloodPressure, Insulin and BMI are 0
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
X = data.drop(columns=['BloodPressure', 'Insulin', 'BMI']) # Features
y = data[['BloodPressure', 'Insulin', 'BMI']] # Target variables with missing values
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create separate Random Forest models for each column
models = {}
for column in y.columns:
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train[column])
    y_pred = model.predict(X_test)
    # Evaluate the model's performance using Mean Squared Error (MSE)
    mse = mean_squared_error(y_test[column], y_pred)
    print(f"Mean Squared Error for {column}: {mse}")
    # Keep the trained model for this column
    models[column] = model
"""The MSE of Insukin proved the model was not doing well so I tried to improve the model to predict more closer values for the Insulin"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
X = data.drop(columns=['Insulin']) # Features
y = data['Insulin'] # Target variable with missing values
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a Gradient Boosting model for 'Insulin' prediction
model = GradientBoostingRegressor(n_estimators=100, random_state=42) # You can adjust hyperparameters as needed
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Evaluate the model's performance using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error for Insulin: {mse}")
"""Trying to improve the Insulin predictions"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
X = data.drop(columns=['BloodPressure', 'Insulin', 'BMI'])  # Features
y = data[['BloodPressure', 'Insulin', 'BMI']]  # Target variables with missing values
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionary to store models for each column
models = {
    'BloodPressure': LinearRegression(),
    'Insulin': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'BMI': GradientBoostingRegressor(n_estimators=100, random_state=42)
}
# Train and evaluate each model
for column, model in models.items():
    model.fit(X_train, y_train[column])
    y_pred = model.predict(X_test)
    # Evaluate the model's performance using Mean Squared Error (MSE)
    mse = mean_squared_error(y_test[column], y_pred)
    print(f"Mean Squared Error for {column}: {mse}")
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
# This block focuses only on predicting 'Insulin', with hyperparameter tuning
X = data.drop(columns=['Insulin']) # Features
y = data['Insulin'] # Target variable with missing values
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a Gradient Boosting model for 'Insulin' prediction with hyperparameter tuning
model = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.20,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Evaluate the model's performance using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error for Insulin: {mse}")
"""The High MSE led me to use the median to fill those columns where the Insulin levels were zero because the mean and median of the distribution was almost equal which meant that the disstribution was symmestrical and that it was a uniform distribution"""
import pandas as pd
median_insulin = data['Insulin'].median()
# Replace zeros in the 'Insulin' column with the median value
data['Insulin'] = data['Insulin'].replace(0, median_insulin)
"""Building the model for BloodPressure and BMI since the models performed well"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
X = data.drop(columns=['BloodPressure', 'BMI']) # Features
y = data[['BloodPressure', 'BMI']] # Target variables with missing values
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionary to store models for each column
models = {
    'BloodPressure': LinearRegression(),
    'BMI': GradientBoostingRegressor(n_estimators=100, random_state=42)
}
# Train and evaluate each model
for column, model in models.items():
    model.fit(X_train, y_train[column])
    y_pred = model.predict(X_test)
    # Evaluate the model's performance using Mean Squared Error (MSE)
    mse = mean_squared_error(y_test[column], y_pred)
    print(f"Mean Squared Error for {column}: {mse}")
"""Filling the zero columns with the predicted values
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
# Checking for zeros in the 'BloodPressure' and 'BMI' columns
zero_bp_indices = data[data['BloodPressure'] == 0].index
zero_bmi_indices = data[data['BMI'] == 0].index
# Create a copy of the dataset to work with
filled_data = data.copy()
# Iterate through the zero indices for 'BloodPressure' and 'BMI'
for index in zero_bp_indices:
    # Predict 'BloodPressure' using the trained model
    features = filled_data.drop(columns=['BloodPressure', 'BMI']).loc[[index]]
    prediction_bp = models['BloodPressure'].predict(features)
    filled_data.at[index, 'BloodPressure'] = prediction_bp[0]
for index in zero_bmi_indices:
    # Predict 'BMI' using the trained model
    features = filled_data.drop(columns=['BloodPressure', 'BMI']).loc[[index]]
    prediction_bmi = models['BMI'].predict(features)
    filled_data.at[index, 'BMI'] = prediction_bmi[0]
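# A loop-free equivalent (a sketch assuming the same trained models): predict for all
# zero rows at once and write the results back with a single assignment per column.
features_bp = filled_data.drop(columns=['BloodPressure', 'BMI']).loc[zero_bp_indices]
if len(features_bp) > 0:
    filled_data.loc[zero_bp_indices, 'BloodPressure'] = models['BloodPressure'].predict(features_bp)
features_bmi = filled_data.drop(columns=['BloodPressure', 'BMI']).loc[zero_bmi_indices]
if len(features_bmi) > 0:
    filled_data.loc[zero_bmi_indices, 'BMI'] = models['BMI'].predict(features_bmi)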
"""Rechecking for zeroes"""
import pandas as pd
# Check for zeros in the 'BloodPressure' column
zeros_in_blood_pressure = (filled_data['BloodPressure'] == 0).sum()
print(f"Number of zeros in 'BloodPressure': {zeros_in_blood_pressure}")
# Check for zeros in the 'Insulin' column
zeros_in_insulin = (filled_data['Insulin'] == 0).sum()
print(f"Number of zeros in 'Insulin': {zeros_in_insulin}")
# Check for zeros in the 'BMI' column
zeros_in_bmi = (filled_data['BMI'] == 0).sum()
print(f"Number of zeros in 'BMI': {zeros_in_bmi}")
"""Building the model for prediction and passing the parameters in the form of a function to check whether an individual has diabetis or not"""
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
# Outcome labels: 0 for not diabetic, 1 for diabetic
X = filled_data.drop(columns=['Outcome'])  # Features (using the dataset with imputed zeros)
y = filled_data['Outcome']  # Target variable
# Create and train a RandomForestClassifier
classification_model = RandomForestClassifier(n_estimators=100, random_state=42)
classification_model.fit(X, y)
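# The classifier above is fit on the full dataset with no held-out evaluation; as a
# rough estimate of its accuracy, a minimal cross-validation sketch (added here,
# not part of the original notebook):
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(RandomForestClassifier(n_estimators=100, random_state=42), X, y, cv=5)
print(f"5-fold CV accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")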
# Function to predict diabetes based on user input
def predict_diabetes():
    print("Enter the following values for prediction:")
    Pregnancies = float(input("Pregnancies: "))
    Glucose = float(input("Glucose: "))
    BloodPressure = float(input("BloodPressure: "))
    SkinThickness = float(input("SkinThickness: "))
    Insulin = float(input("Insulin: "))
    BMI = float(input("BMI: "))
    DiabetesPedigreeFunction = float(input("DiabetesPedigreeFunction: "))
    Age = float(input("Age: "))
    individual_data = pd.DataFrame({
        'Pregnancies': [Pregnancies],
        'Glucose': [Glucose],
        'BloodPressure': [BloodPressure],
        'SkinThickness': [SkinThickness],
        'Insulin': [Insulin],
        'BMI': [BMI],
        'DiabetesPedigreeFunction': [DiabetesPedigreeFunction],
        'Age': [Age]
    })
    # Use the classification model to predict diabetes
    predicted_class = classification_model.predict(individual_data)
    if predicted_class[0] == 1:
        return "The individual is predicted to have diabetes."
    else:
        return "The individual is predicted not to have diabetes."
result = predict_diabetes()
print(result)
#saving the model
import joblib
filename = 'diabetes_prediction_model.joblib'
joblib.dump(classification_model, filename)
print(f"Model saved as {filename}")