File size: 10,752 Bytes
bc9062f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
# -*- coding: utf-8 -*-
"""DIABETES ANALYTICS.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1AJ9I7KBbC--mXyoABUgQPZHMZoRRnpo4

DIABETES ANALYSIS AND PREDICTION

Cleaning the data
"""

import pandas as pd
import numpy as np

# Load the dataset; diabetes.csv is read from the current working directory.
data = pd.read_csv("diabetes.csv")
# Outside a notebook a bare expression displays nothing, so every inspection
# step below is printed explicitly.
print(data.head())

# checking for the number of records
print(data.shape)

"""This shows that the number of patients were 768"""

# checking the data types of the variables
print(data.dtypes)

# checking for duplicates
duplicate_mask = data.duplicated()
data_duplicate = data[duplicate_mask]
# finding the sum of the duplicates (vectorised instead of Python-level sum())
print(duplicate_mask.sum())
# checking the stats of the data
print(data.describe())

"""a value of 0 indicates that there is no duplicated data

Checking the Relation between Skin Thickness and Insulin
"""

import matplotlib.pyplot as plt

# Scatter Insulin (y) against SkinThickness (x), then switch the x-axis of
# the current axes to a logarithmic scale before displaying the figure.
plt.scatter(x=data['SkinThickness'], y=data['Insulin'])
plt.gca().set_xscale('log')
plt.show()

"""Checking for zeroes"""

import pandas as pd


# Tally the entries that are exactly 0 in each of the three columns.
zeros_in_blood_pressure = data['BloodPressure'].eq(0).sum()
zeros_in_insulin = data['Insulin'].eq(0).sum()
zeros_in_bmi = data['BMI'].eq(0).sum()

# Report the tallies in the order BloodPressure, Insulin, BMI.
print(f"Number of zeros in 'BloodPressure': {zeros_in_blood_pressure}")
print(f"Number of zeros in 'Insulin': {zeros_in_insulin}")
print(f"Number of zeros in 'BMI': {zeros_in_bmi}")

"""These values of zero are impractical since the values for BloodPressure, Insulin and BMI pf an individual CAN NOT be zero

First Model for Predicting the Values where the BloodPressure, Insulin and BMI are 0
"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


target_columns = ['BloodPressure', 'Insulin', 'BMI']
X = data.drop(columns=target_columns)  # Features
y = data[target_columns]  # Target variables; a 0 is treated as a missing reading

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create separate Random Forest models for each column
models = {}
for column in y.columns:
    # Rows whose target is 0 hold a placeholder rather than a measurement, so
    # they are left out of both fitting and scoring; otherwise the model is
    # taught to predict 0 and the MSE is measured against invalid labels.
    train_known = y_train[column] != 0
    test_known = y_test[column] != 0

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train[train_known], y_train.loc[train_known, column])
    y_pred = model.predict(X_test[test_known])

    # Evaluate the model's performance using Mean Squared Error (MSE)
    mse = mean_squared_error(y_test.loc[test_known, column], y_pred)
    print(f"Mean Squared Error for {column}: {mse}")

    # Keep the fitted estimator so the dictionary really holds one model per column.
    models[column] = model

"""The MSE of Insukin proved the model was not doing well so I tried to improve the model to predict more closer values for the Insulin"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error



# A 0 in 'Insulin' is treated as a missing reading, so only rows with a
# recorded value are usable as regression targets for fitting and scoring.
insulin_known = data[data['Insulin'] != 0]
X = insulin_known.drop(columns=['Insulin'])  # Features
y = insulin_known['Insulin']  # Target variable (recorded values only)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Gradient Boosting model for 'Insulin' prediction
model = GradientBoostingRegressor(n_estimators=100, random_state=42)  # You can adjust hyperparameters as needed
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model's performance using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error for Insulin: {mse}")

"""Trying to improve the Insulin predictions"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error


# All three columns are modelled below, so all three must be targets and none
# may remain among the features. Previously 'Insulin' had a model but was
# missing from y, which raised KeyError on y_train['Insulin'].
target_columns = ['BloodPressure', 'Insulin', 'BMI']
X = data.drop(columns=target_columns)  # Features
y = data[target_columns]  # Target variables; a 0 is treated as a missing reading

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a dictionary to store models for each column
models = {
    'BloodPressure': LinearRegression(),
    'Insulin': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'BMI': GradientBoostingRegressor(n_estimators=100, random_state=42),
}

# Train and evaluate each model
for column, model in models.items():
    # Skip rows whose target is the 0 placeholder: they carry no measurement
    # to learn from or to score against.
    train_known = y_train[column] != 0
    test_known = y_test[column] != 0

    model.fit(X_train[train_known], y_train.loc[train_known, column])
    y_pred = model.predict(X_test[test_known])

    # Evaluate the model's performance using Mean Squared Error (MSE)
    mse = mean_squared_error(y_test.loc[test_known, column], y_pred)
    print(f"Mean Squared Error for {column}: {mse}")

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Tuned Gradient Boosting model for 'Insulin' only.
# A 0 in 'Insulin' is treated as a missing reading, so only rows with a
# recorded value are usable as regression targets for fitting and scoring.
insulin_known = data[data['Insulin'] != 0]
X = insulin_known.drop(columns=['Insulin'])  # Features
y = insulin_known['Insulin']  # Target variable (recorded values only)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Gradient Boosting model for 'Insulin' prediction with hyperparameter tuning
model = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.20,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model's performance using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error for Insulin: {mse}")

"""The High MSE led me to use the median to fill those columns where the Insulin levels were zero because the mean and median of the distribution was almost equal which meant that the disstribution was symmestrical and that it was a uniform distribution"""

import pandas as pd


# The zeros are the placeholders being replaced, so they are excluded when
# computing the median; including them would drag the fill value towards 0.
median_insulin = data.loc[data['Insulin'] != 0, 'Insulin'].median()

# Replace zeros in the 'Insulin' column with the median value. The result is
# assigned back to the column: calling replace(..., inplace=True) on a column
# selection does not update `data` when pandas copy-on-write is active.
data['Insulin'] = data['Insulin'].replace(0, median_insulin)

"""Building the model for BloodPressure and BMI since the models performed well"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error


X = data.drop(columns=['BloodPressure', 'BMI'])  # Features
y = data[['BloodPressure', 'BMI']]  # Target variables; a 0 is treated as a missing reading

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a dictionary to store models for each column
models = {
    'BloodPressure': LinearRegression(),
    'BMI': GradientBoostingRegressor(n_estimators=100, random_state=42),
}

# Train and evaluate each model
for column, model in models.items():
    # These models are meant to replace the 0 placeholders, so the
    # placeholders themselves must not be used as training or scoring targets.
    train_known = y_train[column] != 0
    test_known = y_test[column] != 0

    model.fit(X_train[train_known], y_train.loc[train_known, column])
    y_pred = model.predict(X_test[test_known])

    # Evaluate the model's performance using Mean Squared Error (MSE)
    mse = mean_squared_error(y_test.loc[test_known, column], y_pred)
    print(f"Mean Squared Error for {column}: {mse}")

"""Filling the zero columns with the predicted values

"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Checking for zeros in the 'BloodPressure' and 'BMI' columns
zero_bp_indices = data[data['BloodPressure'] == 0].index
zero_bmi_indices = data[data['BMI'] == 0].index

# Create a copy of the dataset to work with
filled_data = data.copy()

# Feature frame with the same columns the models in `models` were fitted on.
# Neither target column is part of it, so filling one column cannot change
# the inputs used for the other.
imputation_features = filled_data.drop(columns=['BloodPressure', 'BMI'])

# Predict all missing rows of a column in one call. Passing a DataFrame slice
# keeps the feature names the estimators were fitted with.
for column, zero_indices in (('BloodPressure', zero_bp_indices), ('BMI', zero_bmi_indices)):
    if zero_indices.empty:
        # Nothing to fill; predict() rejects an empty input.
        continue
    # Predictions are floats; cast first so they are not written into an
    # integer column.
    filled_data[column] = filled_data[column].astype(float)
    filled_data.loc[zero_indices, column] = models[column].predict(
        imputation_features.loc[zero_indices]
    )

"""Rechecking for zeroes"""

import pandas as pd


# Tally the entries that are still exactly 0 in the filled copy.
zeros_in_blood_pressure = filled_data['BloodPressure'].eq(0).sum()
zeros_in_insulin = filled_data['Insulin'].eq(0).sum()
zeros_in_bmi = filled_data['BMI'].eq(0).sum()

# Report the tallies in the order BloodPressure, Insulin, BMI.
print(f"Number of zeros in 'BloodPressure': {zeros_in_blood_pressure}")
print(f"Number of zeros in 'Insulin': {zeros_in_insulin}")
print(f"Number of zeros in 'BMI': {zeros_in_bmi}")

"""Building the model for prediction and passing the parameters in the form of a function to check whether an individual has diabetis or not"""

from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# 'Outcome' holds the labels (0 for not diabetic, 1 for diabetic).
# Train on `filled_data`, the copy in which the zero BloodPressure/BMI entries
# were replaced by predictions; `data` still contains those zeros, so fitting
# on it would discard the imputation done above.
X = filled_data.drop(columns=['Outcome'])  # Features
y = filled_data['Outcome']  # Target variable

# Create and train a RandomForestClassifier
classification_model = RandomForestClassifier(n_estimators=100, random_state=42)
classification_model.fit(X, y)

# Function to predict diabetes based on user input
def predict_diabetes():
    """Prompt for one individual's measurements and classify them.

    Reads eight numeric values from standard input, builds a one-row
    DataFrame from them and passes it to the module-level
    ``classification_model``.

    Returns:
        str: A sentence stating whether the individual is predicted to have
        diabetes.

    Raises:
        ValueError: If an entered value cannot be converted to ``float``.
    """
    # Prompt order doubles as the column order of the one-row frame.
    feature_names = (
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI',
        'DiabetesPedigreeFunction',
        'Age',
    )

    print("Enter the following values for prediction:")
    answers = {}
    for feature in feature_names:
        answers[feature] = [float(input(f"{feature}: "))]

    individual_data = pd.DataFrame(answers)

    # Use the classification model to predict diabetes
    predicted_class = classification_model.predict(individual_data)

    # predict() returns an array with one label per row; take the single
    # label explicitly instead of relying on the truthiness of an array
    # comparison.
    if predicted_class[0] == 1:
        return "The individual is predicted to have diabetes."
    return "The individual is predicted not to have diabetes."


#saving the model
# The classifier is written to disk before the interactive prompt runs, so an
# invalid entry or an interrupted session cannot end the script with the
# trained model unsaved.
import joblib

filename = 'diabetes_prediction_model.joblib'
joblib.dump(classification_model, filename)

print(f"Model saved as {filename}")


# Run one interactive prediction and show the verdict.
result = predict_diabetes()
print(result)