anirudhabokil's picture
Adding train.py and calling it from app.py
0650863 verified
raw
history blame
1.53 kB
import joblib
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score, mean_squared_error
import numpy as np
import pandas as pd
# Training script: fits a preprocessing + linear-regression pipeline on the
# insurance dataset, prints hold-out metrics, and serializes the fitted
# pipeline to disk. Everything runs at module level, so executing or importing
# this file performs the download, the training, and the file write.

# Load the raw CSV from the Hugging Face Hub dataset repository.
# NOTE(review): the hf:// scheme is presumably resolved via fsspec and the
# huggingface_hub package -- confirm it is installed in the runtime image.
df_original = pd.read_csv("hf://datasets/anirudhabokil/insurance_data/insurance_data.csv")
target = 'charges'

# Remove the 'index' column and assign the result to a new DataFrame, df.
df = df_original.drop(columns=['index'])

# Feature columns, grouped by the preprocessing each group receives below.
numerical_features = ['age', 'bmi', 'children']
categorical_features = ['sex', 'smoker', 'region']

X = df[numerical_features + categorical_features]
y = df[target]

print('Splitting data')
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = make_column_transformer(
    # Standardize the numeric columns.
    (StandardScaler(), numerical_features),
    # One-hot encode the categorical columns. handle_unknown='ignore' makes a
    # category that was not seen during fit encode as all zeros instead of
    # raising, so the serialized pipeline does not crash on unexpected input
    # at prediction time. Encoding of the training data is unaffected.
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
)

# NOTE: despite the variable name, this estimator is an ordinary
# LinearRegression (the target is continuous), not a logistic regression.
# The name is kept so any code referring to this module attribute still works.
model_logistic_regression = LinearRegression(n_jobs=-1)

print('Estimating model pipelline')
# Chain preprocessing and the regressor so both are fitted together and
# saved as a single object.
model_pipeline = make_pipeline(preprocessor, model_logistic_regression)
model_pipeline.fit(Xtrain, ytrain)

# Evaluate on the held-out rows.
prediction = model_pipeline.predict(Xtest)

print('Logging metrics')
print(f"R-squared: {r2_score(ytest, prediction)}")
print(f"RMSE: {root_mean_squared_error(ytest, prediction)}")

print("Serializing model")
# Single source of truth for the output location (relative to the current
# working directory); previously this variable was assigned but unused and
# the literal was repeated in the dump call.
saved_mode_path = 'model.joblib'
joblib.dump(model_pipeline, saved_mode_path)