import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import gradio as gr

# Load the dataset, keeping only the columns needed.
selected_columns = [
    'duration_mo',
    'mos_ethnicity',
    'complainant_ethnicity',
    'is_force',
    'is_abuse_of_authority',
    'is_discourtesy',
    'is_offensive_language',
    'outcome_description',
]
# `usecols` ignores the order of the list and keeps the file's column order,
# so reindex explicitly: the prediction callback below relies on this order.
df = pd.read_csv('my_dataset_logistic.csv', usecols=selected_columns)[selected_columns]
print(df.columns)
print(df.head())
print(df.describe())
print(df.isnull().sum())

# Separate the features from the target column.
X = df.drop('outcome_description', axis=1)
y = df['outcome_description']
X = X.fillna(0)

# Column order the scaler and model are fitted on; reused at prediction time.
feature_columns = list(X.columns)

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (fit on the training split only).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model.
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions and evaluate the model.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Classification report, confusion matrix, correlation plots and standard
# deviation of all the variables.
print(classification_report(y_test, y_pred))

# Confusion matrix. The same sorted label list drives both the matrix and the
# tick labels so that rows/columns and their captions cannot get out of step
# (unique() alone returns order of appearance, not sorted order).
class_labels = sorted(y.unique())
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Correlation matrix.
correlation_matrix = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix')
plt.show()

# Bar chart of each feature's correlation with the target.
target_correlations = correlation_matrix['outcome_description'].sort_values(ascending=False)
plt.figure(figsize=(10, 6))
target_correlations.drop('outcome_description').plot(kind='bar', color='blue')
plt.title('Correlations with Target Variable')
plt.xlabel('Features')
plt.ylabel('Correlation')
plt.show()

# Standard deviation.
std_dev = df.std()
print('\nStandard deviation')
print(std_dev)

# Gradio implementation.
# Available options for the ethnicities.
# NOTE(review): the callback encodes an ethnicity as its index in these lists;
# this assumes the CSV uses the same integer coding - confirm against the data.
mos_ethnicity_options = ["Hispanic", "White", "Black", "Asian", "American Indian", "Other Race", "Refused", "Unknown"]
complainant_ethnicity_options = ["Hispanic", "White", "Black", "Asian", "American Indian", "Other Race", "Refused", "Unknown"]


def predict_outcome_duration(mos_ethnicity, complainant_ethnicity, is_force, is_abuse_of_authority, is_discourtesy, is_offensive_language, duration_mo):
    """Predict the complaint outcome for one set of UI inputs.

    Args:
        mos_ethnicity: One of ``mos_ethnicity_options``.
        complainant_ethnicity: One of ``complainant_ethnicity_options``.
        is_force: Checkbox value, converted with ``int()``.
        is_abuse_of_authority: Checkbox value, converted with ``int()``.
        is_discourtesy: Checkbox value, converted with ``int()``.
        is_offensive_language: Checkbox value, converted with ``int()``.
        duration_mo: Duration in months, passed to the model as given.

    Returns:
        ``"Arrest"`` when the model predicts ``1``, otherwise ``"No Arrest"``.
        A prompt string when an ethnicity has not been selected, or
        ``"Error: ..."`` if anything fails while predicting.
    """
    # Nothing selected yet: report that instead of a cryptic "None is not in list".
    if mos_ethnicity is None or complainant_ethnicity is None:
        return "Please select both ethnicities."
    try:
        # Build a one-row frame keyed by feature name so that the values line
        # up with the columns the scaler and model were fitted on.
        input_data = pd.DataFrame(
            [
                {
                    'duration_mo': duration_mo,
                    # Ethnicity strings are encoded as their index in the option lists.
                    'mos_ethnicity': mos_ethnicity_options.index(mos_ethnicity),
                    'complainant_ethnicity': complainant_ethnicity_options.index(complainant_ethnicity),
                    # Checkbox values are converted to int.
                    'is_force': int(is_force),
                    'is_abuse_of_authority': int(is_abuse_of_authority),
                    'is_discourtesy': int(is_discourtesy),
                    'is_offensive_language': int(is_offensive_language),
                }
            ],
            columns=feature_columns,
        )
        input_scaled = scaler.transform(input_data)
        prediction = model.predict(input_scaled)[0]
        # NOTE(review): assumes outcome_description == 1 means "Arrest" - confirm.
        return "Arrest" if prediction == 1 else "No Arrest"
    except Exception as e:
        # UI callback boundary: show the failure to the user instead of crashing.
        return f"Error: {str(e)}"


# Create the Gradio interface: dropdowns for the two ethnicities, checkboxes
# for the allegation types and a slider for the duration in months.
mos_ethnicity_dropdown = gr.Dropdown(choices=mos_ethnicity_options, label="Defendant Ethnicity")
complainant_ethnicity_dropdown = gr.Dropdown(choices=complainant_ethnicity_options, label="Complainant Ethnicity")
is_force_checkbox = gr.Checkbox()
is_abuse_of_authority_checkbox = gr.Checkbox()
is_discourtesy_checkbox = gr.Checkbox()
is_offensive_language_checkbox = gr.Checkbox()
duration_mo_slider = gr.Slider(minimum=0, maximum=20, label="Duration in months")

iface = gr.Interface(
    fn=predict_outcome_duration,
    # Inputs are passed positionally, so this order must match the parameter
    # order of predict_outcome_duration (mos_ethnicity comes first).
    inputs=[
        mos_ethnicity_dropdown,
        complainant_ethnicity_dropdown,
        is_force_checkbox,
        is_abuse_of_authority_checkbox,
        is_discourtesy_checkbox,
        is_offensive_language_checkbox,
        duration_mo_slider,
    ],
    outputs="text",
    live=True,
    title="Complaint Outcome Prediction",
)

# Launch the Gradio interface.
iface.launch(share=True)