import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import gradio as gr

# Load the dataset, keeping only the columns needed for the model.
# Reindexing with selected_columns fixes the column order, since read_csv
# returns usecols in file order rather than list order.
selected_columns = ['duration_mo', 'mos_ethnicity', 'complainant_ethnicity', 'is_force', 'is_abuse_of_authority', 'is_discourtesy', 'is_offensive_language', 'outcome_description']
df = pd.read_csv('my_dataset_logistic.csv', usecols=selected_columns)[selected_columns]

print(df.columns)
print(df.head())
print(df.describe())
print(df.isnull().sum())
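
# Optional sanity check (not in the original script): look at the class balance of
# the target, since plain accuracy can look deceptively good on an imbalanced outcome.
print(df['outcome_description'].value_counts(normalize=True))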

# Separate the features from the target column and fill missing feature values with 0
X = df.drop('outcome_description', axis=1)
y = df['outcome_description']
X = X.fillna(0)

#split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#train the model
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

#make predictions and evaluate the model
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
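
# The cross_val_score import above is otherwise unused; this is a minimal 5-fold
# cross-validation sketch on the scaled training data, as a less split-dependent
# accuracy estimate (an assumed extension, not part of the original evaluation).
cv_scores = cross_val_score(LogisticRegression(random_state=42), X_train_scaled, y_train, cv=5)
print(f'Cross-validation accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std():.2f})')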

#classification report, followed by a confusion matrix, a correlation heatmap, and the standard deviation of each variable
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=model.classes_)
plt.figure(figsize=(8, 6))
# Use model.classes_ for both the matrix and the tick labels so they stay aligned
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=model.classes_, yticklabels=model.classes_)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

#Correlation Matrix
correlation_matrix = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix')
plt.show()

#bar chart of each feature's correlation with the target, for easier comparison
target_correlations = correlation_matrix['outcome_description'].sort_values(ascending=False)
plt.figure(figsize=(10, 6))
target_correlations.drop('outcome_description').plot(kind='bar', color='blue')
plt.title('Correlations with Target Variable')
plt.xlabel('Features')
plt.ylabel('Correlation')
plt.show()

#Standard Deviation
std_dev = df.std()
print('\nStandard deviation')
print(std_dev)

#gradio implementation
#create the available options for the ethnicities
mos_ethnicity_options = ["Hispanic", "White", "Black", "Asian", "American Indian", "Other Race", "Refused", "Unknown"]
complainant_ethnicity_options = ["Hispanic", "White", "Black", "Asian", "American Indian", "Other Race", "Refused", "Unknown"]
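
# Note: encoding an ethnicity as its index in these lists (done below) assumes that
# my_dataset_logistic.csv uses the same integer mapping; if the dataset was encoded
# differently, the dropdown inputs will not line up with what the model learned.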

#define the function that makes predictions with the trained model
def predict_outcome_duration(mos_ethnicity, complainant_ethnicity, is_force, is_abuse_of_authority, is_discourtesy, is_offensive_language, duration_mo):
    try:
        #encode the ethnicity strings as their index in the options lists
        mos_ethnicity_encoded = mos_ethnicity_options.index(mos_ethnicity)
        complainant_ethnicity_encoded = complainant_ethnicity_options.index(complainant_ethnicity)

        #convert the checkbox values to 0/1
        is_force = int(is_force)
        is_abuse_of_authority = int(is_abuse_of_authority)
        is_discourtesy = int(is_discourtesy)
        is_offensive_language = int(is_offensive_language)

        #build the input in the same column order the model was trained on
        input_data = pd.DataFrame([[duration_mo, mos_ethnicity_encoded, complainant_ethnicity_encoded, is_force, is_abuse_of_authority, is_discourtesy, is_offensive_language]], columns=X.columns)
        input_scaled = scaler.transform(input_data)
        prediction = model.predict(input_scaled)[0]

        #map the numeric prediction back to a readable label
        return "Arrest" if prediction == 1 else "No Arrest"

    except Exception as e:
        return f"Error: {str(e)}"

#create the Gradio interface: dropdowns for the two ethnicities, checkboxes for the
#type of allegation, and a slider for the complaint duration in months
mos_ethnicity_dropdown = gr.Dropdown(choices=mos_ethnicity_options, label="Defendant Ethnicity")
complainant_ethnicity_dropdown = gr.Dropdown(choices=complainant_ethnicity_options, label="Complainant Ethnicity")
is_force_checkbox = gr.Checkbox(label="Force")
is_abuse_of_authority_checkbox = gr.Checkbox(label="Abuse of authority")
is_discourtesy_checkbox = gr.Checkbox(label="Discourtesy")
is_offensive_language_checkbox = gr.Checkbox(label="Offensive language")
duration_mo_slider = gr.Slider(minimum=0, maximum=20, label="Duration in months")

iface = gr.Interface(
    fn=predict_outcome_duration,
    inputs=[mos_ethnicity_dropdown, complainant_ethnicity_dropdown, is_force_checkbox, is_abuse_of_authority_checkbox, is_discourtesy_checkbox, is_offensive_language_checkbox, duration_mo_slider],
    outputs="text",
    live=True,
    title="Complaint Outcome Prediction"
)

# Launch the Gradio Interface
iface.launch(share=True)