Update app.py

app.py CHANGED
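
Note: the hunk below shows only `import warnings` as context, so the added code relies on imports that presumably already sit above line 47 of app.py. A minimal sketch of what that import block would have to contain for the new code to run (module names inferred from the calls in the diff; the `MCD` import path is an assumption, taking it to be pyod's Minimum Covariance Determinant detector):

    # Imports presumed to exist earlier in app.py (inferred from the added code, not shown in this hunk)
    import warnings

    import gradio as gr
    import pandas as pd
    import rdata    # parses the downloaded .Rdata file
    import wget     # fetches the dataset

    from pyod.models.mcd import MCD  # assumption: pyod's MCD outlier detector
    from sklearn.compose import make_column_transformer
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
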
@@ -47,9 +47,154 @@ import warnings
 # Ignore all warnings
 warnings.filterwarnings("ignore")
 
-
-#
-
-
+
+# Download the dataset
+url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
+dst_path = "./creditcard.Rdata"
+wget.download(url, dst_path)
+
+# Load the dataset
+parsed_res = rdata.parser.parse_file(dst_path)
+res = rdata.conversion.convert(parsed_res)
+dataset = res['creditcard'].reset_index(drop=True).drop(['Time'], axis=1)
+
+# Prepare the data
+y = dataset['Class'].astype(int)  # Convert labels to integers
+df = dataset.drop(['Class'], axis=1)
+df.columns = df.columns.astype(str)
+
+print("Data subsets created")
+
+# Split the data
+X_train, X_test, y_train, y_test = train_test_split(df, y, train_size=0.6, random_state=0, stratify=y)
+X_train, _, y_train, _ = train_test_split(X_train, y_train, train_size=0.2, random_state=0, stratify=y_train)
+
+# Reset indices
+X_train.reset_index(drop=True, inplace=True)
+y_train.reset_index(drop=True, inplace=True)
+
+# Define the numerical features and the pipeline for numerical features
+numerical_features = [f'V{i}' for i in range(1, 29)] + ['Amount']
+
+numerical_pipeline = make_pipeline(
+    StandardScaler()  # Standardize numerical features
+)
+
+# Create a column transformer named preprocessor that applies the numerical pipeline to the numerical features
+preprocessor = make_column_transformer(
+    (numerical_pipeline, numerical_features)
+)
+
+# Create the model
+clf = MCD()
+
+# Create a pipeline combining the preprocessing step (scaling) with the MCD model
+model_pipeline = make_pipeline(
+    preprocessor,  # Apply preprocessing steps
+    clf  # Train the MCD model
+)
+
+print("Preprocessing Data")
+
+# Fit the model, then predict anomalies on the test set
+model_pipeline.fit(X_train)
+y_test_pred = model_pipeline.predict(X_test)
+
+# Define the predict function
+def predict(csv_filename):
+    # Read the CSV file
+    df = pd.read_csv(csv_filename, header=None)
+
+    # Convert the first row of the DataFrame to a list of floats
+    client_data = df.iloc[0].tolist()
+
+    # Check that the file holds exactly 29 values (V1-V28 plus Amount)
+    if len(client_data) != 29:
+        raise ValueError("The CSV file must contain exactly 29 values.")
+
+    # Unpack the list of values
+    V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, Amount = client_data
+
+    # Create the data dictionary
+    data = {
+        'V1': V1, 'V2': V2, 'V3': V3, 'V4': V4, 'V5': V5, 'V6': V6, 'V7': V7, 'V8': V8, 'V9': V9, 'V10': V10,
+        'V11': V11, 'V12': V12, 'V13': V13, 'V14': V14, 'V15': V15, 'V16': V16, 'V17': V17, 'V18': V18, 'V19': V19, 'V20': V20,
+        'V21': V21, 'V22': V22, 'V23': V23, 'V24': V24, 'V25': V25, 'V26': V26, 'V27': V27, 'V28': V28, 'Amount': Amount
+    }
+
+    # Convert the data dictionary to a DataFrame
+    input_df = pd.DataFrame([data])
+
+    # Make predictions using the trained model
+    prediction = model_pipeline.predict(input_df)
+
+    return prediction[0], Amount  # Return both the prediction and Amount
+
+# Define a function to map the names to their respective CSV filenames
+def get_csv_filename(name):
+    name_to_filename = {
+        'Ted': 'Ted.csv',
+        'Bill': 'Bill.csv',
+        'Jill': 'Jill.csv',
+        'Juan': 'Juan.csv'
+    }
+    return name_to_filename.get(name, 'Ted.csv')  # Default to 'Ted.csv' if name not found
+
+# Define the Gradio interface function for single prediction
+def gradio_predict(name):
+    csv_filename = get_csv_filename(name)
+    prediction, amount = predict(csv_filename)
+    return f"The flagged transaction amount is {amount} and the prediction is {prediction}"
+
+# Define the function for bulk analysis
+def bulk_analysis(file):
+    # Read the uploaded CSV file
+    df = pd.read_csv(file.name)
+
+    # Assuming the last column is the 'Class' label and the rest are features
+    X_test = df.iloc[:, :-1]
+    y_test = df.iloc[:, -1]
+
+    # Make predictions using the trained model
+    y_test_pred = model_pipeline.predict(X_test)
+
+    # Debugging: print counts of anomalies in actual and predicted labels
+    actual_anomalies = sum(y_test == 1)
+    predicted_anomalies = sum(y_test_pred == 1)
+    print(f"Actual anomalies: {actual_anomalies}, Predicted anomalies: {predicted_anomalies}")
+
+    # Find rows where actual and predicted are both 1
+    correctly_predicted_anomalies = X_test[(y_test == 1) & (y_test_pred == 1)]
+    print(f"Correctly predicted anomalies: {len(correctly_predicted_anomalies)}")
+
+    # Save the results to a CSV file
+    #result_filename = "correct_anomalies.csv"
+    #correctly_predicted_anomalies.to_csv(result_filename, index=False)
+
+    #return result_filename  # Return the path to the saved file
+
+
+# Create the Gradio interface
+iface = gr.Interface(
+    fn=gradio_predict,
+    inputs=gr.Dropdown(choices=['Ted', 'Bill', 'Jill', 'Juan'], label="Select a name"),
+    outputs="text"
+)
+
+# Add the bulk analysis upload interface
+bulk_iface = gr.Interface(
+    fn=bulk_analysis,
+    inputs=gr.File(label="Bulk Analysis"),
+    outputs="text"
+)
+
+# Combine the interfaces
+combined_iface = gr.TabbedInterface(
+    [iface, bulk_iface],
+    tab_names=["Single Prediction", "Bulk Analysis"]
+)
+
+# Launch the interface
+combined_iface.launch(share=True)
 
 
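
For the single-prediction tab to work, each of Ted.csv, Bill.csv, Jill.csv and Juan.csv must hold one header-less row of 29 numbers (V1 through V28, then Amount), since predict() reads the file with header=None and unpacks exactly 29 values. A hypothetical example file could be generated like this (the values are made up purely for illustration):

    import numpy as np
    import pandas as pd

    # One fabricated transaction: 28 PCA components plus an Amount of 149.62
    row = np.random.randn(28).tolist() + [149.62]
    pd.DataFrame([row]).to_csv("Ted.csv", index=False, header=False)

The bulk-analysis tab expects the opposite layout: a CSV with a header row naming the feature columns ('V1' through 'V28' and 'Amount', which the column transformer selects by name) plus a final 0/1 'Class' column that bulk_analysis() treats as the ground-truth label.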