Update app.py

app.py CHANGED
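
Note: the hunk below shows only `import warnings` as context, so the added code relies on imports that presumably already sit above line 47 of app.py. A minimal sketch of what that import block would have to contain for the new code to run (module names inferred from the calls in the diff; the `MCD` import path is an assumption, taking it to be pyod's Minimum Covariance Determinant detector):

    # Imports presumed to exist earlier in app.py (inferred from the added code, not shown in this hunk)
    import warnings

    import gradio as gr
    import pandas as pd
    import rdata    # parses the downloaded .Rdata file
    import wget     # fetches the dataset

    from pyod.models.mcd import MCD  # assumption: pyod's MCD outlier detector
    from sklearn.compose import make_column_transformer
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
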
@@ -47,9 +47,154 @@ import warnings
 # Ignore all warnings
 warnings.filterwarnings("ignore")
 
-
-#
-
-
+
+# Download the dataset
+url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
+dst_path = "./creditcard.Rdata"
+wget.download(url, dst_path)
+
+# Load the dataset
+parsed_res = rdata.parser.parse_file(dst_path)
+res = rdata.conversion.convert(parsed_res)
+dataset = res['creditcard'].reset_index(drop=True).drop(['Time'], axis=1)
+
+# Prepare the data
+y = dataset['Class'].astype(int)  # Convert labels to integers
+df = dataset.drop(['Class'], axis=1)
+df.columns = df.columns.astype(str)
+
+print("Data subsets created")
+
+# Split the data
+X_train, X_test, y_train, y_test = train_test_split(df, y, train_size=0.6, random_state=0, stratify=y)
+X_train, _, y_train, _ = train_test_split(X_train, y_train, train_size=0.2, random_state=0, stratify=y_train)
+
+# Reset indices
+X_train.reset_index(drop=True, inplace=True)
+y_train.reset_index(drop=True, inplace=True)
+
+# Define the numerical features and the pipeline for numerical features
+numerical_features = [f'V{i}' for i in range(1, 29)] + ['Amount']
+
+numerical_pipeline = make_pipeline(
+    StandardScaler()  # Standardize numerical features
+)
+
+# Create a column transformer named preprocessor that applies the numerical pipeline to the numerical features
+preprocessor = make_column_transformer(
+    (numerical_pipeline, numerical_features)
+)
+
+# Create the model
+clf = MCD()
+
+# Create a pipeline combining the preprocessing step (scaling) with the MCD model
+model_pipeline = make_pipeline(
+    preprocessor,  # Apply preprocessing steps
+    clf  # Train the MCD model
+)
+
+print("Preprocessing Data")
+
+# Fit the model, then predict anomalies on the test set
+model_pipeline.fit(X_train)
+y_test_pred = model_pipeline.predict(X_test)
+
+# Define the predict function
+def predict(csv_filename):
+    # Read the CSV file
+    df = pd.read_csv(csv_filename, header=None)
+
+    # Convert the first row of the DataFrame to a list of floats
+    client_data = df.iloc[0].tolist()
+
+    # Check that the file holds exactly 29 values (V1-V28 plus Amount)
+    if len(client_data) != 29:
+        raise ValueError("The CSV file must contain exactly 29 values.")
+
+    # Unpack the list of values
+    V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, Amount = client_data
+
+    # Create the data dictionary
+    data = {
+        'V1': V1, 'V2': V2, 'V3': V3, 'V4': V4, 'V5': V5, 'V6': V6, 'V7': V7, 'V8': V8, 'V9': V9, 'V10': V10,
+        'V11': V11, 'V12': V12, 'V13': V13, 'V14': V14, 'V15': V15, 'V16': V16, 'V17': V17, 'V18': V18, 'V19': V19, 'V20': V20,
+        'V21': V21, 'V22': V22, 'V23': V23, 'V24': V24, 'V25': V25, 'V26': V26, 'V27': V27, 'V28': V28, 'Amount': Amount
+    }
+
+    # Convert the data dictionary to a DataFrame
+    input_df = pd.DataFrame([data])
+
+    # Make predictions using the trained model
+    prediction = model_pipeline.predict(input_df)
+
+    return prediction[0], Amount  # Return both the prediction and Amount
+
+# Define a function to map the names to their respective CSV filenames
+def get_csv_filename(name):
+    name_to_filename = {
+        'Ted': 'Ted.csv',
+        'Bill': 'Bill.csv',
+        'Jill': 'Jill.csv',
+        'Juan': 'Juan.csv'
+    }
+    return name_to_filename.get(name, 'Ted.csv')  # Default to 'Ted.csv' if name not found
+
+# Define the Gradio interface function for single prediction
+def gradio_predict(name):
+    csv_filename = get_csv_filename(name)
+    prediction, amount = predict(csv_filename)
+    return f"The flagged transaction amount is {amount} and the prediction is {prediction}"
+
+# Define the function for bulk analysis
+def bulk_analysis(file):
+    # Read the uploaded CSV file
+    df = pd.read_csv(file.name)
+
+    # Assuming the last column is the 'Class' label and the rest are features
+    X_test = df.iloc[:, :-1]
+    y_test = df.iloc[:, -1]
+
+    # Make predictions using the trained model
+    y_test_pred = model_pipeline.predict(X_test)
+
+    # Debugging: print counts of anomalies in actual and predicted labels
+    actual_anomalies = sum(y_test == 1)
+    predicted_anomalies = sum(y_test_pred == 1)
+    print(f"Actual anomalies: {actual_anomalies}, Predicted anomalies: {predicted_anomalies}")
+
+    # Find rows where actual and predicted are both 1
+    correctly_predicted_anomalies = X_test[(y_test == 1) & (y_test_pred == 1)]
+    print(f"Correctly predicted anomalies: {len(correctly_predicted_anomalies)}")
+
+    # Save the results to a CSV file
+    #result_filename = "correct_anomalies.csv"
+    #correctly_predicted_anomalies.to_csv(result_filename, index=False)
+
+    #return result_filename  # Return the path to the saved file
+
+
+# Create the Gradio interface
+iface = gr.Interface(
+    fn=gradio_predict,
+    inputs=gr.Dropdown(choices=['Ted', 'Bill', 'Jill', 'Juan'], label="Select a name"),
+    outputs="text"
+)
+
+# Add the bulk analysis upload interface
+bulk_iface = gr.Interface(
+    fn=bulk_analysis,
+    inputs=gr.File(label="Bulk Analysis"),
+    outputs="text"
+)
+
+# Combine the interfaces
+combined_iface = gr.TabbedInterface(
+    [iface, bulk_iface],
+    tab_names=["Single Prediction", "Bulk Analysis"]
+)
+
+# Launch the interface
+combined_iface.launch(share=True)
 
 
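
For the single-prediction tab to work, each of Ted.csv, Bill.csv, Jill.csv and Juan.csv must hold one header-less row of 29 numbers (V1 through V28, then Amount), since predict() reads the file with header=None and unpacks exactly 29 values. A hypothetical example file could be generated like this (the values are made up purely for illustration):

    import numpy as np
    import pandas as pd

    # One fabricated transaction: 28 PCA components plus an Amount of 149.62
    row = np.random.randn(28).tolist() + [149.62]
    pd.DataFrame([row]).to_csv("Ted.csv", index=False, header=False)

The bulk-analysis tab expects the opposite layout: a CSV with a header row naming the feature columns ('V1' through 'V28' and 'Amount', which the column transformer selects by name) plus a final 0/1 'Class' column that bulk_analysis() treats as the ground-truth label.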