import streamlit as st
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the non-anomaly data
non_anomaly_csv_filename = 'non_anomaly_data.csv'
non_anomaly_df = pd.read_csv(non_anomaly_csv_filename)
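
# NOTE (assumption): non_anomaly_data.csv is expected to contain only the seven
# feature columns used at training time (transaction_dollar_amount, Long, Lat,
# credit_card_limit, year, month, day), in the order the scaler was fit on;
# extra or reordered columns could make scaler.transform below fail or silently mis-scale.
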
# Load the Isolation Forest model
model_filename = "IsolationForest.joblib"
isolation_forest = joblib.load(model_filename)
# Load the StandardScaler
scaler_filename = "StandardScaler.joblib"
scaler = joblib.load(scaler_filename)
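
# NOTE (assumption): both .joblib artifacts were saved with a scikit-learn version
# compatible with the one installed here, and the scaler was fit on the same seven
# features as the model. A lightweight, optional sanity check:
if getattr(scaler, "n_features_in_", 7) != 7:
    st.warning("StandardScaler appears to have been fit on a different number of features than this app expects.")
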
st.title("Anomaly Detection App with Isolation Forest")
st.sidebar.title("Input Feature Values")
transaction_dollar_amount = st.sidebar.slider("Transaction Dollar Amount", min_value=0.0, max_value=10000.0)
longitude = st.sidebar.slider("Longitude (Long)", min_value=-180.0, max_value=180.0)
latitude = st.sidebar.slider("Latitude (Lat)", min_value=-90.0, max_value=90.0)
credit_card_limit = st.sidebar.slider("Credit Card Limit", min_value=0, max_value=50000)
year = st.sidebar.slider("Year", min_value=2000, max_value=2030)
month = st.sidebar.slider("Month", min_value=1, max_value=12)
day = st.sidebar.slider("Day", min_value=1, max_value=31)
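# Note: Streamlit sliders default to min_value when no explicit value is given,
# so untouched sliders submit their minimum (e.g. a $0.00 transaction amount).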
submitted = st.sidebar.button("Submit")
if submitted:
    input_data = {
        'transaction_dollar_amount': transaction_dollar_amount,
        'Long': longitude,
        'Lat': latitude,
        'credit_card_limit': credit_card_limit,
        'year': year,
        'month': month,
        'day': day
    }
    selected_columns = pd.DataFrame([input_data])

    # Standardize the input data using the loaded StandardScaler
    selected_columns_scaled = scaler.transform(selected_columns)

    # Score the non-anomaly reference dataset with the Isolation Forest
    non_anomaly_scores = isolation_forest.decision_function(scaler.transform(non_anomaly_df))

    # Score the single input data point
    your_anomaly_score = isolation_forest.decision_function(selected_columns_scaled)[0]
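    # Note: for scikit-learn's IsolationForest, decision_function returns higher
    # scores for more "normal" points; scores below 0 are what the model's own
    # predict() would label as outliers.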

    # Calculate the minimum and maximum anomaly scores observed on the non-anomaly data
    min_non_anomaly_score = np.min(non_anomaly_scores)
    max_non_anomaly_score = np.max(non_anomaly_scores)

    # Add a margin of error around that range
    margin = 0.5
    min_threshold = min_non_anomaly_score - margin
    max_threshold = max_non_anomaly_score + margin

    # Flag the input as an anomaly if its score falls outside the widened range
    # (an alternative rule would compare against a percentile of the reference
    # scores, e.g. np.percentile(non_anomaly_scores, 95))
    is_anomaly = your_anomaly_score < min_threshold or your_anomaly_score > max_threshold
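    # This band check is a heuristic specific to this app: it flags scores that fall
    # outside the score range seen on non-anomalous data (widened by `margin`),
    # rather than relying on the model's built-in threshold of 0.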

    # Print the anomaly status
    st.subheader("Anomaly Classification")
    if is_anomaly:
        st.write("Prediction Result: 🚨 Anomaly Detected!")
    else:
        st.write("Prediction Result: ✅ Not Anomaly")

    # Create a histogram to visualize the anomaly score distribution and the input point's score
    hist_fig, hist_ax = plt.subplots(figsize=(8, 5))

    # Plot the distribution of anomaly scores from the non-anomaly dataset
    sns.histplot(non_anomaly_scores, kde=True, color='gray', label='Non-Anomaly Score Distribution', ax=hist_ax)

    # Mark the input data point's anomaly score
    hist_ax.axvline(x=your_anomaly_score, color='blue', linestyle='dashed', label='Your Data Point')

    # Set labels and title
    hist_ax.set_xlabel('Anomaly Score')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Anomaly Score Distribution and Your Data Point')
    hist_ax.legend()
    # hist_ax.grid(True)

    # Display the histogram plot
    st.pyplot(hist_fig)
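    # Using an explicit Figure/Axes pair (rather than pyplot's implicit global
    # figure) keeps plot state from leaking between Streamlit reruns.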

    # Explain the results
    st.write("The input data point has been classified as an anomaly." if is_anomaly
             else "The input data point is not classified as an anomaly.")
    st.write("The anomaly score is:", your_anomaly_score)
    st.write("The threshold range for anomaly detection is:", min_threshold, "to", max_threshold)

    # Create a scatter plot for longitude and latitude
    fig, ax = plt.subplots(figsize=(10, 8))

    # Plot the non-anomaly data
    sns.scatterplot(data=non_anomaly_df, x='Long', y='Lat', color='lightgrey', label='Normal 🏙️', ax=ax)

    # Plot the input data point
    if is_anomaly:
        ax.scatter(selected_columns['Long'], selected_columns['Lat'], color='red', label='Suspicious 🚩', s=100, marker='x')
        anomaly_marker = 'Suspicious 🚩'
    else:
        ax.scatter(selected_columns['Long'], selected_columns['Lat'], color='green', label='Valid ✅', s=100, marker='o')
        anomaly_marker = 'Valid ✅'

    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.set_title("Location Plot: Anomaly Detection 🗺️")
    ax.legend()
    ax.grid(True)

    # Show the scatter plot in Streamlit
    st.subheader("Location Plot: Anomaly Detection 🗺️")
    st.pyplot(fig)

    # Explanation based on the anomaly classification
    st.subheader("Anomaly Classification")
    if is_anomaly:
        st.write("Prediction Result: 🚨 Anomaly Detected!")
    else:
        st.write("Prediction Result: ✅ Not Anomaly")

    # Explain the results
    st.write("The location plot visualizes the anomaly detection result based on longitude and latitude.")
    if is_anomaly:
        st.write("The input data point is marked as Suspicious 🚩 due to its anomaly score.")
        st.write("The red 'x' marker indicates a suspicious location.")
    else:
        st.write("The input data point is marked as Valid ✅ due to its anomaly score.")
        st.write("The green 'o' marker indicates a valid location.")