import re
import pickle

import gradio as gr
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
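# Note (assumption): newer NLTK releases (3.9+) resolve word_tokenize through
# the 'punkt_tab' resource; uncomment if tokenization raises a LookupError.
# nltk.download('punkt_tab')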

# Load Stopwords and Initialize Lemmatizer
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean and preprocess URL data
def preprocess_url(url):
    url = url.lower()  # Convert to lowercase
    url = re.sub(r'https?://', '', url)  # Remove http or https
    url = re.sub(r'www\.', '', url)  # Remove www
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)  # Replace special characters with spaces
    url = re.sub(r'\s+', ' ', url).strip()  # Remove extra spaces
    tokens = word_tokenize(url)  # Tokenize
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return ' '.join(tokens)

# Function to clean and preprocess HTML data
def preprocess_html(html):
    html = re.sub(r'<[^>]+>', ' ', html)  # Remove HTML tags
    html = html.lower()  # Convert to lowercase
    html = re.sub(r'https?://', '', html)  # Remove http or https
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)  # Replace special characters with spaces
    html = re.sub(r'\s+', ' ', html).strip()  # Remove extra spaces
    tokens = word_tokenize(html)  # Tokenize
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return ' '.join(tokens)

# Load trained model
model = keras.models.load_model('new_phishing_detection_model.keras')

# Maximum sequence lengths (must match the lengths used in training)
max_url_length = 180
max_html_length = 2000
max_words = 10000  # Vocabulary cap; unused here since the fitted tokenizers are loaded below

# Load the fitted tokenizers
with open('url_tokenizer.pkl', 'rb') as file:
    url_tokenizer = pickle.load(file)

with open('html_tokenizer.pkl', 'rb') as file:
    html_tokenizer = pickle.load(file)
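
# These pickles are assumed to hold the exact tokenizers fitted on the
# training corpus; re-fitting fresh ones would change the word-index
# mapping the model expects.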

# Define the prediction function
def predict_phishing(url, html):
    cleaned_url = preprocess_url(url)
    cleaned_html = preprocess_html(html)
    
    new_url_sequences = url_tokenizer.texts_to_sequences([cleaned_url])
    new_url_padded = pad_sequences(new_url_sequences, maxlen=max_url_length, padding='post', truncating='post')
    
    new_html_sequences = html_tokenizer.texts_to_sequences([cleaned_html])
    new_html_padded = pad_sequences(new_html_sequences, maxlen=max_html_length, padding='post', truncating='post')
    
    new_predictions_prob = model.predict([new_url_padded, new_html_padded])
    new_predictions = (new_predictions_prob > 0.5).astype(int)  # Binary decision at the 0.5 threshold
    
    predicted_category = "Spam" if new_predictions[0][0] == 1 else "Legitimate"
    predicted_probability = f"{new_predictions_prob[0][0]:.4f}"
    
    return predicted_category, predicted_probability
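
# Quick sanity check with hypothetical inputs, independent of the Gradio UI:
# category, probability = predict_phishing(
#     "http://secure-login.example.com/verify",
#     "<html><body><form>Enter your password</form></body></html>")
# print(category, probability)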

# Create Gradio Interface
interface = gr.Interface(
    fn=predict_phishing,
    inputs=[
        gr.components.Textbox(label="URL"),
        gr.components.Textbox(label="HTML Snippet")
    ],
    outputs=[
        gr.components.Textbox(label="Predicted Category"),
        gr.components.Textbox(label="Predicted Probability")
    ],
    title="Phishing Detection Model",
    description="Enter a URL and its HTML content to predict if it's spam or legitimate."
)

# Launch the Gradio interface
interface.launch()