import gradio as gr
import nltk
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras

# Fetch the NLTK resources used below: punkt for word_tokenize,
# stopwords for filtering, wordnet for lemmatization.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess_url(url):
    # Normalize: lowercase, strip the scheme and "www.", replace every
    # non-alphanumeric character with a space, and collapse whitespace.
    url = url.lower()
    url = re.sub(r'https?://', '', url)
    url = re.sub(r'www\.', '', url)
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
    url = re.sub(r'\s+', ' ', url).strip()
    # Tokenize, drop English stopwords, lemmatize what remains.
    tokens = word_tokenize(url)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
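
# Illustrative check (hypothetical URL, not from any dataset):
#   preprocess_url("https://www.secure-login.example.com/verify?id=123")
# would yield "secure login example com verify id 123".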


def preprocess_html(html):
    # Strip tags first so only the visible text survives, then apply the
    # same normalization, stopword removal, and lemmatization as for URLs.
    html = re.sub(r'<[^>]+>', ' ', html)
    html = html.lower()
    html = re.sub(r'https?://', '', html)
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
    html = re.sub(r'\s+', ' ', html).strip()
    tokens = word_tokenize(html)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
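
# Illustrative check (hypothetical snippet, not from any dataset):
#   preprocess_html("<html><body><h1>Verify your account</h1></body></html>")
# would yield "verify account" ("your" is a stopword and is dropped).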


# Load the trained model; the predict step below assumes it takes two
# inputs, the padded URL sequence and the padded HTML sequence.
model = keras.models.load_model('new_phishing_detection_model.keras')
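
# Optional sanity check (assumes a functional two-input Keras model):
# assert len(model.inputs) == 2, "expected separate URL and HTML inputs"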


# Sequence lengths must match the values used at training time; max_words is
# kept for reference only, since the tokenizers are loaded already fitted.
max_url_length = 180
max_html_length = 2000
max_words = 10000


# Tokenizers and label encoder fitted during training; they must come from
# the same training run as the model above.
with open('url_tokenizer.pkl', 'rb') as file:
    url_tokenizer = pickle.load(file)

with open('html_tokenizer.pkl', 'rb') as file:
    html_tokenizer = pickle.load(file)

with open('label_encoder.pkl', 'rb') as file:
    label_encoder = pickle.load(file)
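
# Optional sanity check: a fitted Keras Tokenizer carries its vocabulary, so
# len(url_tokenizer.word_index) reports the token count seen during training.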


def predict_phishing(url, html):
    # Apply the same preprocessing the training data went through.
    cleaned_url = preprocess_url(url)
    cleaned_html = preprocess_html(html)

    # Map text to integer sequences and pad/truncate to the fixed lengths.
    new_url_sequences = url_tokenizer.texts_to_sequences([cleaned_url])
    new_url_padded = pad_sequences(new_url_sequences, maxlen=max_url_length,
                                   padding='post', truncating='post')

    new_html_sequences = html_tokenizer.texts_to_sequences([cleaned_html])
    new_html_padded = pad_sequences(new_html_sequences, maxlen=max_html_length,
                                    padding='post', truncating='post')

    # Threshold the predicted probability at 0.6 (stricter than the usual 0.5).
    new_predictions_prob = model.predict([new_url_padded, new_html_padded])
    new_predictions = (new_predictions_prob > 0.6).astype(int)

    # inverse_transform expects a 1-D array, so flatten the (1, 1) prediction.
    predicted_category = label_encoder.inverse_transform(new_predictions.ravel())[0]
    predicted_probability = f"{new_predictions_prob[0][0]:.4f}"

    return predicted_category.capitalize(), predicted_probability
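
# Quick smoke test outside the UI (hypothetical inputs; uncomment to run):
# category, prob = predict_phishing(
#     "http://secure-login.example.com/verify",
#     "<html><body><form>Enter your password</form></body></html>",
# )
# print(category, prob)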


interface = gr.Interface(
    fn=predict_phishing,
    inputs=[
        gr.components.Textbox(label="URL"),
        gr.components.Textbox(label="HTML Snippet")
    ],
    outputs=[
        gr.components.Textbox(label="Predicted Category"),
        gr.components.Textbox(label="Predicted Probability")
    ],
    title="Phishing Detection Model",
    description="Enter a URL and its HTML content to predict whether the page is phishing or legitimate."
)


interface.launch()
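
# launch() serves the app locally; launch(share=True) would additionally
# create a temporary public Gradio link, handy for quick demos.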