import re

import gradio as gr
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
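
# Load the trained phishing-detection model. Re-compiling after loading only
# resets the optimizer and metric objects; the saved weights are untouched.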
model = tf.keras.models.load_model('new_phishing_detection_model.keras')

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
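
# Fetch the NLTK resources used by the tokenizer and lemmatizer below.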
nltk.download('punkt')
nltk.download('punkt_tab')  # required by word_tokenize on newer NLTK releases
nltk.download('stopwords')
nltk.download('wordnet')

STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
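
# Normalise a URL: strip the scheme and "www.", keep alphanumerics only,
# then tokenize, drop stopwords, and lemmatize.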
def preprocess_url(url):
    url = url.lower()
    url = re.sub(r'https?://', '', url)
    url = re.sub(r'www\.', '', url)
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
    url = re.sub(r'\s+', ' ', url).strip()
    tokens = word_tokenize(url)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
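
# Normalise an HTML document: drop tags and URL schemes, keep alphanumerics,
# then apply the same tokenize / stopword / lemmatize pipeline.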
def preprocess_html(html):
    html = re.sub(r'<[^>]+>', ' ', html)
    html = html.lower()
    html = re.sub(r'https?://', '', html)
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
    html = re.sub(r'\s+', ' ', html).strip()
    tokens = word_tokenize(html)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
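
# Sequence-length and vocabulary limits; these must match the values used in
# training. The CSVs below are the original training data, reloaded so the
# tokenizers can be rebuilt.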
max_url_length = 180
max_html_length = 2000
max_words = 10000

url_df = pd.read_csv('url_data.csv')
html_df = pd.read_csv('html_data.csv')
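
# Rebuild the tokenizers by re-fitting on the cleaned data. Note: this only
# reproduces the training-time token indices if the CSVs are unchanged since
# the model was trained.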
url_df['Cleaned_Data'] = url_df['Data'].apply(preprocess_url)
html_df['Cleaned_Data'] = html_df['Data'].apply(preprocess_html)

url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
url_tokenizer.fit_on_texts(url_df['Cleaned_Data'])
url_sequences = url_tokenizer.texts_to_sequences(url_df['Cleaned_Data'])
url_padded = pad_sequences(url_sequences, maxlen=max_url_length, padding='post', truncating='post')
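
# HTML uses a word-level tokenizer, unlike the character-level URL tokenizer.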
html_tokenizer = Tokenizer(num_words=max_words)
html_tokenizer.fit_on_texts(html_df['Cleaned_Data'])
html_sequences = html_tokenizer.texts_to_sequences(html_df['Cleaned_Data'])
html_padded = pad_sequences(html_sequences, maxlen=max_html_length, padding='post', truncating='post')
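
# Encode the string labels to integers. The encoder is fitted on the URL
# categories and reused for HTML, which assumes both files share the same set
# of category values (transform raises on unseen labels).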
label_encoder = LabelEncoder()
url_df['Category_Encoded'] = label_encoder.fit_transform(url_df['Category'])
html_df['Category_Encoded'] = label_encoder.transform(html_df['Category'])

url_X_train, url_X_test, url_y_train, url_y_test = train_test_split(
    url_padded, url_df['Category_Encoded'], test_size=0.2, random_state=42)
html_X_train, html_X_test, html_y_train, html_y_test = train_test_split(
    html_padded, html_df['Category_Encoded'], test_size=0.2, random_state=42)
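
# Note: the train/test splits above are not used anywhere in this app.

# Pad a single preprocessed string to the fixed length the model expects.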
def preprocess_input(input_text, tokenizer, max_length):
    sequences = tokenizer.texts_to_sequences([input_text])
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    return padded_sequences
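
# Run one forward pass. The model expects two inputs (URL branch and HTML
# branch); the branch not in use is fed an all-zero placeholder array.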
def get_prediction(input_text, input_type):
    is_url = input_type == "URL"
    if is_url:
        cleaned_text = preprocess_url(input_text)
        input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
        input_data = [input_data, np.zeros((1, max_html_length))]
    else:
        cleaned_text = preprocess_html(input_text)
        input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
        input_data = [np.zeros((1, max_url_length)), input_data]

    prediction = model.predict(input_data)[0][0]
    return prediction
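
# Average n_ensemble forward passes. With a standard Keras model, predict()
# is deterministic, so every run returns the same score; the averaging only
# adds diversity if stochastic layers (e.g. dropout) stay active at inference.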
def ensemble_prediction(input_text, input_type, n_ensemble=5):
    predictions = [get_prediction(input_text, input_type) for _ in range(n_ensemble)]
    avg_prediction = np.mean(predictions)
    return avg_prediction
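
# Turn the sigmoid score into a user-facing verdict at a 0.5 threshold.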
def phishing_detection(input_text, input_type):
    prediction = ensemble_prediction(input_text, input_type)
    threshold = 0.5
    if prediction > threshold:
        return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
    else:
        return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"
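
# Gradio UI: one textbox for the raw URL or HTML, plus a radio toggle that
# selects which preprocessing branch to use.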
iface = gr.Interface(
    fn=phishing_detection,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter URL or HTML code"),
        gr.Radio(["URL", "HTML"], type="value", label="Input Type"),
    ],
    outputs=gr.Textbox(label="Phishing Detection Result"),
    title="Phishing Detection Model",
    description="Check whether a URL or an HTML page is phishing.",
    theme="default",
)

iface.launch()