import gradio as gr
import nltk
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras

# Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
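
# Note (environment assumption): NLTK 3.9+ loads tokenizer data from 'punkt_tab'
# rather than 'punkt'; downloading both keeps word_tokenize working across versions.
nltk.download('punkt_tab')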

# Load Stopwords and Initialize Lemmatizer
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean and preprocess URL data
def preprocess_url(url):
    url = url.lower()  # Convert to lowercase
    url = re.sub(r'https?://', '', url)  # Remove http or https
    url = re.sub(r'www\.', '', url)  # Remove www
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)  # Remove special characters
    url = re.sub(r'\s+', ' ', url).strip()  # Remove extra spaces
    tokens = word_tokenize(url)  # Tokenize
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return ' '.join(tokens)
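
# For illustration, a hypothetical input/output pair:
#   preprocess_url("https://www.example.com/Secure-Login")  ->  "example com secure login"
# (the scheme, "www.", and punctuation are stripped; tokens are lowercased,
# stopword-filtered, and lemmatized)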

# Function to clean and preprocess HTML data
def preprocess_html(html):
    html = re.sub(r'<[^>]+>', ' ', html)  # Remove HTML tags
    html = html.lower()  # Convert to lowercase
    html = re.sub(r'https?://', '', html)  # Remove http or https
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)  # Remove special characters
    html = re.sub(r'\s+', ' ', html).strip()  # Remove extra spaces
    tokens = word_tokenize(html)  # Tokenize
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return ' '.join(tokens)

# Load trained model
model = keras.models.load_model('new_phishing_detection_model.keras')
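
# The saved model is expected to take two inputs, [url_sequences, html_sequences],
# matching the model.predict([...]) call in predict_phishing below.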

# Maximum sequence lengths (must match the values used during training)
max_url_length = 180
max_html_length = 2000

# Load the fitted tokenizers
with open('url_tokenizer.pkl', 'rb') as file:
    url_tokenizer = pickle.load(file)

with open('html_tokenizer.pkl', 'rb') as file:
    html_tokenizer = pickle.load(file)

# Load the label encoder
with open('label_encoder.pkl', 'rb') as file:
    label_encoder = pickle.load(file)
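
# These pickled artifacts must be the exact tokenizers and label encoder fitted
# during training; sequences built with a differently fitted tokenizer would not
# line up with the vocabulary the model learned.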

# Define the prediction function
def predict_phishing(url, html):
    cleaned_url = preprocess_url(url)
    cleaned_html = preprocess_html(html)

    new_url_sequences = url_tokenizer.texts_to_sequences([cleaned_url])
    new_url_padded = pad_sequences(new_url_sequences, maxlen=max_url_length, padding='post', truncating='post')

    new_html_sequences = html_tokenizer.texts_to_sequences([cleaned_html])
    new_html_padded = pad_sequences(new_html_sequences, maxlen=max_html_length, padding='post', truncating='post')

    new_predictions_prob = model.predict([new_url_padded, new_html_padded])
    new_predictions = (new_predictions_prob > 0.6).astype(int).ravel()  # Flatten to 1-D for inverse_transform; adjust the 0.6 threshold if needed

    predicted_category = label_encoder.inverse_transform(new_predictions)[0]
    predicted_probability = f"{new_predictions_prob[0][0]:.4f}"

    return predicted_category.capitalize(), predicted_probability
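
# A minimal sketch of a direct call (hypothetical example inputs), useful as a
# quick smoke test without launching the web UI:
#   category, probability = predict_phishing(
#       "http://example.com/verify-account",
#       "<html><body><form>Enter your password</form></body></html>",
#   )
#   print(category, probability)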

# Create Gradio Interface
interface = gr.Interface(
    fn=predict_phishing,
    inputs=[
        gr.components.Textbox(label="URL"),
        gr.components.Textbox(label="HTML Snippet", lines=10, placeholder="Paste HTML content here")
    ],
    outputs=[
        gr.components.Textbox(label="Predicted Category"),
        gr.components.Textbox(label="Predicted Probability")
    ],
    title="Phishing Detection Model",
    description="Enter a URL and its HTML content to predict if it's spam or legitimate. It's recommended to provide both for accurate results.",
    live=True,  # Re-runs the prediction automatically whenever an input changes
    css="""
    .interface-container { 
        border: 2px solid #4CAF50; 
        border-radius: 10px; 
        padding: 20px; 
        text-align: center;
    }
    .gr-textbox, .gr-textbox textarea, .gr-button {
        margin-left: auto !important;
        margin-right: auto !important;
    }
    """
)

# Footer text
footer = gr.Markdown("""
---
<div style="text-align: center;">
    Made with ❤️ by Ramadhirra<br>
    Model by Ramadhirra<br>
    WebUI by Ramadhirra
</div>
""")

# Combine the interface and footer
app = gr.Blocks()

with app:
    interface.render()
    footer.render()

# Launch the Gradio interface
if __name__ == "__main__":
    app.launch()