rmdhirr committed
Commit 43010a1
1 Parent(s): c8496ce

Update app.py

Files changed (1)
  1. app.py +65 -112
app.py CHANGED
@@ -1,143 +1,96 @@
- import logging
  import gradio as gr
- import tensorflow as tf
- import numpy as np
  import nltk
  import pickle
  from nltk.corpus import stopwords
  from nltk.tokenize import word_tokenize
  from nltk.stem import WordNetLemmatizer
  from tensorflow.keras.preprocessing.sequence import pad_sequences
- import re
- from tensorflow.keras import models, optimizers
- from tensorflow.keras.metrics import Precision, Recall
-
- # Set up logging
- logging.basicConfig(level=logging.DEBUG)
-
- # Load the model
- try:
-     model = tf.keras.models.load_model('new_phishing_detection_model.keras')
-     logging.info("Model loaded successfully.")
- except Exception as e:
-     logging.error(f"Error loading model: {e}")
-
- # Compile the model with standard loss and metrics
- try:
-     model.compile(optimizer=optimizers.Adam(learning_rate=0.0005),
-                   loss='binary_crossentropy',
-                   metrics=['accuracy', Precision(), Recall()])
-     logging.info("Model compiled successfully.")
- except Exception as e:
-     logging.error(f"Error compiling model: {e}")

- # Preprocessing functions
  nltk.download('punkt')
  nltk.download('stopwords')
  nltk.download('wordnet')

  STOPWORDS = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()

  def preprocess_url(url):
-     try:
-         url = url.lower()
-         url = re.sub(r'https?://', '', url)
-         url = re.sub(r'www\.', '', url)
-         url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
-         url = re.sub(r'\s+', ' ', url).strip()
-         tokens = word_tokenize(url)
-         tokens = [word for word in tokens if word not in STOPWORDS]
-         tokens = [lemmatizer.lemmatize(word) for word in tokens]
-         return ' '.join(tokens)
-     except Exception as e:
-         logging.error(f"Error in URL preprocessing: {e}")
-         return ""

  def preprocess_html(html):
-     try:
-         html = re.sub(r'<[^>]+>', ' ', html)
-         html = html.lower()
-         html = re.sub(r'https?://', '', html)
-         html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
-         html = re.sub(r'\s+', ' ', html).strip()
-         tokens = word_tokenize(html)
-         tokens = [word for word in tokens if word not in STOPWORDS]
-         tokens = [lemmatizer.lemmatize(word) for word in tokens]
-         return ' '.join(tokens)
-     except Exception as e:
-         logging.error(f"Error in HTML preprocessing: {e}")
-         return ""

- # Define maximum lengths
  max_url_length = 180
  max_html_length = 2000
  max_words = 10000

- # Load tokenizers
- try:
-     with open('url_tokenizer.pkl', 'rb') as f:
-         url_tokenizer = pickle.load(f)
-     with open('html_tokenizer.pkl', 'rb') as f:
-         html_tokenizer = pickle.load(f)
-     logging.info("Tokenizers loaded successfully.")
- except Exception as e:
-     logging.error(f"Error loading tokenizers: {e}")
-
- def preprocess_input(input_text, tokenizer, max_length):
-     try:
-         sequences = tokenizer.texts_to_sequences([input_text])
-         padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
-         return padded_sequences
-     except Exception as e:
-         logging.error(f"Error in input preprocessing: {e}")
-         return np.zeros((1, max_length))

- def get_prediction(input_text, input_type):
-     try:
-         is_url = input_type == "URL"
-         if is_url:
-             cleaned_text = preprocess_url(input_text)
-             input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
-             input_data = [input_data, np.zeros((1, max_html_length))]  # dummy HTML input
-         else:
-             cleaned_text = preprocess_html(input_text)
-             input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
-             input_data = [np.zeros((1, max_url_length)), input_data]  # dummy URL input
-
-         prediction = model.predict(input_data)[0][0]
-         return prediction
-     except Exception as e:
-         logging.error(f"Error in prediction: {e}")
-         return 0.0

- def ensemble_prediction(input_text, input_type, n_ensemble=5):
-     try:
-         predictions = [get_prediction(input_text, input_type) for _ in range(n_ensemble)]
-         avg_prediction = np.mean(predictions)
-         return avg_prediction
-     except Exception as e:
-         logging.error(f"Error in ensemble prediction: {e}")
-         return 0.0

- def phishing_detection(input_text, input_type):
-     prediction = ensemble_prediction(input_text, input_type)
-     threshold = 0.5  # Keep the threshold unchanged
-     if prediction > threshold:
-         return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
-     else:
-         return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"
-
- iface = gr.Interface(
-     fn=phishing_detection,
      inputs=[
-         gr.components.Textbox(lines=5, placeholder="Enter URL or HTML code"),
-         gr.components.Radio(["URL", "HTML"], type="value", label="Input Type")
      ],
-     outputs=gr.components.Textbox(label="Phishing Detection Result"),
      title="Phishing Detection Model",
-     description="Check if a URL or HTML is Phishing.",
-     theme="default"
  )

- iface.launch()
  import gradio as gr
  import nltk
+ import re
  import pickle
  from nltk.corpus import stopwords
  from nltk.tokenize import word_tokenize
  from nltk.stem import WordNetLemmatizer
+ from tensorflow.keras.preprocessing.text import Tokenizer
  from tensorflow.keras.preprocessing.sequence import pad_sequences
+ from tensorflow import keras
+ import pandas as pd

+ # Ensure necessary NLTK resources are downloaded
  nltk.download('punkt')
  nltk.download('stopwords')
  nltk.download('wordnet')

+ # Load Stopwords and Initialize Lemmatizer
  STOPWORDS = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()

+ # Function to clean and preprocess URL data
  def preprocess_url(url):
+     url = url.lower()                        # Convert to lowercase
+     url = re.sub(r'https?://', '', url)      # Remove http or https
+     url = re.sub(r'www\.', '', url)          # Remove www
+     url = re.sub(r'[^a-zA-Z0-9]', ' ', url)  # Remove special characters
+     url = re.sub(r'\s+', ' ', url).strip()   # Remove extra spaces
+     tokens = word_tokenize(url)              # Tokenize
+     tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
+     tokens = [lemmatizer.lemmatize(word) for word in tokens]     # Lemmatization
+     return ' '.join(tokens)

+ # Function to clean and preprocess HTML data
  def preprocess_html(html):
+     html = re.sub(r'<[^>]+>', ' ', html)       # Remove HTML tags
+     html = html.lower()                        # Convert to lowercase
+     html = re.sub(r'https?://', '', html)      # Remove http or https
+     html = re.sub(r'[^a-zA-Z0-9]', ' ', html)  # Remove special characters
+     html = re.sub(r'\s+', ' ', html).strip()   # Remove extra spaces
+     tokens = word_tokenize(html)               # Tokenize
+     tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
+     tokens = [lemmatizer.lemmatize(word) for word in tokens]     # Lemmatization
+     return ' '.join(tokens)

+ # Load trained model
+ model = keras.models.load_model('/content/drive/MyDrive/fix_phishing_detection_model.keras')
+
+ # Define maximum length and number of words
  max_url_length = 180
  max_html_length = 2000
  max_words = 10000

+ # Load the fitted tokenizers
+ with open('url_tokenizer.pkl', 'rb') as file:
+     url_tokenizer = pickle.load(file)

+ with open('html_tokenizer.pkl', 'rb') as file:
+     html_tokenizer = pickle.load(file)

+ # Define the prediction function
+ def predict_phishing(url, html):
+     cleaned_url = preprocess_url(url)
+     cleaned_html = preprocess_html(html)
+
+     new_url_sequences = url_tokenizer.texts_to_sequences([cleaned_url])
+     new_url_padded = pad_sequences(new_url_sequences, maxlen=max_url_length, padding='post', truncating='post')
+
+     new_html_sequences = html_tokenizer.texts_to_sequences([cleaned_html])
+     new_html_padded = pad_sequences(new_html_sequences, maxlen=max_html_length, padding='post', truncating='post')
+
+     new_predictions_prob = model.predict([new_url_padded, new_html_padded])
+     new_predictions = (new_predictions_prob > 0.5).astype(int)
+
+     predicted_category = "Spam" if new_predictions[0][0] == 1 else "Legitimate"
+     predicted_probability = f"{new_predictions_prob[0][0]:.4f}"
+
+     return predicted_category, predicted_probability

+ # Create Gradio Interface
+ interface = gr.Interface(
+     fn=predict_phishing,
      inputs=[
+         gr.inputs.Textbox(label="URL"),
+         gr.inputs.Textbox(label="HTML Snippet")
+     ],
+     outputs=[
+         gr.outputs.Textbox(label="Predicted Category"),
+         gr.outputs.Textbox(label="Predicted Probability")
      ],
      title="Phishing Detection Model",
+     description="Enter a URL and its HTML content to predict if it's spam or legitimate."
  )

+ # Launch the Gradio interface
+ interface.launch()
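
A quick way to exercise the new two-output contract is to call predict_phishing directly, bypassing the Gradio UI. The sketch below is not part of the commit; it assumes the functions, tokenizers, and model above are already loaded in a Python session, and the sample URL/HTML values are made up purely for illustration.

# Illustrative smoke test (hypothetical inputs, not from the commit)
sample_url = "http://secure-login-update.example.com/verify"
sample_html = "<html><body><form action='verify.php'>Password:</form></body></html>"

category, probability = predict_phishing(sample_url, sample_html)
print(f"Predicted: {category} (p={probability})")  # prints the label and the model's probability as a string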