rmdhirr committed
Commit c8496ce
1 Parent(s): 8ed2c1d

Update app.py

Files changed (1)
  1. app.py +82 -44
app.py CHANGED
@@ -1,3 +1,4 @@
+import logging
 import gradio as gr
 import tensorflow as tf
 import numpy as np
@@ -8,14 +9,27 @@ from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 import re
+from tensorflow.keras import models, optimizers
+from tensorflow.keras.metrics import Precision, Recall
+
+# Set up logging
+logging.basicConfig(level=logging.DEBUG)
 
 # Load the model
-model = tf.keras.models.load_model('new_phishing_detection_model.keras')
+try:
+    model = tf.keras.models.load_model('new_phishing_detection_model.keras')
+    logging.info("Model loaded successfully.")
+except Exception as e:
+    logging.error(f"Error loading model: {e}")
 
 # Compile the model with standard loss and metrics
-model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
-              loss='binary_crossentropy',
-              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
+try:
+    model.compile(optimizer=optimizers.Adam(learning_rate=0.0005),
+                  loss='binary_crossentropy',
+                  metrics=['accuracy', Precision(), Recall()])
+    logging.info("Model compiled successfully.")
+except Exception as e:
+    logging.error(f"Error compiling model: {e}")
 
 # Preprocessing functions
 nltk.download('punkt')
@@ -26,26 +40,34 @@ STOPWORDS = set(stopwords.words('english'))
 lemmatizer = WordNetLemmatizer()
 
 def preprocess_url(url):
-    url = url.lower()
-    url = re.sub(r'https?://', '', url)
-    url = re.sub(r'www\.', '', url)
-    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
-    url = re.sub(r'\s+', ' ', url).strip()
-    tokens = word_tokenize(url)
-    tokens = [word for word in tokens if word not in STOPWORDS]
-    tokens = [lemmatizer.lemmatize(word) for word in tokens]
-    return ' '.join(tokens)
+    try:
+        url = url.lower()
+        url = re.sub(r'https?://', '', url)
+        url = re.sub(r'www\.', '', url)
+        url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
+        url = re.sub(r'\s+', ' ', url).strip()
+        tokens = word_tokenize(url)
+        tokens = [word for word in tokens if word not in STOPWORDS]
+        tokens = [lemmatizer.lemmatize(word) for word in tokens]
+        return ' '.join(tokens)
+    except Exception as e:
+        logging.error(f"Error in URL preprocessing: {e}")
+        return ""
 
 def preprocess_html(html):
-    html = re.sub(r'<[^>]+>', ' ', html)
-    html = html.lower()
-    html = re.sub(r'https?://', '', html)
-    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
-    html = re.sub(r'\s+', ' ', html).strip()
-    tokens = word_tokenize(html)
-    tokens = [word for word in tokens if word not in STOPWORDS]
-    tokens = [lemmatizer.lemmatize(word) for word in tokens]
-    return ' '.join(tokens)
+    try:
+        html = re.sub(r'<[^>]+>', ' ', html)
+        html = html.lower()
+        html = re.sub(r'https?://', '', html)
+        html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
+        html = re.sub(r'\s+', ' ', html).strip()
+        tokens = word_tokenize(html)
+        tokens = [word for word in tokens if word not in STOPWORDS]
+        tokens = [lemmatizer.lemmatize(word) for word in tokens]
+        return ' '.join(tokens)
+    except Exception as e:
+        logging.error(f"Error in HTML preprocessing: {e}")
+        return ""
 
 # Define maximum lengths
 max_url_length = 180
@@ -53,34 +75,50 @@ max_html_length = 2000
 max_words = 10000
 
 # Load tokenizers
-with open('url_tokenizer.pkl', 'rb') as f:
-    url_tokenizer = pickle.load(f)
-with open('html_tokenizer.pkl', 'rb') as f:
-    html_tokenizer = pickle.load(f)
+try:
+    with open('url_tokenizer.pkl', 'rb') as f:
+        url_tokenizer = pickle.load(f)
+    with open('html_tokenizer.pkl', 'rb') as f:
+        html_tokenizer = pickle.load(f)
+    logging.info("Tokenizers loaded successfully.")
+except Exception as e:
+    logging.error(f"Error loading tokenizers: {e}")
 
 def preprocess_input(input_text, tokenizer, max_length):
-    sequences = tokenizer.texts_to_sequences([input_text])
-    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
-    return padded_sequences
+    try:
+        sequences = tokenizer.texts_to_sequences([input_text])
+        padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
+        return padded_sequences
+    except Exception as e:
+        logging.error(f"Error in input preprocessing: {e}")
+        return np.zeros((1, max_length))
 
 def get_prediction(input_text, input_type):
-    is_url = input_type == "URL"
-    if is_url:
-        cleaned_text = preprocess_url(input_text)
-        input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
-        input_data = [input_data, np.zeros((1, max_html_length))] # dummy HTML input
-    else:
-        cleaned_text = preprocess_html(input_text)
-        input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
-        input_data = [np.zeros((1, max_url_length)), input_data] # dummy URL input
-
-    prediction = model.predict(input_data)[0][0]
-    return prediction
+    try:
+        is_url = input_type == "URL"
+        if is_url:
+            cleaned_text = preprocess_url(input_text)
+            input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
+            input_data = [input_data, np.zeros((1, max_html_length))] # dummy HTML input
+        else:
+            cleaned_text = preprocess_html(input_text)
+            input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
+            input_data = [np.zeros((1, max_url_length)), input_data] # dummy URL input
+
+        prediction = model.predict(input_data)[0][0]
+        return prediction
+    except Exception as e:
+        logging.error(f"Error in prediction: {e}")
+        return 0.0
 
 def ensemble_prediction(input_text, input_type, n_ensemble=5):
-    predictions = [get_prediction(input_text, input_type) for _ in range(n_ensemble)]
-    avg_prediction = np.mean(predictions)
-    return avg_prediction
+    try:
+        predictions = [get_prediction(input_text, input_type) for _ in range(n_ensemble)]
+        avg_prediction = np.mean(predictions)
+        return avg_prediction
+    except Exception as e:
+        logging.error(f"Error in ensemble prediction: {e}")
+        return 0.0
 
 def phishing_detection(input_text, input_type):
     prediction = ensemble_prediction(input_text, input_type)
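
The Gradio wiring for phishing_detection is outside the hunks shown above. A minimal sketch of how the function could be exposed through gr.Interface, assuming component labels and a 0.5 decision threshold that are not part of this commit:

import gradio as gr

def phishing_detection_ui(input_text, input_type):
    # Wraps the phishing_detection/ensemble_prediction pipeline from app.py;
    # the threshold and label strings here are illustrative only.
    score = ensemble_prediction(input_text, input_type)
    label = "Phishing" if score > 0.5 else "Legitimate"
    return f"{label} (score: {score:.2f})"

demo = gr.Interface(
    fn=phishing_detection_ui,
    inputs=[
        gr.Textbox(label="URL or HTML", lines=5),       # free-form input
        gr.Radio(["URL", "HTML"], label="Input type"),  # selects preprocessing path
    ],
    outputs=gr.Textbox(label="Prediction"),
    title="Phishing Detection",
)

if __name__ == "__main__":
    demo.launch()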