rmdhirr committed on
Commit
37db18f
1 Parent(s): 37ab448

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -27
app.py CHANGED
@@ -8,7 +8,6 @@ from nltk.tokenize import word_tokenize
8
  from nltk.stem import WordNetLemmatizer
9
  from tensorflow.keras.preprocessing.sequence import pad_sequences
10
  import re
11
- from urllib.parse import urlparse
12
 
13
  # Load the model
14
  model = tf.keras.models.load_model('new_phishing_detection_model.keras')
@@ -26,23 +25,17 @@ nltk.download('wordnet')
26
  STOPWORDS = set(stopwords.words('english'))
27
  lemmatizer = WordNetLemmatizer()
28
 
29
- def extract_domain(url):
30
- domain = urlparse(url).netloc
31
- return domain
32
-
33
- def normalize_length(text, target_length=50):
34
- if len(text) < target_length:
35
- text = text + " " * (target_length - len(text))
36
  else:
37
- text = text[:target_length]
38
- return text
39
 
40
  def preprocess_url(url):
41
  url = url.lower()
42
  url = re.sub(r'https?://', '', url)
43
  url = re.sub(r'www\.', '', url)
44
- domain = extract_domain(url)
45
- url = re.sub(domain, '', url)
46
  url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
47
  url = re.sub(r'\s+', ' ', url).strip()
48
  url = normalize_length(url)
@@ -77,30 +70,34 @@ def preprocess_input(input_text, tokenizer, max_length):
77
  padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
78
  return padded_sequences
79
 
80
- def get_prediction(input_text):
81
- cleaned_url = preprocess_url(input_text)
82
- cleaned_html = preprocess_html(input_text)
83
-
84
- url_data = preprocess_input(cleaned_url, url_tokenizer, max_url_length)
85
- html_data = preprocess_input(cleaned_html, html_tokenizer, max_html_length)
86
-
87
- # Ensure the input shapes are correct
88
- url_data = np.expand_dims(url_data, axis=0) # Add batch dimension
89
- html_data = np.expand_dims(html_data, axis=0) # Add batch dimension
90
 
91
- prediction = model.predict([url_data, html_data])[0][0]
92
  return prediction
93
 
94
- def phishing_detection(input_text):
95
- prediction = get_prediction(input_text)
96
- if prediction > 0.5:
 
97
  return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
98
  else:
99
  return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"
100
 
101
  iface = gr.Interface(
102
  fn=phishing_detection,
103
- inputs=gr.components.Textbox(lines=5, placeholder="Enter URL or HTML code"),
 
 
 
104
  outputs=gr.components.Textbox(label="Phishing Detection Result"),
105
  title="Phishing Detection Model",
106
  description="Check if a URL or HTML is Phishing.",
 
8
  from nltk.stem import WordNetLemmatizer
9
  from tensorflow.keras.preprocessing.sequence import pad_sequences
10
  import re
 
11
 
12
  # Load the model
13
  model = tf.keras.models.load_model('new_phishing_detection_model.keras')
 
25
  STOPWORDS = set(stopwords.words('english'))
26
  lemmatizer = WordNetLemmatizer()
27
 
28
def normalize_length(url: str, target_length: int = 50) -> str:
    """Pad or truncate ``url`` to a fixed width.

    Args:
        url: Text to normalise.
        target_length: Width of the returned string (50 by default).

    Returns:
        ``url`` right-padded with spaces when it is shorter than
        ``target_length``; otherwise its first ``target_length`` characters.
    """
    # ``ljust`` leaves strings that already reach the target width untouched,
    # so the slice alone takes care of truncation.
    return url.ljust(target_length)[:target_length]
34
 
35
  def preprocess_url(url):
36
  url = url.lower()
37
  url = re.sub(r'https?://', '', url)
38
  url = re.sub(r'www\.', '', url)
 
 
39
  url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
40
  url = re.sub(r'\s+', ' ', url).strip()
41
  url = normalize_length(url)
 
70
  padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
71
  return padded_sequences
72
 
73
def get_prediction(input_text, input_type):
    """Score a single URL or HTML snippet with the loaded model.

    Args:
        input_text: Raw text supplied by the user.
        input_type: ``"URL"`` sends the text through the URL pipeline; any
            other value (including ``None``) is handled as HTML.

    Returns:
        The first element of the first row of ``model.predict``'s output,
        converted to a Python ``float``.
    """
    # The model is fed a two-element list ordered [url, html]; the branch that
    # is not in use receives an all-zero placeholder row.
    if input_type == "URL":
        cleaned_text = preprocess_url(input_text)
        url_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
        html_data = np.zeros((1, max_html_length))  # dummy HTML input
    else:
        cleaned_text = preprocess_html(input_text)
        html_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
        url_data = np.zeros((1, max_url_length))  # dummy URL input

    prediction = model.predict([url_data, html_data])[0][0]
    return float(prediction)
86
 
87
def phishing_detection(input_text, input_type):
    """Turn the model score for ``input_text`` into a user-facing verdict.

    Args:
        input_text: Raw URL or HTML text supplied by the user.
        input_type: Input kind selector, forwarded unchanged to
            ``get_prediction``.

    Returns:
        A warning message when the score is above the threshold, otherwise a
        "safe" message; both include the score formatted to two decimals.
    """
    prediction = get_prediction(input_text, input_type)
    threshold = 0.5  # scores strictly above this are reported as phishing
    if prediction > threshold:
        return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
    return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"
94
 
95
  iface = gr.Interface(
96
  fn=phishing_detection,
97
+ inputs=[
98
+ gr.components.Textbox(lines=5, placeholder="Enter URL or HTML code"),
99
+ gr.components.Radio(["URL", "HTML"], type="value", label="Input Type")
100
+ ],
101
  outputs=gr.components.Textbox(label="Phishing Detection Result"),
102
  title="Phishing Detection Model",
103
  description="Check if a URL or HTML is Phishing.",