web-phishing-detection

App Files Community

rmdhirr committed on Jun 16

Commit

37db18f

•

1 Parent(s): 37ab448

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -27

app.py CHANGED Viewed

@@ -8,7 +8,6 @@ from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 import re
-from urllib.parse import urlparse
 # Load the model
 model = tf.keras.models.load_model('new_phishing_detection_model.keras')
@@ -26,23 +25,17 @@ nltk.download('wordnet')
 STOPWORDS = set(stopwords.words('english'))
 lemmatizer = WordNetLemmatizer()
-def extract_domain(url):
-    domain = urlparse(url).netloc
-    return domain
-def normalize_length(text, target_length=50):
-    if len(text) < target_length:
-        text = text + " " * (target_length - len(text))
     else:
-        text = text[:target_length]
-    return text
 def preprocess_url(url):
     url = url.lower()
     url = re.sub(r'https?://', '', url)
     url = re.sub(r'www\.', '', url)
-    domain = extract_domain(url)
-    url = re.sub(domain, '', url)
     url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
     url = re.sub(r'\s+', ' ', url).strip()
     url = normalize_length(url)
@@ -77,30 +70,34 @@ def preprocess_input(input_text, tokenizer, max_length):
     padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
     return padded_sequences
-def get_prediction(input_text):
-    cleaned_url = preprocess_url(input_text)
-    cleaned_html = preprocess_html(input_text)
-    url_data = preprocess_input(cleaned_url, url_tokenizer, max_url_length)
-    html_data = preprocess_input(cleaned_html, html_tokenizer, max_html_length)
-    # Ensure the input shapes are correct
-    url_data = np.expand_dims(url_data, axis=0)  # Add batch dimension
-    html_data = np.expand_dims(html_data, axis=0)  # Add batch dimension
-    prediction = model.predict([url_data, html_data])[0][0]
     return prediction
-def phishing_detection(input_text):
-    prediction = get_prediction(input_text)
-    if prediction > 0.5:
         return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
     else:
         return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"
 iface = gr.Interface(
     fn=phishing_detection,
-    inputs=gr.components.Textbox(lines=5, placeholder="Enter URL or HTML code"),
     outputs=gr.components.Textbox(label="Phishing Detection Result"),
     title="Phishing Detection Model",
     description="Check if a URL or HTML is Phishing.",

 from nltk.stem import WordNetLemmatizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 import re
 # Load the model
 model = tf.keras.models.load_model('new_phishing_detection_model.keras')
 STOPWORDS = set(stopwords.words('english'))
 lemmatizer = WordNetLemmatizer()
+def normalize_length(url, target_length=50):
+    if len(url) < target_length:
+        url = url + " " * (target_length - len(url))
     else:
+        url = url[:target_length]
+    return url
 def preprocess_url(url):
     url = url.lower()
     url = re.sub(r'https?://', '', url)
     url = re.sub(r'www\.', '', url)
     url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
     url = re.sub(r'\s+', ' ', url).strip()
     url = normalize_length(url)
     padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
     return padded_sequences
+def get_prediction(input_text, input_type):
+    is_url = input_type == "URL"
+    if is_url:
+        cleaned_text = preprocess_url(input_text)
+        input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
+        input_data = [input_data, np.zeros((1, max_html_length))]  # dummy HTML input
+    else:
+        cleaned_text = preprocess_html(input_text)
+        input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
+        input_data = [np.zeros((1, max_url_length)), input_data]  # dummy URL input
+    prediction = model.predict(input_data)[0][0]
     return prediction
+def phishing_detection(input_text, input_type):
+    prediction = get_prediction(input_text, input_type)
+    threshold = 0.5  # Adjusted threshold
+    if prediction > threshold:
         return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
     else:
         return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"
 iface = gr.Interface(
     fn=phishing_detection,
+    inputs=[
+        gr.components.Textbox(lines=5, placeholder="Enter URL or HTML code"),
+        gr.components.Radio(["URL", "HTML"], type="value", label="Input Type")
+    ],
     outputs=gr.components.Textbox(label="Phishing Detection Result"),
     title="Phishing Detection Model",
     description="Check if a URL or HTML is Phishing.",