Update app.py
app.py
CHANGED
@@ -7,10 +7,6 @@ from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences
-from tensorflow.keras.preprocessing.text import Tokenizer
-from sklearn.preprocessing import LabelEncoder
-from sklearn.model_selection import train_test_split
-import pandas as pd
 import re
 
 # Load the model
@@ -56,36 +52,11 @@ max_url_length = 180
 max_html_length = 2000
 max_words = 10000
 
-# Load
-
-
-
-
-url_df['Cleaned_Data'] = url_df['Data'].apply(preprocess_url)
-
-# Clean HTML 'Data' Columns
-html_df['Cleaned_Data'] = html_df['Data'].apply(preprocess_html)
-
-# URL Tokenization and Padding
-url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
-url_tokenizer.fit_on_texts(url_df['Cleaned_Data'])
-url_sequences = url_tokenizer.texts_to_sequences(url_df['Cleaned_Data'])
-url_padded = pad_sequences(url_sequences, maxlen=max_url_length, padding='post', truncating='post')
-
-# HTML Tokenization and Padding
-html_tokenizer = Tokenizer(num_words=max_words)
-html_tokenizer.fit_on_texts(html_df['Cleaned_Data'])
-html_sequences = html_tokenizer.texts_to_sequences(html_df['Cleaned_Data'])
-html_padded = pad_sequences(html_sequences, maxlen=max_html_length, padding='post', truncating='post')
-
-# Encode 'Category' Column
-label_encoder = LabelEncoder()
-url_df['Category_Encoded'] = label_encoder.fit_transform(url_df['Category'])
-html_df['Category_Encoded'] = label_encoder.transform(html_df['Category'])
-
-# Split datasets into training and testing sets
-url_X_train, url_X_test, url_y_train, url_y_test = train_test_split(url_padded, url_df['Category_Encoded'], test_size=0.2, random_state=42)
-html_X_train, html_X_test, html_y_train, html_y_test = train_test_split(html_padded, html_df['Category_Encoded'], test_size=0.2, random_state=42)
+# Load tokenizers
+with open('url_tokenizer.pkl', 'rb') as f:
+    url_tokenizer = pickle.load(f)
+with open('html_tokenizer.pkl', 'rb') as f:
+    html_tokenizer = pickle.load(f)
 
 def preprocess_input(input_text, tokenizer, max_length):
     sequences = tokenizer.texts_to_sequences([input_text])
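This commit swaps the training-time preprocessing (tokenizer fitting, label encoding, train/test split) for two pre-fitted tokenizers unpickled at startup. A minimal sketch of how url_tokenizer.pkl and html_tokenizer.pkl could be produced offline is shown below; the script name, CSV paths, and the use of the raw 'Data' column are assumptions for illustration, and only the Tokenizer settings mirror the removed lines.

# offline_fit_tokenizers.py -- hypothetical helper script, not part of this commit
import pickle

import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer

max_words = 10000

# Hypothetical dataset paths; the real CSV files are not named in this diff.
# In app.py the 'Data' columns are first cleaned with preprocess_url/preprocess_html;
# that cleaning step is omitted here for brevity.
url_df = pd.read_csv('url_dataset.csv')
html_df = pd.read_csv('html_dataset.csv')

# Same Tokenizer settings as the removed training-time code:
# character-level for URLs, word-level for HTML content.
url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
url_tokenizer.fit_on_texts(url_df['Data'].astype(str))

html_tokenizer = Tokenizer(num_words=max_words)
html_tokenizer.fit_on_texts(html_df['Data'].astype(str))

# Serialize the fitted tokenizers so app.py can load them at startup
# instead of refitting them on every launch.
with open('url_tokenizer.pkl', 'wb') as f:
    pickle.dump(url_tokenizer, f)
with open('html_tokenizer.pkl', 'wb') as f:
    pickle.dump(html_tokenizer, f)

Loading pre-fitted tokenizers keeps the vocabulary used at inference identical to the one used during training, and means the app no longer needs the training CSVs or scikit-learn at runtime.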