Update app.py
app.py
CHANGED
@@ -7,10 +7,6 @@ from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences
-from tensorflow.keras.preprocessing.text import Tokenizer
-from sklearn.preprocessing import LabelEncoder
-from sklearn.model_selection import train_test_split
-import pandas as pd
 import re
 
 # Load the model
@@ -56,36 +52,11 @@ max_url_length = 180
 max_html_length = 2000
 max_words = 10000
 
-# Load
-
-
-
-
-url_df['Cleaned_Data'] = url_df['Data'].apply(preprocess_url)
-
-# Clean HTML 'Data' Columns
-html_df['Cleaned_Data'] = html_df['Data'].apply(preprocess_html)
-
-# URL Tokenization and Padding
-url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
-url_tokenizer.fit_on_texts(url_df['Cleaned_Data'])
-url_sequences = url_tokenizer.texts_to_sequences(url_df['Cleaned_Data'])
-url_padded = pad_sequences(url_sequences, maxlen=max_url_length, padding='post', truncating='post')
-
-# HTML Tokenization and Padding
-html_tokenizer = Tokenizer(num_words=max_words)
-html_tokenizer.fit_on_texts(html_df['Cleaned_Data'])
-html_sequences = html_tokenizer.texts_to_sequences(html_df['Cleaned_Data'])
-html_padded = pad_sequences(html_sequences, maxlen=max_html_length, padding='post', truncating='post')
-
-# Encode 'Category' Column
-label_encoder = LabelEncoder()
-url_df['Category_Encoded'] = label_encoder.fit_transform(url_df['Category'])
-html_df['Category_Encoded'] = label_encoder.transform(html_df['Category'])
-
-# Split datasets into training and testing sets
-url_X_train, url_X_test, url_y_train, url_y_test = train_test_split(url_padded, url_df['Category_Encoded'], test_size=0.2, random_state=42)
-html_X_train, html_X_test, html_y_train, html_y_test = train_test_split(html_padded, html_df['Category_Encoded'], test_size=0.2, random_state=42)
+# Load tokenizers
+with open('url_tokenizer.pkl', 'rb') as f:
+    url_tokenizer = pickle.load(f)
+with open('html_tokenizer.pkl', 'rb') as f:
+    html_tokenizer = pickle.load(f)
 
 def preprocess_input(input_text, tokenizer, max_length):
     sequences = tokenizer.texts_to_sequences([input_text])
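This commit swaps the training-time preprocessing (tokenizer fitting, label encoding, train/test split) for two pre-fitted tokenizers unpickled at startup. A minimal sketch of how url_tokenizer.pkl and html_tokenizer.pkl could be produced offline is shown below; the script name, CSV paths, and the use of the raw 'Data' column are assumptions for illustration, and only the Tokenizer settings mirror the removed lines.

# offline_fit_tokenizers.py -- hypothetical helper script, not part of this commit
import pickle

import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer

max_words = 10000

# Hypothetical dataset paths; the real CSV files are not named in this diff.
# In app.py the 'Data' columns are first cleaned with preprocess_url/preprocess_html;
# that cleaning step is omitted here for brevity.
url_df = pd.read_csv('url_dataset.csv')
html_df = pd.read_csv('html_dataset.csv')

# Same Tokenizer settings as the removed training-time code:
# character-level for URLs, word-level for HTML content.
url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
url_tokenizer.fit_on_texts(url_df['Data'].astype(str))

html_tokenizer = Tokenizer(num_words=max_words)
html_tokenizer.fit_on_texts(html_df['Data'].astype(str))

# Serialize the fitted tokenizers so app.py can load them at startup
# instead of refitting them on every launch.
with open('url_tokenizer.pkl', 'wb') as f:
    pickle.dump(url_tokenizer, f)
with open('html_tokenizer.pkl', 'wb') as f:
    pickle.dump(html_tokenizer, f)

Loading pre-fitted tokenizers keeps the vocabulary used at inference identical to the one used during training, and means the app no longer needs the training CSVs or scikit-learn at runtime.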