rmdhirr committed on
Commit 1a416ed
Parent: 8af0aaf

Update app.py

Files changed (1):
  1. app.py (+5, -34)
app.py CHANGED
@@ -7,10 +7,6 @@ from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences
-from tensorflow.keras.preprocessing.text import Tokenizer
-from sklearn.preprocessing import LabelEncoder
-from sklearn.model_selection import train_test_split
-import pandas as pd
 import re
 
 # Load the model
@@ -56,36 +52,11 @@ max_url_length = 180
 max_html_length = 2000
 max_words = 10000
 
-# Load datasets
-url_df = pd.read_csv('url_data.csv')
-html_df = pd.read_csv('html_data.csv')
-
-# Clean URL 'Data' Columns
-url_df['Cleaned_Data'] = url_df['Data'].apply(preprocess_url)
-
-# Clean HTML 'Data' Columns
-html_df['Cleaned_Data'] = html_df['Data'].apply(preprocess_html)
-
-# URL Tokenization and Padding
-url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
-url_tokenizer.fit_on_texts(url_df['Cleaned_Data'])
-url_sequences = url_tokenizer.texts_to_sequences(url_df['Cleaned_Data'])
-url_padded = pad_sequences(url_sequences, maxlen=max_url_length, padding='post', truncating='post')
-
-# HTML Tokenization and Padding
-html_tokenizer = Tokenizer(num_words=max_words)
-html_tokenizer.fit_on_texts(html_df['Cleaned_Data'])
-html_sequences = html_tokenizer.texts_to_sequences(html_df['Cleaned_Data'])
-html_padded = pad_sequences(html_sequences, maxlen=max_html_length, padding='post', truncating='post')
-
-# Encode 'Category' Column
-label_encoder = LabelEncoder()
-url_df['Category_Encoded'] = label_encoder.fit_transform(url_df['Category'])
-html_df['Category_Encoded'] = label_encoder.transform(html_df['Category'])
-
-# Split datasets into training and testing sets
-url_X_train, url_X_test, url_y_train, url_y_test = train_test_split(url_padded, url_df['Category_Encoded'], test_size=0.2, random_state=42)
-html_X_train, html_X_test, html_y_train, html_y_test = train_test_split(html_padded, html_df['Category_Encoded'], test_size=0.2, random_state=42)
+# Load tokenizers
+with open('url_tokenizer.pkl', 'rb') as f:
+    url_tokenizer = pickle.load(f)
+with open('html_tokenizer.pkl', 'rb') as f:
+    html_tokenizer = pickle.load(f)
 
 def preprocess_input(input_text, tokenizer, max_length):
     sequences = tokenizer.texts_to_sequences([input_text])
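
The new startup code assumes url_tokenizer.pkl and html_tokenizer.pkl already exist next to app.py (and that pickle is imported elsewhere in the file, since this hunk does not add that import). Below is a minimal sketch of a one-off script that could produce those files, reusing the fitting logic this commit removes from app.py; the CSV filenames, the preprocess_url/preprocess_html helpers, and max_words come from that removed code, while the script name and the import of the helpers from app.py are assumptions.

# save_tokenizers.py (hypothetical name): fit and serialize the two tokenizers
# that app.py now loads at startup instead of rebuilding from the CSVs.
import pickle

import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer

# Assumption: the cleaning helpers can be imported from app.py, where they are defined.
from app import preprocess_url, preprocess_html

max_words = 10000  # same vocabulary limit as in app.py

# Fit the character-level URL tokenizer on the cleaned training URLs.
url_df = pd.read_csv('url_data.csv')
url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
url_tokenizer.fit_on_texts(url_df['Data'].apply(preprocess_url))

# Fit the word-level HTML tokenizer on the cleaned training HTML.
html_df = pd.read_csv('html_data.csv')
html_tokenizer = Tokenizer(num_words=max_words)
html_tokenizer.fit_on_texts(html_df['Data'].apply(preprocess_html))

# Serialize both tokenizers under the filenames app.py expects.
with open('url_tokenizer.pkl', 'wb') as f:
    pickle.dump(url_tokenizer, f)
with open('html_tokenizer.pkl', 'wb') as f:
    pickle.dump(html_tokenizer, f)

Serializing the fitted Tokenizer objects once, offline, keeps app.py's startup independent of the training CSVs and of scikit-learn/pandas, which is consistent with the imports and dataset-loading code this commit drops.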