rmdhirr committed
Commit 8af0aaf
1 Parent(s): 8b45928

Update app.py

Files changed (1)
  1. app.py +54 -18
app.py CHANGED
@@ -7,6 +7,10 @@ from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import train_test_split
+import pandas as pd
 import re
 
 # Load the model
@@ -25,31 +29,63 @@ nltk.download('wordnet')
 STOPWORDS = set(stopwords.words('english'))
 lemmatizer = WordNetLemmatizer()
 
-def normalize_length(text, target_length):
-    text = text[:target_length].ljust(target_length)
-    return text
+def preprocess_url(url):
+    url = url.lower()
+    url = re.sub(r'https?://', '', url)
+    url = re.sub(r'www\.', '', url)
+    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
+    url = re.sub(r'\s+', ' ', url).strip()
+    tokens = word_tokenize(url)
+    tokens = [word for word in tokens if word not in STOPWORDS]
+    tokens = [lemmatizer.lemmatize(word) for word in tokens]
+    return ' '.join(tokens)
 
-def preprocess_text(text, is_url=True):
-    text = text.lower()
-    if is_url:
-        text = re.sub(r'https?://', '', text)
-        text = re.sub(r'www\.', '', text)
-    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
-    text = re.sub(r'\s+', ' ', text).strip()
-    tokens = word_tokenize(text)
+def preprocess_html(html):
+    html = re.sub(r'<[^>]+>', ' ', html)
+    html = html.lower()
+    html = re.sub(r'https?://', '', html)
+    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
+    html = re.sub(r'\s+', ' ', html).strip()
+    tokens = word_tokenize(html)
     tokens = [word for word in tokens if word not in STOPWORDS]
     tokens = [lemmatizer.lemmatize(word) for word in tokens]
     return ' '.join(tokens)
 
+# Define maximum lengths
 max_url_length = 180
 max_html_length = 2000
 max_words = 10000
 
-# Load tokenizers
-with open('url_tokenizer.pkl', 'rb') as f:
-    url_tokenizer = pickle.load(f)
-with open('html_tokenizer.pkl', 'rb') as f:
-    html_tokenizer = pickle.load(f)
+# Load datasets
+url_df = pd.read_csv('url_data.csv')
+html_df = pd.read_csv('html_data.csv')
+
+# Clean URL 'Data' Columns
+url_df['Cleaned_Data'] = url_df['Data'].apply(preprocess_url)
+
+# Clean HTML 'Data' Columns
+html_df['Cleaned_Data'] = html_df['Data'].apply(preprocess_html)
+
+# URL Tokenization and Padding
+url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
+url_tokenizer.fit_on_texts(url_df['Cleaned_Data'])
+url_sequences = url_tokenizer.texts_to_sequences(url_df['Cleaned_Data'])
+url_padded = pad_sequences(url_sequences, maxlen=max_url_length, padding='post', truncating='post')
+
+# HTML Tokenization and Padding
+html_tokenizer = Tokenizer(num_words=max_words)
+html_tokenizer.fit_on_texts(html_df['Cleaned_Data'])
+html_sequences = html_tokenizer.texts_to_sequences(html_df['Cleaned_Data'])
+html_padded = pad_sequences(html_sequences, maxlen=max_html_length, padding='post', truncating='post')
+
+# Encode 'Category' Column
+label_encoder = LabelEncoder()
+url_df['Category_Encoded'] = label_encoder.fit_transform(url_df['Category'])
+html_df['Category_Encoded'] = label_encoder.transform(html_df['Category'])
+
+# Split datasets into training and testing sets
+url_X_train, url_X_test, url_y_train, url_y_test = train_test_split(url_padded, url_df['Category_Encoded'], test_size=0.2, random_state=42)
+html_X_train, html_X_test, html_y_train, html_y_test = train_test_split(html_padded, html_df['Category_Encoded'], test_size=0.2, random_state=42)
 
 def preprocess_input(input_text, tokenizer, max_length):
     sequences = tokenizer.texts_to_sequences([input_text])
@@ -59,11 +95,11 @@ def preprocess_input(input_text, tokenizer, max_length):
 def get_prediction(input_text, input_type):
     is_url = input_type == "URL"
     if is_url:
-        cleaned_text = preprocess_text(input_text, is_url=True)
+        cleaned_text = preprocess_url(input_text)
         input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
         input_data = [input_data, np.zeros((1, max_html_length))] # dummy HTML input
     else:
-        cleaned_text = preprocess_text(input_text, is_url=False)
+        cleaned_text = preprocess_html(input_text)
         input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
         input_data = [np.zeros((1, max_url_length)), input_data] # dummy URL input
 
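
For context only, not part of this commit: a minimal sketch of how the URL branch of the updated pipeline could be sanity-checked from the Space repo. It assumes app.py's module-level objects (preprocess_url, url_tokenizer, max_url_length) are importable, that importing app runs its setup (NLTK downloads, CSV loading, tokenizer fitting), and the sample URL is made up.

# Hypothetical sanity check; every imported name comes from app.py as changed above.
from app import preprocess_url, url_tokenizer, max_url_length
from tensorflow.keras.preprocessing.sequence import pad_sequences

sample_url = 'https://www.example.com/account/verify?id=42'  # illustrative input
cleaned = preprocess_url(sample_url)  # lowercase, strip scheme/www, tokenize, drop stopwords, lemmatize
seqs = url_tokenizer.texts_to_sequences([cleaned])
padded = pad_sequences(seqs, maxlen=max_url_length, padding='post', truncating='post')
print(cleaned)
print(padded.shape)  # (1, 180): the padded URL input; get_prediction pairs it with a dummy HTML array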