import gradio as gr
import tensorflow as tf
import numpy as np
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import re

# Restore the trained phishing classifier from its saved Keras archive
model = tf.keras.models.load_model('new_phishing_detection_model.keras')

# Attach an Adam optimizer, binary cross-entropy loss, and
# accuracy / precision / recall metrics to the restored model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ],
)

# NLTK resources needed by the preprocessing functions.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases (3.9+); 'punkt' is still fetched so older releases keep working.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# English stopword set and lemmatizer shared by the preprocessing functions
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_url(url):
    """Normalise a raw URL into a space-separated string of lemmatised tokens.

    The URL is lower-cased, every ``http://`` / ``https://`` and ``www.``
    occurrence is dropped, and each non-alphanumeric character is turned into
    a space. The result is tokenised, English stopwords are filtered out, and
    the surviving tokens are lemmatised and re-joined with single spaces.
    """
    text = url.lower()
    # Order matters: scheme, "www.", symbols -> spaces, then collapse whitespace
    substitutions = (
        (r'https?://', ''),
        (r'www\.', ''),
        (r'[^a-zA-Z0-9]', ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    kept = (tok for tok in word_tokenize(text.strip()) if tok not in STOPWORDS)
    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept)

def preprocess_html(html):
    """Normalise raw HTML into a space-separated string of lemmatised tokens.

    Tags (anything between ``<`` and ``>``) are replaced by spaces first, then
    the text is lower-cased, ``http://`` / ``https://`` occurrences are
    dropped, and each non-alphanumeric character is turned into a space. The
    result is tokenised, English stopwords are filtered out, and the
    surviving tokens are lemmatised and re-joined with single spaces.
    """
    without_tags = re.sub(r'<[^>]+>', ' ', html)
    lowered = without_tags.lower()
    without_scheme = re.sub(r'https?://', '', lowered)
    alnum_only = re.sub(r'[^a-zA-Z0-9]', ' ', without_scheme)
    collapsed = re.sub(r'\s+', ' ', alnum_only).strip()

    words = []
    for token in word_tokenize(collapsed):
        if token in STOPWORDS:
            continue
        words.append(lemmatizer.lemmatize(token))
    return ' '.join(words)

# Padding lengths for the URL / HTML sequences and the tokenizer vocabulary cap
max_url_length = 180
max_html_length = 2000
max_words = 10000

# Read each dataset and add a 'Cleaned_Data' column holding the
# preprocessed form of its 'Data' column
url_df = pd.read_csv('url_data.csv').assign(
    Cleaned_Data=lambda frame: frame['Data'].apply(preprocess_url)
)
html_df = pd.read_csv('html_data.csv').assign(
    Cleaned_Data=lambda frame: frame['Data'].apply(preprocess_html)
)

def _fit_and_pad(tokenizer, texts, maxlen):
    """Fit `tokenizer` on `texts`; return (sequences, post-padded/truncated array)."""
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')
    return sequences, padded


# URLs are tokenized character by character
url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
url_sequences, url_padded = _fit_and_pad(
    url_tokenizer, url_df['Cleaned_Data'], max_url_length
)

# HTML uses the tokenizer's default (non character-level) mode
html_tokenizer = Tokenizer(num_words=max_words)
html_sequences, html_padded = _fit_and_pad(
    html_tokenizer, html_df['Cleaned_Data'], max_html_length
)

# Encode the 'Category' labels as integers. The encoder is fitted on the URL
# dataset and the same fitted mapping is then applied to the HTML dataset.
label_encoder = LabelEncoder()
url_df['Category_Encoded'] = label_encoder.fit_transform(url_df['Category'])
html_df['Category_Encoded'] = label_encoder.transform(html_df['Category'])


def _holdout_split(features, labels):
    """Split into train/test parts: 20% test, fixed random_state of 42."""
    return train_test_split(features, labels, test_size=0.2, random_state=42)


# Train/test partitions for each dataset
url_X_train, url_X_test, url_y_train, url_y_test = _holdout_split(
    url_padded, url_df['Category_Encoded']
)
html_X_train, html_X_test, html_y_train, html_y_test = _holdout_split(
    html_padded, html_df['Category_Encoded']
)

def preprocess_input(input_text, tokenizer, max_length):
    """Convert one already-cleaned text into a padded batch of size one.

    `input_text` is wrapped in a single-element list, mapped to an integer
    sequence by `tokenizer`, then post-padded / post-truncated to
    `max_length`.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences([input_text]),
        maxlen=max_length,
        padding='post',
        truncating='post',
    )

def get_prediction(input_text, input_type):
    """Return the model's score for a single URL or HTML sample.

    The model is called with a two-element input list ``[url, html]``. The
    branch matching `input_type` receives the preprocessed sample and the
    other branch receives an all-zero placeholder of the matching length.

    Args:
        input_text: Raw URL or raw HTML source to score.
        input_type: Either ``"URL"`` or ``"HTML"``.

    Returns:
        The first element of the first row of ``model.predict``'s output.

    Raises:
        ValueError: If `input_type` is neither ``"URL"`` nor ``"HTML"``.
            Unrecognised values (including ``None``) used to be scored
            silently as HTML.
    """
    if input_type == "URL":
        cleaned_text = preprocess_url(input_text)
        url_part = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
        html_part = np.zeros((1, max_html_length))  # dummy HTML input
    elif input_type == "HTML":
        cleaned_text = preprocess_html(input_text)
        html_part = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
        url_part = np.zeros((1, max_url_length))  # dummy URL input
    else:
        raise ValueError(
            f"input_type must be 'URL' or 'HTML', got {input_type!r}"
        )

    prediction = model.predict([url_part, html_part])[0][0]
    return prediction

def ensemble_prediction(input_text, input_type, n_ensemble=5):
    """Average `n_ensemble` calls to ``get_prediction`` on the same input.

    Args:
        input_text: Raw URL or HTML source, forwarded to ``get_prediction``.
        input_type: Input kind, forwarded to ``get_prediction``.
        n_ensemble: Number of prediction passes to average; must be >= 1.

    Returns:
        The mean of the individual predictions.

    Raises:
        ValueError: If `n_ensemble` is less than 1. Averaging zero
            predictions would otherwise yield NaN with only a warning.
    """
    if n_ensemble < 1:
        raise ValueError(f"n_ensemble must be at least 1, got {n_ensemble}")

    # NOTE(review): every pass scores the identical input; unless the model is
    # stochastic at inference time the passes will agree - confirm whether
    # more than one pass is actually needed.
    predictions = [get_prediction(input_text, input_type) for _ in range(n_ensemble)]
    avg_prediction = np.mean(predictions)
    return avg_prediction

def phishing_detection(input_text, input_type):
    """Produce the user-facing verdict string for the Gradio interface.

    Args:
        input_text: Text entered by the user (a URL or HTML source).
        input_type: Selected input kind, ``"URL"`` or ``"HTML"``.

    Returns:
        A prompt asking for the missing field when the text is blank or no
        valid type is selected. Otherwise a "Warning" message when the
        averaged score is above 0.5 and a "Safe" message when it is not, each
        including the score to two decimals.
    """
    # Blank text would be scored as pure padding and an unselected type would
    # be guessed, so ask the user to complete the form instead.
    if input_text is None or not input_text.strip():
        return "Please enter a URL or HTML code to check."
    if input_type not in ("URL", "HTML"):
        return "Please select an input type (URL or HTML)."

    prediction = ensemble_prediction(input_text, input_type)
    threshold = 0.5  # Keep the threshold unchanged
    if prediction > threshold:
        return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
    return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"

# Web UI: a text box plus an input-type selector feed phishing_detection,
# whose verdict string is shown in a single output text box.
iface = gr.Interface(
    fn=phishing_detection,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter URL or HTML code"),
        gr.Radio(["URL", "HTML"], type="value", label="Input Type"),
    ],
    outputs=gr.Textbox(label="Phishing Detection Result"),
    title="Phishing Detection Model",
    description="Check if a URL or HTML is Phishing.",
    theme="default",
)

# Start serving the interface
iface.launch()