"""Flask app that labels a submitted URL as "Malicious" or "Legitimate".

Features are extracted from the URL with ``extract_url_features`` and scored
by a CatBoost model stored in ``catboost_model.bin``.
"""
import asyncio
import functools
import os
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
from catboost import CatBoostClassifier  # Import CatBoost
from flask import Flask, request, render_template

# Ensure you have the appropriate feature extraction function
from url_process import extract_url_features

MODEL_FILENAME = "catboost_model.bin"


# Batch Processing: Ensures URLs are processed in manageable chunks
def process_urls_in_batches(urls, batch_size=10):
    """Yield consecutive slices of ``urls`` with at most ``batch_size`` items.

    Args:
        urls: A sliceable sequence (``len`` and slicing are used).
        batch_size: Maximum number of items per yielded slice.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration.
    """
    # A negative step would silently yield nothing and drop every URL;
    # zero would fail inside range() with an unrelated-looking message.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    for start in range(0, len(urls), batch_size):
        yield urls[start:start + batch_size]


async def async_extract_features(url):
    """Run ``extract_url_features(url)`` in a worker thread and await it.

    Keeps the event loop free while the extractor runs (presumably it blocks
    on DNS lookups / HTTP requests - verify against ``url_process``).
    """
    return await asyncio.to_thread(extract_url_features, url)


def extract_features_in_parallel(urls):
    """Extract features for every URL using a pool of 5 worker threads.

    Results are returned as a list in the same order as ``urls``. Threads
    only overlap time spent waiting on I/O; under the GIL they do not speed
    up CPU-bound work.
    """
    with ThreadPoolExecutor(max_workers=5) as executor:
        return list(executor.map(extract_url_features, urls))


@functools.lru_cache(maxsize=8)
def _load_model(model_path):
    """Load the CatBoost model at ``model_path``; cached per path.

    Failed loads are not cached (``lru_cache`` does not store exceptions),
    so a later call retries once the file becomes available.
    """
    print(f"Attempting to load model from: {model_path}")
    print(f"File exists: {os.path.exists(model_path)}")
    print(f"File size: {os.path.getsize(model_path)}")
    model = CatBoostClassifier()
    model.load_model(model_path)
    return model


# Load the CatBoost model for inference
def predict_with_catboost(features_df, model_path):
    """Return the model's predictions for ``features_df``.

    Args:
        features_df: Feature rows to score, passed straight to
            ``CatBoostClassifier.predict``.
        model_path: Path of the serialized CatBoost model file.

    Raises:
        Exception: Any error raised while loading the model (reported on
            stdout first) or while predicting is propagated to the caller.
    """
    # Only loading is wrapped, so prediction failures are no longer
    # misreported as "Error loading model".
    try:
        model = _load_model(model_path)
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise
    return model.predict(features_df)


def _find_model_path():
    """Return the first existing model location or raise FileNotFoundError."""
    candidates = (
        os.path.join(os.getcwd(), MODEL_FILENAME),
        "/app/catboost_model.bin",  # Docker container path
        MODEL_FILENAME,
    )
    for path in candidates:
        if os.path.exists(path):
            return path
    raise FileNotFoundError("Model file not found in any expected location")


# Flask App Setup
app = Flask(__name__)


@app.route("/", methods=["GET", "POST"])
async def index():
    """Render the form; on POST, classify the submitted URL.

    The template receives ``result`` ("Malicious", "Legitimate", an error
    message, or ``None`` on GET) and ``url_features`` (the extracted
    features on success, otherwise ``None``).
    """
    result = None
    url_features = None

    if request.method == "POST":
        # .get() avoids an automatic 400 when the field is missing; the
        # problem is reported on the page like any other failure.
        url = request.form.get("url", "").strip()
        if not url:
            result = "Error processing URL: no URL provided"
        else:
            try:
                # Asynchronously process the URL features
                features = await async_extract_features(url)

                # The model expects tabular input: one row per URL
                features_df = pd.DataFrame([features])

                predictions = predict_with_catboost(
                    features_df, _find_model_path()
                )

                # Determine if the URL is malicious or legitimate
                result = "Malicious" if predictions[0] == 1 else "Legitimate"

                # Optionally, display the extracted features
                url_features = features
            except Exception as e:
                # Top-level request boundary: keep the traceback in the
                # server log and show a short message to the user.
                app.logger.exception("Failed to process URL")
                result = f"Error processing URL: {str(e)}"

    return render_template(
        "index.html", result=result, url_features=url_features
    )


if __name__ == "__main__":
    app.run(debug=False, host="0.0.0.0", port=7860)