import streamlit as st
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import torch
import re

# Function to scrape a URL using Beautiful Soup
def scrape_url(url, timeout=10):
    """Fetch *url* and return its visible text as a single string.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on an unresponsive
            host, which would hang the whole Streamlit script run.

    Returns:
        The page's text fragments joined by single spaces, or ``None`` when
        the request or the text extraction fails (the failure is reported to
        the user through ``st.error``).
    """
    try:
        # A browser-like User-Agent is sent with the request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions handled below.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        text = ' '.join(soup.stripped_strings)
        return text
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts, invalid URLs and HTTP errors.
        st.error(f"Error scraping URL: {e}")
        return None
    except Exception as e:
        # UI boundary: surface any parsing failure instead of crashing the app.
        st.error(f"Error during text extraction {e}")
        return None


@st.cache_resource
def load_model():
    """Load the Gemma instruction-tuned checkpoint and its tokenizer.

    The function is decorated with ``st.cache_resource``.

    Returns:
        A ``(tokenizer, model)`` tuple for ``google/gemma-2-2b-it``.
    """
    checkpoint = "google/gemma-2-2b-it"
    # Keyword options forwarded unchanged to the model loader.
    model_options = {
        "torch_dtype": torch.bfloat16,
        "low_cpu_mem_usage": True,
        "device_map": "auto",
        "trust_remote_code": True,
    }
    loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    loaded_model = AutoModelForCausalLM.from_pretrained(checkpoint, **model_options)
    return loaded_tokenizer, loaded_model

# Load the tokenizer and model when the module is executed; they are used as
# module-level globals by generate_json_output below.
tokenizer, model = load_model()

def _extract_json_object(response):
    """Return the first JSON object embedded in *response*, or ``None``."""
    decoder = json.JSONDecoder()
    start = response.find("{")
    while start != -1:
        try:
            # raw_decode parses one complete JSON value starting at `start`
            # and ignores trailing text, so nested objects and '}' characters
            # inside string values are handled correctly.
            candidate, _ = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        # This '{' did not begin a valid object; try the next one.
        start = response.find("{", start + 1)
    return None


def generate_json_output(text, max_chars=3000):
    """Ask the model for the brand and intent of a page and parse its answer.

    Args:
        text: Text extracted from a web page (or typed in by the user).
        max_chars: Maximum number of characters of *text* placed in the
            prompt. The original author chose 3000 to avoid index out of
            bounds errors; adjust based on experimentation.

    Returns:
        A ``(response, output_json)`` tuple. ``response`` is the decoded text
        the model generated after the prompt. ``output_json`` is the parsed
        JSON object, or ``{"error": ...}`` when no JSON object could be
        parsed from the response.
    """
    truncated_text = text[:max_chars]

    prompt = f"""You are a web page text scanner. Your task is to carefully review text from a web page.

The following text is extracted from a web page.

"{truncated_text}"

Answer the following questions:
1) What brand does the page represent?
2) Summarize the intent of the page in one sentence. Do not leak PII data.

You should output your answers strictly in the following JSON format, but do NOT use markdown:
{{\"brand\": \"<brand>\", \"intent\": \"<intent>\"}}
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=256, return_dict_in_generate=True)
    # Drop the prompt tokens so only the newly generated continuation is decoded.
    generated_tokens = outputs.sequences[:, inputs.input_ids.shape[1]:]
    response = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

    output_json = _extract_json_object(response)
    if output_json is None:
        output_json = {"error": "JSON output could not be parsed from the model response"}

    return response, output_json

# Streamlit app
def _render_analysis(source_label, source_text, raw_response, output_json):
    """Display one analysis result: parsed fields, source text and raw output."""
    st.subheader("Analysis Results:")
    if "error" in output_json:
        st.error(output_json["error"])
        st.write("Could not produce a valid response from the model, please check the raw model output for debugging.")
    else:
        st.write(f"**Brand:** {output_json.get('brand', 'N/A')}")
        st.write(f"**Intent:** {output_json.get('intent', 'N/A')}")
    with st.expander(source_label):
        st.write(source_text)
    with st.expander("Show Raw JSON"):
        st.json(output_json)
    # The error message above points the user at the raw model output, so it
    # has to be visible somewhere on the page.
    with st.expander("Show Raw Model Output"):
        st.text(raw_response)


def main():
    """Render the page: pick an input method, run the model, show the result.

    The text to analyze comes either from scraping a URL or from a text area.
    Both paths share the same generation and rendering steps.
    """
    st.title("Google Brand and Intent Detection")
    st.write("Google's brand and intent detection reverse engineered from Chrome by [DEJAN AI](https://dejan.ai/).")

    input_method = st.radio("Input Method", ["URL", "Text Input"])

    if input_method == "URL":
        url = st.text_input("Enter URL to scan:")
        if not url:
            return
        with st.spinner("Scraping the URL..."):
            scraped_text = scrape_url(url)
        if scraped_text is None:
            st.error("Could not scrape the URL. Please check the URL and try again.")
            return
        if not scraped_text:
            # The request succeeded but the page had no text; say so instead
            # of leaving the page silently blank.
            st.warning("The page was fetched but contained no extractable text.")
            return
        source_label, source_text = "Show Scraped Text", scraped_text
    elif input_method == "Text Input":
        text_input = st.text_area("Enter text to analyze:")
        if not text_input:
            return
        source_label, source_text = "Show Input Text", text_input
    else:
        return

    with st.spinner("Generating Output..."):
        raw_response, output_json = generate_json_output(source_text)
    _render_analysis(source_label, source_text, raw_response, output_json)

# Launch the app only when this file is executed as the main module.
if __name__ == "__main__":
    main()