import streamlit as st
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import torch
import re


def scrape_url(url, timeout=10):
    """Fetch a web page and return its visible text as one string.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout an unresponsive host would block the app indefinitely.

    Returns:
        The page's stripped text fragments joined by single spaces, or
        ``None`` when the request or the text extraction fails (the failure
        is reported to the user through ``st.error``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return ' '.join(soup.stripped_strings)
    except requests.exceptions.RequestException as e:
        st.error(f"Error scraping URL: {e}")
        return None
    except Exception as e:
        # UI boundary: report unexpected parser failures instead of crashing.
        st.error(f"Error during text extraction {e}")
        return None


@st.cache_resource
def load_model():
    """Load the Gemma tokenizer and model, cached by Streamlit across reruns.

    Returns:
        A ``(tokenizer, model)`` tuple for "google/gemma-2-2b-it".
    """
    # NOTE(review): trust_remote_code=True lets the hub repository run its own
    # Python code at load time; confirm this model actually requires it.
    tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        "google/gemma-2-2b-it",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        device_map="auto",
        trust_remote_code=True,
    )
    return tokenizer, model


tokenizer, model = load_model()


def _extract_json(response):
    """Return the first JSON object embedded in *response*, or ``None``."""
    decoder = json.JSONDecoder()
    # Try each opening brace in turn. raw_decode parses one complete JSON
    # value starting there, so braces inside string values and nested objects
    # are handled correctly, unlike a "first '{' to first '}'" regex.
    for brace in re.finditer(r'\{', response):
        try:
            candidate, _ = decoder.raw_decode(response, brace.start())
        except json.JSONDecodeError:
            continue
        if isinstance(candidate, dict):
            return candidate
    return None


def generate_json_output(text, max_chars=3000):
    """Ask the model for the brand and intent of a page's text.

    Args:
        text: Text extracted from a web page (or typed in by the user).
        max_chars: Maximum number of leading characters of *text* placed in
            the prompt; adjust based on experimentation.

    Returns:
        A ``(response, output_json)`` tuple. ``response`` is the decoded text
        the model generated after the prompt. ``output_json`` is the parsed
        JSON object found in that text, or a dict with a single ``"error"``
        key when no JSON object could be parsed.
    """
    # Truncate text to a reasonable length to avoid index out of bounds errors
    truncated_text = text[:max_chars]

    prompt = f"""You are a web page text scanner. Your task is to carefully review text from a web page.

The following text is extracted from a web page.

"{truncated_text}"

Answer the following questions:
1) What brand does the page represent?
2) Summarize the intent of the page in one sentence.

Do not leak PII data. You should output your answers strictly in the following JSON format, but do NOT use markdown:

{{"brand": "", "intent": ""}}
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=256, return_dict_in_generate=True)

    # Drop the prompt tokens so only the newly generated text is decoded.
    generated_tokens = outputs.sequences[:, inputs.input_ids.shape[1]:]
    response = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

    output_json = _extract_json(response)
    if output_json is None:
        output_json = {"error": "JSON output could not be parsed from the model response"}

    return response, output_json


def _render_results(raw_response, output_json, source_label, source_text):
    """Show the analysis outcome plus expanders for the source text and JSON."""
    st.subheader("Analysis Results:")
    if "error" in output_json:
        st.error(output_json["error"])
        st.write("Could not produce a valid response from the model, please check the raw model output for debugging.")
        # The message above points at the raw output, so actually expose it.
        with st.expander("Show Raw Model Output"):
            st.text(raw_response)
    else:
        st.write(f"**Brand:** {output_json.get('brand', 'N/A')}")
        st.write(f"**Intent:** {output_json.get('intent', 'N/A')}")

    with st.expander(source_label):
        st.write(source_text)
    with st.expander("Show Raw JSON"):
        st.json(output_json)


# Streamlit app
def main():
    """Render the page: pick an input method, run the model, show results."""
    st.title("Google Brand and Intent Detection")
    st.write("Google's brand and intent detection reverse engineered from Chrome by [DEJAN AI](https://dejan.ai/).")

    input_method = st.radio("Input Method", ["URL", "Text Input"])

    if input_method == "URL":
        url = st.text_input("Enter URL to scan:")
        if not url:
            return

        with st.spinner("Scraping the URL..."):
            scraped_text = scrape_url(url)

        if scraped_text is None:
            st.error("Could not scrape the URL. Please check the URL and try again.")
            return
        if not scraped_text:
            # The page loaded but contained no text; say so rather than
            # leaving the UI silently empty.
            st.warning("No readable text was found at that URL.")
            return

        with st.spinner("Generating Output..."):
            raw_response, output_json = generate_json_output(scraped_text)
        _render_results(raw_response, output_json, "Show Scraped Text", scraped_text)

    elif input_method == "Text Input":
        text_input = st.text_area("Enter text to analyze:")
        if not text_input:
            return

        with st.spinner("Generating Output..."):
            raw_response, output_json = generate_json_output(text_input)
        _render_results(raw_response, output_json, "Show Input Text", text_input)


if __name__ == "__main__":
    main()