import json
import os
import socket
import ssl
from urllib.parse import urlparse

import requests
import tldextract
import whois
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from duckduckgo_search import DDGS
from requests.exceptions import HTTPError, Timeout, RequestException
from serpapi import GoogleSearch

load_dotenv()

from src import my_tools
from src import url_phase
from src import backlink_check


def verify_event_website(event_name, url):
    """Score how likely `url` is the official website for `event_name`."""
    score = 0
    details = {}

    # Normalize inputs
    event_name_lower = event_name.lower()
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except HTTPError as http_err:
        # `response` is always bound here: raise_for_status() is the only call
        # in the try block that can raise HTTPError.
        if response.status_code == 403:
            details = {"error": "Access forbidden: 403 Forbidden", "score": -10}
        elif response.status_code == 404:
            details = {"error": "Page not found: 404 Not Found", "score": -10}
        elif response.status_code == 500:
            details = {"error": "Server error: 500 Internal Server Error", "score": -10}
        else:
            details = {"error": f"HTTP error occurred: {http_err}", "score": -10}
        return details
    except Timeout as timeout_err:
        return {"error": f"Request timed out: {timeout_err}", "score": -10}
    except RequestException as req_err:
        return {"error": f"Request error occurred: {req_err}", "score": -10}
    except Exception as e:
        return {"error": f"An error occurred: {e}", "score": -10}

    # Domain check: does any word of the event name appear in the domain?
    domain_parts = tldextract.extract(url)
    domain = domain_parts.domain + '.' + domain_parts.suffix
    # Drop the stopword "de" (as in "Tour de France") as a whole token;
    # a plain str.replace("de", "") would also mangle words containing "de".
    name_parts = [p for p in event_name_lower.split() if p != "de"]
    if any(part in domain.lower() for part in name_parts):
        score += 1
    details["domain"] = domain

    # WHOIS: does the registrant/organization mention the event name?
    try:
        w = whois.whois(domain)
        if w and any(
            event_name_lower in str(v).lower()
            for v in [w.get('org'), w.get('name'), w.get('registrant_name')]
        ):
            score += 2
        details["whois_org"] = w.get('org')
    except Exception:
        details["whois_org"] = "N/A"

    # SSL certificate organization
    ssl_org = get_ssl_organization(domain)
    if ssl_org and event_name_lower in ssl_org.lower():
        score += 2
    details["ssl_org"] = ssl_org

    # Search ranking
    ranking = google_search_ranking(event_name, url)
    if ranking and ranking <= 10:  # The URL appears in the top 10 results
        score += 3  # A high rank boosts the score
    details["google_search_rank"] = ranking if ranking else "Not found in top 10"

    # Wikipedia backlink check: does the URL appear on the event's Wikipedia page?
    score += wikipedia_link_score(event_name, url)

    # Backlink check
    backlinks = backlink_check.verify_url(url)
    if backlinks:
        score += 1
    details["backlinks"] = backlinks

    # Final trust score
    details["event_name"] = event_name
    details["url"] = url
    details["score"] = score

    return details


def get_ssl_organization(domain):
    """Return the organizationName from the site's TLS certificate, or None."""
    try:
        ctx = ssl.create_default_context()
        with ctx.wrap_socket(socket.socket(), server_hostname=domain) as s:
            s.settimeout(5.0)
            s.connect((domain, 443))
            cert = s.getpeercert()
        # The certificate subject is a tuple of RDNs, each itself a tuple of
        # (field, value) pairs, so walk it rather than indexing blindly.
        for rdn in cert.get('subject', ()):
            for field, value in rdn:
                if field == 'organizationName':
                    return value
        return None
    except Exception:
        return None


def get_structured_data(soup):
    """Collect JSON-LD blocks whose @type mentions SportsEvent."""
    json_ld = soup.find_all("script", type="application/ld+json")
    structured = []
    for tag in json_ld:
        try:
            data = json.loads(tag.string)
            if isinstance(data, dict) and "SportsEvent" in str(data.get("@type", "")):
                structured.append(data)
        except (TypeError, ValueError):
            # tag.string may be None, or the block may be invalid JSON
            continue
    return structured
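
# For reference, get_structured_data() above would match a page that embeds a
# schema.org JSON-LD block like the following (a hypothetical snippet for
# illustration, not taken from any specific site):
#
#   <script type="application/ld+json">
#   {"@context": "https://schema.org",
#    "@type": "SportsEvent",
#    "name": "Tour de France",
#    "url": "https://www.letour.fr/en/"}
#   </script>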


def google_search_ranking(event_name, url):
    """
    Check the Google search ranking of a specific URL when searching for event_name.

    Args:
        event_name (str): The name of the event (e.g., "Tour de France").
        url (str): The official event URL (e.g., "https://www.letour.fr/en/").

    Returns:
        int: The ranking of the URL in the search results (1-based),
        or None if the URL does not appear in the returned results.
    """
    search_params = {
        "q": event_name,  # Search for the event name
        "api_key": os.getenv("SERPAPI_API_KEY") or os.getenv("SERPER_API_KEY"),
    }

    # Perform the search using SerpAPI
    search = GoogleSearch(search_params)
    search_results = search.get_dict()

    # Extract the domain from the provided URL, and from its redirect target
    parsed_url = urlparse(url)
    domain = parsed_url.netloc.lower()
    redirected_url = url_phase.check_redirection(url)
    redirected_domain = urlparse(redirected_url).netloc.lower()

    # Check whether either the original or redirected domain appears in the results
    for index, result_dic in enumerate(search_results.get('organic_results', [])):
        if 'link' in result_dic:
            result_url = url_phase.check_redirection(result_dic['link'])
            result_domain = urlparse(result_url).netloc.lower()
            if domain in result_domain or redirected_domain in result_domain:
                return index + 1  # Return 1-based index (ranking)
    return None  # URL not found in the top results


def get_wikipedia_external_links(url_wiki):
    """Collect official-website links from a Wikipedia article's infobox and
    "External links" section."""
    response = requests.get(url_wiki, timeout=10)
    if response.status_code != 200:
        print("Failed to fetch Wikipedia page")
        return {}
    soup = BeautifulSoup(response.text, 'html.parser')

    # Infobox "Website" row
    table_sections = []
    infobox = soup.find("table", class_="infobox")
    if infobox:
        for tr in infobox.find_all("tr"):
            for th in tr.find_all("th", class_="infobox-label"):
                # Match both "Website" and "Web site" label spellings
                label = th.get_text(strip=True).lower().replace(" ", "")
                if "website" in label and tr.a:
                    table_sections.append(url_phase.check_redirection(tr.a.get("href")))

    # "External links" section: official-website spans (the anchor may be
    # missing on some articles, so guard against None)
    ext_links = []
    external_heading = soup.find(id="External_links")
    if external_heading:
        for span in external_heading.find_all_next("span", class_="official-website"):
            for a in span.find_all("a", href=True):
                ext_links.append(url_phase.check_redirection(a.get("href")))

    return {"table_sections": table_sections, "external_links": ext_links}
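
# For orientation, get_wikipedia_external_links() returns a dict shaped like
# the following (illustrative values, assuming the Tour de France article and
# that url_phase.check_redirection resolves both links to the same address):
#
#   {"table_sections": ["https://www.letour.fr/en/"],
#    "external_links": ["https://www.letour.fr/en/"]}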
""" score = 0 table_freq = table_sections.count(url.lower()) score += table_freq * 3.0 frequency = external_links.count(url.lower()) # Count how many times the URL appears score += frequency * 1.0 # Weighted score by frequency and section return score def wikipedia_link_score(tour_name, url): data = None wiki_url = get_wikipedia_url(tour_name) if wiki_url: data = get_wikipedia_external_links(wiki_url) if not data: return 0.0 external_links = data.get("external_links", []) table_sections = data.get("table_sections", []) return calculate_weighted_score(external_links, table_sections, url.lower()) def extract_official_website(wiki_url: str) -> str | None: """Extract the official website from the Wikipedia page.""" response = requests.get(wiki_url, headers={"User-Agent": "Mozilla/5.0"}) if response.status_code != 200: return None soup = BeautifulSoup(response.text, "html.parser") # Search the infobox first for the official website infobox = soup.find("table", class_="infobox") if infobox: for a in infobox.find_all("a", href=True): if a["href"].startswith("http") and "official website" in a.get_text().lower(): return url_phase.check_redirection(a["href"]) # If not found in the infobox, search the whole page for links with "official website" for a in soup.find_all("a", href=True): if "official website" in a.get_text().lower() and a["href"].startswith("http"): return url_phase.check_redirection(a["href"]) # Fallback: Check for possible external links section external_links_section = soup.find("span", {"id": "External_links"}) if external_links_section: parent = external_links_section.find_parent("h2") if parent: for link in parent.find_next("ul").find_all("a", href=True): if "official website" in link.get_text().lower() and link["href"].startswith("http"): return url_phase.check_redirection(link["href"]) return None def get_wikipedia_url(query): """Search Google for the Wikipedia page of name.""" query = f"{query} site:wikipedia.org" ddgs = DDGS() for result in ddgs.text(query, max_results=5): # for result in search(query, pause=5): if "wikipedia.org" in result.get('href'): return result.get('href') return None def get_official_website(tour_name: str) -> str: """ Get the official website of a cycling tour. Args: tour_name: The name of the cycling tour. Returns: url of official website """ wiki_url = get_wikipedia_url(tour_name) if wiki_url: return extract_official_website(wiki_url) return "Not Found" # "Not Found" # Example use if __name__ == "__main__": print(google_search_ranking("Facebook", "https://www.facebook.com/")) # ver_dict = verify_event_website("Benelux Tour", "https://renewitour.com/nl/") # for k, v in ver_dict.items(): # print(f"{k}: {v}") # score = wikipedia_link_score("Tour de France", "https://www.letour.fr/en/") # print(score)