import json
import os
import socket
import ssl
from urllib.parse import urlparse

import requests
import tldextract
import whois
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from duckduckgo_search import DDGS
from requests.exceptions import HTTPError, Timeout, RequestException
from serpapi import GoogleSearch

load_dotenv()

from src import my_tools
from src import url_phase
from src import backlink_check
def verify_event_website(event_name, url):
    score = 0
    details = {}
    # Normalize inputs
    event_name_lower = event_name.lower()
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except HTTPError as http_err:
        # raise_for_status() raised this, so `response` is bound here;
        # return early instead of scoring an unreachable page
        if response.status_code == 403:
            return {"error": "Access forbidden: 403 Forbidden", "score": -10}
        if response.status_code == 404:
            return {"error": "Page not found: 404 Not Found", "score": -10}
        if response.status_code == 500:
            return {"error": "Server error: 500 Internal Server Error", "score": -10}
        return {"error": f"HTTP error occurred: {http_err}", "score": -10}
    except Timeout as timeout_err:
        return {"error": f"Request timed out: {timeout_err}", "score": -10}
    except RequestException as req_err:
        return {"error": f"Request error occurred: {req_err}", "score": -10}
    except Exception as e:
        return {"error": f"An error occurred: {e}", "score": -10}

    # Domain check: drop the stopword "de" ("Tour de France") as a whole
    # token rather than stripping "de" out of every word
    domain_parts = tldextract.extract(url)
    domain = domain_parts.domain + '.' + domain_parts.suffix
    name_tokens = [part for part in event_name_lower.split() if part != "de"]
    if any(part in domain.lower() for part in name_tokens):
        score += 1
    details["domain"] = domain
    # WHOIS: does the registrant look like the event organiser?
    try:
        w = whois.whois(domain)
        if w and any(
            event_name_lower in str(v).lower()
            for v in [w.get('org'), w.get('name'), w.get('registrant_name')]
        ):
            score += 2
        details["whois_org"] = w.get('org')
    except Exception:
        details["whois_org"] = "N/A"
    # SSL certificate organisation
    ssl_org = get_ssl_organization(domain)
    if ssl_org and event_name_lower in ssl_org.lower():
        score += 2
    details["ssl_org"] = ssl_org

    # Google search ranking
    ranking = google_search_ranking(event_name, url)
    if ranking and ranking <= 10:  # The URL appears in the top 10 results
        score += 3  # A high rank boosts the score
    details["google_search_rank"] = ranking if ranking else "Not found in top 10"

    # Wikipedia link check: does the event's Wikipedia page link to this URL?
    score += wikipedia_link_score(event_name, url)

    # Backlink check
    backlinks = backlink_check.verify_url(url)
    if backlinks:
        score += 1
    details["backlinks"] = backlinks

    # Final trust score
    details["event_name"] = event_name
    details["url"] = url
    details["score"] = score
    return details
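
# For orientation: a successful run returns a dict shaped roughly like the
# sketch below. All values here are hypothetical, not captured output.
#   {
#       "domain": "letour.fr",
#       "whois_org": "Amaury Sport Organisation",
#       "ssl_org": "Amaury Sport Organisation",
#       "google_search_rank": 1,
#       "backlinks": [...],
#       "event_name": "Tour de France",
#       "url": "https://www.letour.fr/en/",
#       "score": 9.0,
#   }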
def get_ssl_organization(domain):
    """Return the organization name from the domain's TLS certificate, if any."""
    try:
        ctx = ssl.create_default_context()
        with ctx.wrap_socket(socket.socket(), server_hostname=domain) as s:
            s.settimeout(5.0)
            s.connect((domain, 443))
            cert = s.getpeercert()
        # The subject is a tuple of RDN tuples of (key, value) pairs;
        # scan it for the organizationName entry instead of indexing blindly
        for rdn in cert.get('subject', ()):
            for key, value in rdn:
                if key == 'organizationName':
                    return value
        return None
    except Exception:
        return None
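
# For reference, getpeercert() reports the subject as nested tuples, e.g.
# (values illustrative):
#   'subject': ((('countryName', 'FR'),),
#               (('organizationName', 'Example Org'),),
#               (('commonName', 'example.com'),))
# which is why the lookup above walks two levels before matching the key.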
def get_structured_data(soup):
    """Collect JSON-LD blocks whose @type mentions SportsEvent."""
    json_ld = soup.find_all("script", type="application/ld+json")
    structured = []
    for tag in json_ld:
        try:
            # tag.string can be None, so fall back to an empty string
            data = json.loads(tag.string or "")
            if isinstance(data, dict) and "SportsEvent" in str(data.get("@type", "")):
                structured.append(data)
        except json.JSONDecodeError:
            continue
    return structured
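
# A matching JSON-LD block would look something like this schema.org markup
# (abridged; the exact fields vary by site):
#   <script type="application/ld+json">
#   {"@context": "https://schema.org", "@type": "SportsEvent",
#    "name": "Tour de France", "url": "https://www.letour.fr/en/"}
#   </script>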
def google_search_ranking(event_name, url):
    """
    Check the Google search ranking of a specific URL when searching for event_name.

    Args:
        event_name (str): The name of the event (e.g., "Tour de France").
        url (str): The official event URL (e.g., "https://www.letour.fr/en/").

    Returns:
        int | None: The 1-based ranking of the URL in the search results,
        or None if it is not found.
    """
    search_params = {
        "q": event_name,  # Search for the event name
        "api_key": os.getenv("SERPAPI_API_KEY") or os.getenv("SERPER_API_KEY"),
    }
    # Perform the search using SerpAPI
    search = GoogleSearch(search_params)
    search_results = search.get_dict()

    # Extract the domain from the provided URL, following redirects
    parsed_url = urlparse(url)
    domain = parsed_url.netloc.lower()
    redirected_url = url_phase.check_redirection(url)
    redirected_domain = urlparse(redirected_url).netloc.lower()

    # Check whether the original or redirected domain appears in the results
    for index, result_dic in enumerate(search_results.get('organic_results', [])):
        if 'link' in result_dic:
            result_url = url_phase.check_redirection(result_dic['link'])
            result_domain = urlparse(result_url).netloc.lower()
            if domain in result_domain or redirected_domain in result_domain:
                return index + 1  # 1-based ranking
    return None  # URL not found in the results
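
# The SerpAPI response is a plain dict; the portion used above looks roughly
# like this (abridged, values hypothetical):
#   {"organic_results": [
#       {"position": 1, "link": "https://www.letour.fr/en/", "title": "..."},
#       {"position": 2, "link": "https://en.wikipedia.org/wiki/Tour_de_France"},
#   ]}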
def get_wikipedia_external_links(url_wiki):
    """Collect the infobox website link and official-website external links."""
    try:
        response = requests.get(url_wiki, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        response.raise_for_status()
    except RequestException:
        print("Failed to fetch Wikipedia page")
        return {}
    soup = BeautifulSoup(response.text, 'html.parser')

    # Infobox "Website" row, if present
    table_sections = []
    infobox = soup.find("table", class_="infobox")
    if infobox:
        for tr in infobox.find_all("tr"):
            for th in tr.find_all("th", class_="infobox-label"):
                # The label may be rendered as "Website" or "Web site"
                if "website" in th.get_text().lower().replace(" ", "") and tr.a:
                    table_sections.append(url_phase.check_redirection(tr.a.get("href")))

    # "External links" section: official-website spans
    ext_links = []
    external_heading = soup.find(id="External_links")
    if external_heading:
        for ext_span in external_heading.find_all_next("span", class_="official-website"):
            for a in ext_span.find_all("a", href=True):
                ext_links.append(url_phase.check_redirection(a.get("href")))
    return {"table_sections": table_sections, "external_links": ext_links}
def calculate_weighted_score(external_links, table_sections, url):
    """
    Calculate a weighted score for the URL based on where it appears on the page.

    Args:
        external_links (list): External links found on the Wikipedia page.
        table_sections (list): Website links found in the article's infobox.
        url (str): The URL to check for.

    Returns:
        float: A weighted score based on occurrence and section importance.
    """
    score = 0
    # An infobox "Website" entry is the strongest signal, so weight it 3x;
    # lowercase both sides so the comparison is case-insensitive
    table_freq = [link.lower() for link in table_sections].count(url.lower())
    score += table_freq * 3.0
    # Each appearance among the external links adds a smaller weight
    frequency = [link.lower() for link in external_links].count(url.lower())
    score += frequency * 1.0
    return score
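
# Worked example (hypothetical inputs): if the URL appears once in the infobox
# and twice among the external links, the score is 1 * 3.0 + 2 * 1.0 = 5.0.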
def wikipedia_link_score(tour_name, url):
    data = None
    wiki_url = get_wikipedia_url(tour_name)
    if wiki_url:
        data = get_wikipedia_external_links(wiki_url)
    if not data:
        return 0.0
    external_links = data.get("external_links", [])
    table_sections = data.get("table_sections", [])
    return calculate_weighted_score(external_links, table_sections, url.lower())
def extract_official_website(wiki_url: str) -> str | None:
    """Extract the official website from the Wikipedia page."""
    response = requests.get(wiki_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    if response.status_code != 200:
        return None
    soup = BeautifulSoup(response.text, "html.parser")

    # Search the infobox first for the official website
    infobox = soup.find("table", class_="infobox")
    if infobox:
        for a in infobox.find_all("a", href=True):
            if a["href"].startswith("http") and "official website" in a.get_text().lower():
                return url_phase.check_redirection(a["href"])

    # If not found in the infobox, search the whole page for "official website" links
    for a in soup.find_all("a", href=True):
        if "official website" in a.get_text().lower() and a["href"].startswith("http"):
            return url_phase.check_redirection(a["href"])

    # Fallback: check the "External links" section, guarding against missing nodes
    external_links_section = soup.find("span", {"id": "External_links"})
    if external_links_section:
        parent = external_links_section.find_parent("h2")
        if parent:
            link_list = parent.find_next("ul")
            if link_list:
                for link in link_list.find_all("a", href=True):
                    if "official website" in link.get_text().lower() and link["href"].startswith("http"):
                        return url_phase.check_redirection(link["href"])
    return None
def get_wikipedia_url(query):
    """Search DuckDuckGo for the Wikipedia page of the given name."""
    query = f"{query} site:wikipedia.org"
    ddgs = DDGS()
    for result in ddgs.text(query, max_results=5):
        if "wikipedia.org" in (result.get('href') or ""):
            return result.get('href')
    return None
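
# Each DDGS text result is a dict shaped roughly like (values illustrative):
#   {"title": "Tour de France - Wikipedia",
#    "href": "https://en.wikipedia.org/wiki/Tour_de_France",
#    "body": "..."}
# so matching on 'href' is enough to pick out the Wikipedia page.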
def get_official_website(tour_name: str) -> str:
    """
    Get the official website of a cycling tour.

    Args:
        tour_name: The name of the cycling tour.

    Returns:
        The URL of the official website, or "Not Found".
    """
    wiki_url = get_wikipedia_url(tour_name)
    if wiki_url:
        official = extract_official_website(wiki_url)
        if official:
            return official
    return "Not Found"
# Example use
if __name__ == "__main__":
    print(google_search_ranking("Facebook", "https://www.facebook.com/"))
    # ver_dict = verify_event_website("Benelux Tour", "https://renewitour.com/nl/")
    # for k, v in ver_dict.items():
    #     print(f"{k}: {v}")
    # score = wikipedia_link_score("Tour de France", "https://www.letour.fr/en/")
    # print(score)