# OfficialURLFinder/src/url_function.py
import json
import os
import socket
import ssl
from urllib.parse import urlparse
import requests
import tldextract
import whois
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from duckduckgo_search import DDGS
from requests.exceptions import HTTPError, Timeout, RequestException
from serpapi import GoogleSearch
load_dotenv()  # Load .env before importing project modules that may read env vars at import time
from src import my_tools
from src import url_phase
from src import backlink_check
def verify_event_website(event_name, url):
score = 0
details = {}
# Normalize inputs
event_name_lower = event_name.lower()
headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except HTTPError as http_err:
        status = getattr(http_err.response, "status_code", None)
        if status == 403:
            details["error"] = "Access forbidden: 403 Forbidden"
        elif status == 404:
            details["error"] = "Page not found: 404 Not Found"
        elif status == 500:
            details["error"] = "Server error: 500 Internal Server Error"
        else:
            details["error"] = f"HTTP error occurred: {http_err}"
        score -= 10  # Penalize unreachable/error pages so the penalty reaches the final score
    except Timeout as timeout_err:
        details["error"] = f"Request timed out: {timeout_err}"
        score -= 10
    except RequestException as req_err:
        details["error"] = f"Request error occurred: {req_err}"
        score -= 10
    except Exception as e:
        details["error"] = f"An error occurred: {e}"
        score -= 10
    domain_parts = tldextract.extract(url)
    domain = domain_parts.domain + '.' + domain_parts.suffix
    # Compare the significant words of the event name (dropping the stopword "de") with the domain
    name_tokens = [part for part in event_name_lower.split() if part != "de"]
    if any(part in domain.lower() for part in name_tokens):
        score += 1
    details["domain"] = domain
# WHOIS
try:
w = whois.whois(domain)
if w and any(
event_name_lower in str(v).lower() for v in [w.get('org'), w.get('name'), w.get('registrant_name')]
):
score += 2
details["whois_org"] = w.get('org')
    except Exception:
        details["whois_org"] = "N/A"
# SSL Cert Org
ssl_org = get_ssl_organization(domain)
if ssl_org and event_name_lower in ssl_org.lower():
score += 2
details["ssl_org"] = ssl_org
ranking = google_search_ranking(event_name, url)
if ranking and ranking <= 10: # If the URL appears in the top 10 results
score += 3 # High rank boosts the score
details["google_search_rank"] = ranking if ranking else "Not found in top 10"
    # Wikipedia backlink check: boost the score if the URL appears on the event's Wikipedia page
    score += wikipedia_link_score(event_name, url)
# Backlink Check
backlinks = backlink_check.verify_url(url)
if backlinks:
score += 1
details["backlinks"] = backlinks
# Final trust score
details["event_name"] = event_name
details["url"] = url
details["score"] = score
return details
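
# Illustrative only (not output from any specific run): the details dict returned by
# verify_event_website() is expected to look roughly like
#   {"domain": "example.org", "whois_org": "Example Org", "ssl_org": "Example Org",
#    "google_search_rank": 1, "backlinks": [...], "event_name": "Example Tour",
#    "url": "https://www.example.org/", "score": 7}
# with individual keys absent whenever the corresponding check fails or is skipped.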
def get_ssl_organization(domain):
    """Return the organizationName from the site's SSL certificate subject, or None."""
    try:
        ctx = ssl.create_default_context()
        with ctx.wrap_socket(socket.socket(), server_hostname=domain) as s:
            s.settimeout(5.0)
            s.connect((domain, 443))
            cert = s.getpeercert()
            # The subject comes back as nested RDN tuples; flatten into a dict
            # keyed by field name before looking up the organization.
            subject = dict(item for rdn in cert.get('subject', ()) for item in rdn)
            return subject.get('organizationName')
    except Exception:
        return None
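
# For reference, ssl.SSLSocket.getpeercert() reports the certificate subject as nested RDN
# tuples, e.g. ((('countryName', 'XX'),), (('organizationName', 'Example Org'),)), which is
# why get_ssl_organization() flattens them into a dict before reading 'organizationName'.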
def get_structured_data(soup):
    """Collect JSON-LD blocks whose @type mentions SportsEvent."""
    json_ld = soup.find_all("script", type="application/ld+json")
    structured = []
    for tag in json_ld:
        try:
            data = json.loads(tag.string or "")
            if isinstance(data, dict) and "SportsEvent" in str(data.get("@type", "")):
                structured.append(data)
        except (json.JSONDecodeError, TypeError):
            continue
    return structured
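
# A minimal, made-up JSON-LD block that get_structured_data() would collect:
# <script type="application/ld+json">
#   {"@context": "https://schema.org", "@type": "SportsEvent",
#    "name": "Example Cycling Tour", "url": "https://www.example.org/"}
# </script>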
def google_search_ranking(event_name, url):
"""
This function checks the Google search ranking of a specific URL when searching for the event_name.
Args:
event_name (str): The name of the event (e.g., "Tour de France").
url (str): The official event URL (e.g., "https://www.letour.fr/en/").
    Returns:
        int | None: The 1-based rank of the URL in the organic results, or None if it is not found.
    """
search_params = {
"q": event_name, # Search for the event name
"api_key": os.getenv("SERPAPI_API_KEY") or os.getenv("SERPER_API_KEY"),
}
# Perform the search using SerpAPI
search = GoogleSearch(search_params)
search_results = search.get_dict()
# Extract the domain from the provided URL
parsed_url = urlparse(url)
domain = parsed_url.netloc.lower()
redirected_url = url_phase.check_redirection(url)
redirected_domain = urlparse(redirected_url).netloc.lower()
# Check if either the original or redirected domain appears in the search results
for index, result_dic in enumerate(search_results.get('organic_results', [])):
if 'link' in result_dic:
# print(f"Checking {result_dic['link']}")
result_url = url_phase.check_redirection(result_dic['link'])
result_domain = urlparse(result_url).netloc.lower()
# If either the original or redirected domain matches
if domain in result_domain or redirected_domain in result_domain:
return index + 1 # Return 1-based index (ranking)
return None # URL not found in the top results
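
# Note: only two parts of the SerpAPI payload are relied on above, the "organic_results" list
# and each entry's "link" field, i.e. a shape like
#   {"organic_results": [{"link": "https://www.example.org/", ...}, ...]}.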
def get_wikipedia_external_links(url_wiki):
    try:
        response = requests.get(url_wiki, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        response.raise_for_status()
    except RequestException:
        print("Failed to fetch Wikipedia page")
        return {}
soup = BeautifulSoup(response.text, 'html.parser')
table_sections = []
    infobox = soup.find("table", class_="infobox")
    if infobox:
        for tr in infobox.find_all("tr"):
            for th in tr.find_all("th", class_="infobox-label"):
                # Infobox labels are usually written "Website"; match case-insensitively
                if "website" in th.text.lower().replace(" ", ""):
                    link = tr.find("a", href=True)
                    if link:
                        table_sections.append(url_phase.check_redirection(link["href"]))
    # Extract external links tagged with the "official-website" template
    ext_links = []
    external_heading = soup.find(id="External_links")
    if external_heading:
        for span in external_heading.find_all_next("span", class_="official-website"):
            for a in span.find_all("a", href=True):
                ext_links.append(url_phase.check_redirection(a.get("href")))
    return {"table_sections": table_sections, "external_links": ext_links}
def calculate_weighted_score(external_links, table_sections, url):
"""
Calculate a weighted score for the URL based on its presence in specific sections.
Args:
        external_links (list): Official-website links collected from the article's External links section.
        table_sections (list): Website links collected from the article's infobox.
        url (str): The URL to check for (lower-cased by the caller).
    Returns:
        float: A weighted score based on the occurrence count and section importance.
    """
    score = 0
    # Infobox matches are the strongest signal, so weight them higher
    table_freq = table_sections.count(url.lower())
    score += table_freq * 3.0
    # External-links matches add one point each
    frequency = external_links.count(url.lower())
    score += frequency * 1.0
return score
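
# Worked example: a URL appearing once in the infobox (weight 3.0) and twice among the
# official-website external links (weight 1.0 each) scores 1 * 3.0 + 2 * 1.0 = 5.0.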
def wikipedia_link_score(tour_name, url):
data = None
wiki_url = get_wikipedia_url(tour_name)
if wiki_url:
data = get_wikipedia_external_links(wiki_url)
if not data:
return 0.0
external_links = data.get("external_links", [])
table_sections = data.get("table_sections", [])
return calculate_weighted_score(external_links, table_sections, url.lower())
def extract_official_website(wiki_url: str) -> str | None:
"""Extract the official website from the Wikipedia page."""
    try:
        response = requests.get(wiki_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
soup = BeautifulSoup(response.text, "html.parser")
# Search the infobox first for the official website
infobox = soup.find("table", class_="infobox")
if infobox:
for a in infobox.find_all("a", href=True):
if a["href"].startswith("http") and "official website" in a.get_text().lower():
return url_phase.check_redirection(a["href"])
# If not found in the infobox, search the whole page for links with "official website"
for a in soup.find_all("a", href=True):
if "official website" in a.get_text().lower() and a["href"].startswith("http"):
return url_phase.check_redirection(a["href"])
# Fallback: Check for possible external links section
    external_links_section = soup.find("span", {"id": "External_links"})
    if external_links_section:
        parent = external_links_section.find_parent("h2")
        links_list = parent.find_next("ul") if parent else None
        if links_list:
            for link in links_list.find_all("a", href=True):
                if "official website" in link.get_text().lower() and link["href"].startswith("http"):
                    return url_phase.check_redirection(link["href"])
return None
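
# Assumed markup (hypothetical snippet): Wikipedia articles typically expose the official site
# as <span class="official-website"><a href="https://www.example.org/">Official website</a></span>
# under "External links", and as a <th class="infobox-label">Website</th> row in the infobox;
# extract_official_website() and get_wikipedia_external_links() both rely on that structure.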
def get_wikipedia_url(query):
"""Search Google for the Wikipedia page of name."""
query = f"{query} site:wikipedia.org"
ddgs = DDGS()
for result in ddgs.text(query, max_results=5):
        href = result.get('href') or ""
        if "wikipedia.org" in href:
            return href
return None
def get_official_website(tour_name: str) -> str:
"""
Get the official website of a cycling tour.
Args:
tour_name: The name of the cycling tour.
    Returns:
        URL of the official website, or "Not Found" if it cannot be determined.
    """
    wiki_url = get_wikipedia_url(tour_name)
    if wiki_url:
        return extract_official_website(wiki_url) or "Not Found"
    return "Not Found"
# Example use
if __name__ == "__main__":
print(google_search_ranking("Facebook", "https://www.facebook.com/"))
# ver_dict = verify_event_website("Benelux Tour", "https://renewitour.com/nl/")
# for k, v in ver_dict.items():
# print(f"{k}: {v}")
# score = wikipedia_link_score("Tour de France", "https://www.letour.fr/en/")
# print(score)