# OfficialURLFinder/src/url_function.py
import json
import os
import socket
import ssl
from urllib.parse import urlparse
import requests
import tldextract
import whois
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from duckduckgo_search import DDGS
from requests.exceptions import HTTPError, Timeout, RequestException
from serpapi import GoogleSearch
load_dotenv()  # Load .env before importing project modules that may read env vars at import time
from src import my_tools
from src import url_phase
from src import backlink_check
def verify_event_website(event_name, url):
score = 0
details = {}
# Normalize inputs
event_name_lower = event_name.lower()
headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except HTTPError as http_err:
        status = getattr(http_err.response, "status_code", None)
        if status == 403:
            details["error"] = "Access forbidden: 403 Forbidden"
        elif status == 404:
            details["error"] = "Page not found: 404 Not Found"
        elif status == 500:
            details["error"] = "Server error: 500 Internal Server Error"
        else:
            details["error"] = f"HTTP error occurred: {http_err}"
        score -= 10  # Penalize unreachable/error pages so the penalty reaches the final score
    except Timeout as timeout_err:
        details["error"] = f"Request timed out: {timeout_err}"
        score -= 10
    except RequestException as req_err:
        details["error"] = f"Request error occurred: {req_err}"
        score -= 10
    except Exception as e:
        details["error"] = f"An error occurred: {e}"
        score -= 10
    domain_parts = tldextract.extract(url)
    domain = domain_parts.domain + '.' + domain_parts.suffix
    # Compare the significant words of the event name (dropping the stopword "de") with the domain
    name_tokens = [part for part in event_name_lower.split() if part != "de"]
    if any(part in domain.lower() for part in name_tokens):
        score += 1
    details["domain"] = domain
# WHOIS
try:
w = whois.whois(domain)
if w and any(
event_name_lower in str(v).lower() for v in [w.get('org'), w.get('name'), w.get('registrant_name')]
):
score += 2
details["whois_org"] = w.get('org')
    except Exception:
        details["whois_org"] = "N/A"
# SSL Cert Org
ssl_org = get_ssl_organization(domain)
if ssl_org and event_name_lower in ssl_org.lower():
score += 2
details["ssl_org"] = ssl_org
ranking = google_search_ranking(event_name, url)
if ranking and ranking <= 10: # If the URL appears in the top 10 results
score += 3 # High rank boosts the score
details["google_search_rank"] = ranking if ranking else "Not found in top 10"
    # Wikipedia backlink check: boost the score if the URL appears on the event's Wikipedia page
    score += wikipedia_link_score(event_name, url)
# Backlink Check
backlinks = backlink_check.verify_url(url)
if backlinks:
score += 1
details["backlinks"] = backlinks
# Final trust score
details["event_name"] = event_name
details["url"] = url
details["score"] = score
return details
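
# Illustrative only (not output from any specific run): the details dict returned by
# verify_event_website() is expected to look roughly like
#   {"domain": "example.org", "whois_org": "Example Org", "ssl_org": "Example Org",
#    "google_search_rank": 1, "backlinks": [...], "event_name": "Example Tour",
#    "url": "https://www.example.org/", "score": 7}
# with individual keys absent whenever the corresponding check fails or is skipped.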
def get_ssl_organization(domain):
    """Return the organizationName from the site's SSL certificate subject, or None."""
    try:
        ctx = ssl.create_default_context()
        with ctx.wrap_socket(socket.socket(), server_hostname=domain) as s:
            s.settimeout(5.0)
            s.connect((domain, 443))
            cert = s.getpeercert()
            # The subject comes back as nested RDN tuples; flatten into a dict
            # keyed by field name before looking up the organization.
            subject = dict(item for rdn in cert.get('subject', ()) for item in rdn)
            return subject.get('organizationName')
    except Exception:
        return None
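
# For reference, ssl.SSLSocket.getpeercert() reports the certificate subject as nested RDN
# tuples, e.g. ((('countryName', 'XX'),), (('organizationName', 'Example Org'),)), which is
# why get_ssl_organization() flattens them into a dict before reading 'organizationName'.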
def get_structured_data(soup):
    """Collect JSON-LD blocks whose @type mentions SportsEvent."""
    json_ld = soup.find_all("script", type="application/ld+json")
    structured = []
    for tag in json_ld:
        try:
            data = json.loads(tag.string or "")
            if isinstance(data, dict) and "SportsEvent" in str(data.get("@type", "")):
                structured.append(data)
        except (json.JSONDecodeError, TypeError):
            continue
    return structured
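
# A minimal, made-up JSON-LD block that get_structured_data() would collect:
# <script type="application/ld+json">
#   {"@context": "https://schema.org", "@type": "SportsEvent",
#    "name": "Example Cycling Tour", "url": "https://www.example.org/"}
# </script>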
def google_search_ranking(event_name, url):
"""
This function checks the Google search ranking of a specific URL when searching for the event_name.
Args:
event_name (str): The name of the event (e.g., "Tour de France").
url (str): The official event URL (e.g., "https://www.letour.fr/en/").
    Returns:
        int | None: The 1-based rank of the URL in the organic results, or None if it is not found.
    """
search_params = {
"q": event_name, # Search for the event name
"api_key": os.getenv("SERPAPI_API_KEY") or os.getenv("SERPER_API_KEY"),
}
# Perform the search using SerpAPI
search = GoogleSearch(search_params)
search_results = search.get_dict()
# Extract the domain from the provided URL
parsed_url = urlparse(url)
domain = parsed_url.netloc.lower()
redirected_url = url_phase.check_redirection(url)
redirected_domain = urlparse(redirected_url).netloc.lower()
# Check if either the original or redirected domain appears in the search results
for index, result_dic in enumerate(search_results.get('organic_results', [])):
if 'link' in result_dic:
# print(f"Checking {result_dic['link']}")
result_url = url_phase.check_redirection(result_dic['link'])
result_domain = urlparse(result_url).netloc.lower()
# If either the original or redirected domain matches
if domain in result_domain or redirected_domain in result_domain:
return index + 1 # Return 1-based index (ranking)
return None # URL not found in the top results
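
# Note: only two parts of the SerpAPI payload are relied on above, the "organic_results" list
# and each entry's "link" field, i.e. a shape like
#   {"organic_results": [{"link": "https://www.example.org/", ...}, ...]}.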
def get_wikipedia_external_links(url_wiki):
    try:
        response = requests.get(url_wiki, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        response.raise_for_status()
    except RequestException:
        print("Failed to fetch Wikipedia page")
        return {}
soup = BeautifulSoup(response.text, 'html.parser')
table_sections = []
    infobox = soup.find("table", class_="infobox")
    if infobox:
        for tr in infobox.find_all("tr"):
            for th in tr.find_all("th", class_="infobox-label"):
                # Infobox labels are usually written "Website"; match case-insensitively
                if "website" in th.text.lower().replace(" ", ""):
                    link = tr.find("a", href=True)
                    if link:
                        table_sections.append(url_phase.check_redirection(link["href"]))
    # Extract external links tagged with the "official-website" template
    ext_links = []
    external_heading = soup.find(id="External_links")
    if external_heading:
        for span in external_heading.find_all_next("span", class_="official-website"):
            for a in span.find_all("a", href=True):
                ext_links.append(url_phase.check_redirection(a.get("href")))
    return {"table_sections": table_sections, "external_links": ext_links}
def calculate_weighted_score(external_links, table_sections, url):
"""
Calculate a weighted score for the URL based on its presence in specific sections.
Args:
        external_links (list): Official-website links collected from the article's External links section.
        table_sections (list): Website links collected from the article's infobox.
        url (str): The URL to check for (lower-cased by the caller).
    Returns:
        float: A weighted score based on the occurrence count and section importance.
    """
    score = 0
    # Infobox matches are the strongest signal, so weight them higher
    table_freq = table_sections.count(url.lower())
    score += table_freq * 3.0
    # External-links matches add one point each
    frequency = external_links.count(url.lower())
    score += frequency * 1.0
return score
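
# Worked example: a URL appearing once in the infobox (weight 3.0) and twice among the
# official-website external links (weight 1.0 each) scores 1 * 3.0 + 2 * 1.0 = 5.0.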
def wikipedia_link_score(tour_name, url):
data = None
wiki_url = get_wikipedia_url(tour_name)
if wiki_url:
data = get_wikipedia_external_links(wiki_url)
if not data:
return 0.0
external_links = data.get("external_links", [])
table_sections = data.get("table_sections", [])
return calculate_weighted_score(external_links, table_sections, url.lower())
def extract_official_website(wiki_url: str) -> str | None:
"""Extract the official website from the Wikipedia page."""
    try:
        response = requests.get(wiki_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
soup = BeautifulSoup(response.text, "html.parser")
# Search the infobox first for the official website
infobox = soup.find("table", class_="infobox")
if infobox:
for a in infobox.find_all("a", href=True):
if a["href"].startswith("http") and "official website" in a.get_text().lower():
return url_phase.check_redirection(a["href"])
# If not found in the infobox, search the whole page for links with "official website"
for a in soup.find_all("a", href=True):
if "official website" in a.get_text().lower() and a["href"].startswith("http"):
return url_phase.check_redirection(a["href"])
# Fallback: Check for possible external links section
    external_links_section = soup.find("span", {"id": "External_links"})
    if external_links_section:
        parent = external_links_section.find_parent("h2")
        links_list = parent.find_next("ul") if parent else None
        if links_list:
            for link in links_list.find_all("a", href=True):
                if "official website" in link.get_text().lower() and link["href"].startswith("http"):
                    return url_phase.check_redirection(link["href"])
return None
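
# Assumed markup (hypothetical snippet): Wikipedia articles typically expose the official site
# as <span class="official-website"><a href="https://www.example.org/">Official website</a></span>
# under "External links", and as a <th class="infobox-label">Website</th> row in the infobox;
# extract_official_website() and get_wikipedia_external_links() both rely on that structure.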
def get_wikipedia_url(query):
"""Search Google for the Wikipedia page of name."""
query = f"{query} site:wikipedia.org"
ddgs = DDGS()
for result in ddgs.text(query, max_results=5):
        href = result.get('href') or ""
        if "wikipedia.org" in href:
            return href
return None
def get_official_website(tour_name: str) -> str:
"""
Get the official website of a cycling tour.
Args:
tour_name: The name of the cycling tour.
    Returns:
        URL of the official website, or "Not Found" if it cannot be determined.
    """
    wiki_url = get_wikipedia_url(tour_name)
    if wiki_url:
        return extract_official_website(wiki_url) or "Not Found"
    return "Not Found"
# Example use
if __name__ == "__main__":
print(google_search_ranking("Facebook", "https://www.facebook.com/"))
# ver_dict = verify_event_website("Benelux Tour", "https://renewitour.com/nl/")
# for k, v in ver_dict.items():
# print(f"{k}: {v}")
# score = wikipedia_link_score("Tour de France", "https://www.letour.fr/en/")
# print(score)