from concurrent.futures.thread import ThreadPoolExecutor
from functools import partial
# NOTE: importlib.util must be imported explicitly; `import importlib` alone
# does not guarantee the `util` submodule is bound (AttributeError at runtime).
import importlib.util
import subprocess
import sys

import requests
from colorama import Fore, init

from . import (
    ArxivScraper,
    BeautifulSoupScraper,
    PyMuPDFScraper,
    WebBaseLoaderScraper,
    BrowserScraper,
    TavilyExtract,
)


class Scraper:
    """
    Scraper class to extract the content from the links.

    Fans scraping out over a thread pool and dispatches each URL to an
    appropriate scraper backend (PDF, arXiv, or the configured default).
    """

    def __init__(self, urls, user_agent, scraper):
        """
        Initialize the Scraper class.

        Args:
            urls: Iterable of URL strings to scrape.
            user_agent: User-Agent header value sent with every request.
            scraper: Key of the default scraper backend (e.g. "bs",
                "browser", "tavily_extract") used for non-PDF, non-arXiv links.
        """
        self.urls = urls
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        self.scraper = scraper
        # tavily is an optional dependency beyond requirements.txt;
        # make sure it is importable (installing it on demand if needed).
        if self.scraper == "tavily_extract":
            self._check_pkg(self.scraper)

    def run(self):
        """
        Extract the content from all links concurrently.

        Returns:
            list[dict]: One result dict per successfully scraped URL, with
            keys "url", "raw_content", "image_urls", "title". URLs whose
            scrape failed or yielded too little content are dropped.
        """
        partial_extract = partial(self.extract_data_from_url, session=self.session)
        with ThreadPoolExecutor(max_workers=20) as executor:
            contents = executor.map(partial_extract, self.urls)
            # Consume the lazy map iterator while the pool is still alive.
            res = [content for content in contents if content["raw_content"] is not None]
        return res

    def _check_pkg(self, scrapper_name: str) -> None:
        """
        Checks and ensures required Python packages are available for scrapers
        that need dependencies beyond requirements.txt. When adding a new
        scraper to the repo, update `pkg_map` with its required information
        and call _check_pkg() during initialization.

        Raises:
            ImportError: If the package is missing and pip installation fails.
        """
        pkg_map = {
            "tavily_extract": {
                "package_installation_name": "tavily-python",
                "import_name": "tavily",
            },
        }
        pkg = pkg_map[scrapper_name]
        if not importlib.util.find_spec(pkg["import_name"]):
            pkg_inst_name = pkg["package_installation_name"]
            init(autoreset=True)
            print(Fore.YELLOW + f"{pkg_inst_name} not found. Attempting to install...")
            try:
                subprocess.check_call(
                    [sys.executable, "-m", "pip", "install", pkg_inst_name]
                )
                print(Fore.GREEN + f"{pkg_inst_name} installed successfully.")
            except subprocess.CalledProcessError:
                raise ImportError(
                    Fore.RED + f"Unable to install {pkg_inst_name}. "
                    f"Please install manually with "
                    f"`pip install -U {pkg_inst_name}`"
                )

    def extract_data_from_url(self, link, session):
        """
        Extracts the data from a single link.

        Args:
            link: The URL to scrape.
            session: The shared requests.Session to use for HTTP access.

        Returns:
            dict: {"url", "raw_content", "image_urls", "title"}. On failure
            or when the scraped content is shorter than 100 characters,
            "raw_content" is None so run() can filter the result out.
        """
        try:
            # Renamed from `Scraper` to avoid shadowing the enclosing class.
            scraper_cls = self.get_scraper(link)
            scraper = scraper_cls(link, session)
            content, image_urls, title = scraper.scrape()
            # Very short content is treated as a failed scrape (e.g. a
            # paywall stub or an error page).
            if len(content) < 100:
                return {"url": link, "raw_content": None, "image_urls": [], "title": ""}
            return {
                "url": link,
                "raw_content": content,
                "image_urls": image_urls,
                "title": title,
            }
        except Exception:
            # Best-effort by design: one bad URL must not abort the batch.
            return {"url": link, "raw_content": None, "image_urls": [], "title": ""}

    def get_scraper(self, link):
        """
        Determine the appropriate scraper class based on the provided link,
        or the configured default scraper if no URL pattern matches.

        Args:
            link: A URL pointing at a webpage or a PDF file.

        Returns:
            type: The scraper class to instantiate — `PyMuPDFScraper` for
            links ending in ".pdf", `ArxivScraper` for arxiv.org links,
            otherwise the class mapped from `self.scraper`.

        Raises:
            Exception: If the resolved key has no registered scraper class.
        """
        SCRAPER_CLASSES = {
            "pdf": PyMuPDFScraper,
            "arxiv": ArxivScraper,
            "bs": BeautifulSoupScraper,
            "web_base_loader": WebBaseLoaderScraper,
            "browser": BrowserScraper,
            "tavily_extract": TavilyExtract,
        }

        if link.endswith(".pdf"):
            scraper_key = "pdf"
        elif "arxiv.org" in link:
            scraper_key = "arxiv"
        else:
            scraper_key = self.scraper

        scraper_class = SCRAPER_CLASSES.get(scraper_key)
        if scraper_class is None:
            raise Exception("Scraper not found.")

        return scraper_class