from typing import List, Dict, Any, Optional, Tuple

from colorama import Fore, Style

from ..scraper import Scraper
from ..config.config import Config
from ..utils.logger import get_formatted_logger

logger = get_formatted_logger()

# Browser-like User-Agent sent when the caller does not supply a config.
_DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
)


def scrape_urls(
    urls, cfg: Optional[Config] = None
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Scrape the given URLs and collect any image entries found.

    Args:
        urls: List of urls handed to ``Scraper``.
        cfg: Config (optional). When provided, its ``user_agent`` and
            ``scraper`` attributes are forwarded to ``Scraper``; when omitted,
            a built-in browser User-Agent is used.

    Returns:
        Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: The list returned
        by ``Scraper.run()`` and the flattened contents of every item's
        ``'image_urls'`` entry. If scraping raises, the error is printed and
        whatever had been collected up to that point is returned (both lists
        are empty if the failure happened before ``run()`` finished).
    """
    scraped_data = []
    images = []
    user_agent = cfg.user_agent if cfg else _DEFAULT_USER_AGENT

    try:
        # Previously ``cfg.scraper`` was read unconditionally, so calling this
        # function without a config always failed with AttributeError (which
        # was then swallowed below) despite ``cfg`` being optional.
        # NOTE(review): whether Scraper accepts ``None`` as its third argument
        # is not visible from this file -- confirm, or substitute the
        # project's default backend here.
        scraper_choice = cfg.scraper if cfg else None

        scraper = Scraper(urls, user_agent, scraper_choice)
        scraped_data = scraper.run()

        for entry in scraped_data:
            if 'image_urls' in entry:
                images.extend(entry['image_urls'])
    except Exception as e:
        # Best-effort boundary: report the failure and return what we have.
        print(f"{Fore.RED}Error in scrape_urls: {e}{Style.RESET_ALL}")

    return scraped_data, images


async def filter_urls(urls: List[str], config: Config) -> List[str]:
    """
    Filter URLs based on configuration settings.

    A URL is dropped when any entry of ``config.excluded_domains`` occurs
    anywhere inside it (plain substring match, not a parsed-host comparison).

    Args:
        urls (List[str]): List of URLs to filter.
        config (Config): Configuration object providing ``excluded_domains``.

    Returns:
        List[str]: The URLs that matched no excluded entry, in input order.
    """
    excluded_domains = config.excluded_domains
    return [
        url
        for url in urls
        if not any(excluded in url for excluded in excluded_domains)
    ]


async def extract_main_content(html_content: str) -> str:
    """
    Extract the main content from HTML.

    Currently a placeholder: no extraction is performed and the input is
    returned unchanged.

    Args:
        html_content (str): Raw HTML content.

    Returns:
        str: Extracted main content (at present, the raw HTML itself).
    """
    # TODO: implement real content extraction (e.g. with an HTML parser).
    return html_content


async def process_scraped_data(
    scraped_data: List[Dict[str, Any]], config: Config
) -> List[Dict[str, Any]]:
    """
    Process the scraped data to extract and clean the main content.

    Items whose ``'status'`` is ``'success'`` are rebuilt as a dict holding
    only ``'url'``, the extracted ``'content'`` and ``'status'``; every other
    item is passed through untouched.

    Args:
        scraped_data (List[Dict[str, Any]]): List of dictionaries containing
            scraped data. Each item must have a ``'status'`` key; successful
            items must also have ``'url'`` and ``'content'``.
        config (Config): Configuration object (not used by the current
            implementation).

    Returns:
        List[Dict[str, Any]]: Processed scraped data, in input order.
    """
    processed_data = []
    for item in scraped_data:
        if item['status'] != 'success':
            # Failed or otherwise non-successful entries are kept as-is.
            processed_data.append(item)
            continue

        main_content = await extract_main_content(item['content'])
        processed_data.append(
            {
                'url': item['url'],
                'content': main_content,
                'status': 'success',
            }
        )
    return processed_data