from typing import Any, Dict, List, Optional, Tuple

from colorama import Fore, Style

from ..scraper import Scraper
from ..config.config import Config
from ..utils.logger import get_formatted_logger

logger = get_formatted_logger()

def scrape_urls(urls: List[str], cfg: Optional[Config] = None) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Scrape the given URLs and collect any image URLs they reference.

    Args:
        urls (List[str]): List of URLs to scrape.
        cfg (Optional[Config]): Configuration object. When omitted, a default
            user agent is used, but no scraper backend can be resolved and the
            error path below is taken.

    Returns:
        Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: Tuple containing the
        scraped content and the collected image URLs.
    """
    scraped_data = []
    images = []
    user_agent = (
        cfg.user_agent
        if cfg
        else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
    )
    try:
        # cfg.scraper selects the scraper backend for the Scraper instance.
        scraper = Scraper(urls, user_agent, cfg.scraper)
        scraped_data = scraper.run()
        # Collect any image URLs reported for each scraped page.
        for item in scraped_data:
            if 'image_urls' in item:
                images.extend(item['image_urls'])
    except Exception as e:
        print(f"{Fore.RED}Error in scrape_urls: {e}{Style.RESET_ALL}")
    return scraped_data, images
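
# Example usage (a minimal sketch; assumes a Config instance that exposes the
# `user_agent` and `scraper` attributes used above):
#
#     data, images = scrape_urls(["https://example.com"], cfg=Config())
#     logger.info(f"Scraped {len(data)} pages and found {len(images)} images")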


async def filter_urls(urls: List[str], config: Config) -> List[str]:
    """
    Filter URLs based on configuration settings.

    Args:
        urls (List[str]): List of URLs to filter.
        config (Config): Configuration object.

    Returns:
        List[str]: Filtered list of URLs.
    """
    filtered_urls = []
    for url in urls:
        # Add your filtering logic here.
        # For example, you might want to exclude certain domains or URL patterns.
        if not any(excluded in url for excluded in config.excluded_domains):
            filtered_urls.append(url)
    return filtered_urls
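
# Example (a sketch; assumes Config exposes an `excluded_domains` list such as
# ["pinterest.com", "facebook.com"], and that the caller runs in an async context):
#
#     allowed = await filter_urls(candidate_urls, config)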


async def extract_main_content(html_content: str) -> str:
    """
    Extract the main content from HTML.

    Args:
        html_content (str): Raw HTML content.

    Returns:
        str: Extracted main content.
    """
    # Implement content extraction logic here.
    # This could involve using libraries like BeautifulSoup or custom parsing logic.
    # For now, we'll just return the raw HTML as a placeholder.
    return html_content
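
# One possible implementation, sketched with BeautifulSoup (assumes the
# beautifulsoup4 package is available; not the finalized extraction logic):
#
#     from bs4 import BeautifulSoup
#
#     async def extract_main_content(html_content: str) -> str:
#         soup = BeautifulSoup(html_content, "html.parser")
#         # Drop non-content elements before flattening the page to text.
#         for tag in soup(["script", "style", "nav", "header", "footer"]):
#             tag.decompose()
#         return soup.get_text(separator="\n", strip=True)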


async def process_scraped_data(scraped_data: List[Dict[str, Any]], config: Config) -> List[Dict[str, Any]]:
    """
    Process the scraped data to extract and clean the main content.

    Args:
        scraped_data (List[Dict[str, Any]]): List of dictionaries containing scraped data.
        config (Config): Configuration object.

    Returns:
        List[Dict[str, Any]]: Processed scraped data.
    """
    processed_data = []
    for item in scraped_data:
        if item['status'] == 'success':
            main_content = await extract_main_content(item['content'])
            processed_data.append({
                'url': item['url'],
                'content': main_content,
                'status': 'success'
            })
        else:
            processed_data.append(item)
    return processed_data
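
# End-to-end sketch of how these helpers could fit together (assumes an async
# caller and a Config instance; `candidate_urls` is a placeholder name):
#
#     config = Config()
#     urls = await filter_urls(candidate_urls, config)
#     raw, images = scrape_urls(urls, cfg=config)
#     processed = await process_scraped_data(raw, config)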