from typing import Any, Dict, List, Optional, Tuple
from colorama import Fore, Style
from ..scraper import Scraper
from ..config.config import Config
from ..utils.logger import get_formatted_logger
logger = get_formatted_logger()


def scrape_urls(
    urls: List[str], cfg: Optional[Config] = None
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Scrape the given URLs and collect any image URLs found on each page.

    Args:
        urls (List[str]): List of URLs to scrape.
        cfg (Config, optional): Configuration object; a default user agent
            and scraper are used when omitted.

    Returns:
        Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: Scraped content
            and the collected image URLs.
    """
scraped_data = []
images = []
user_agent = (
cfg.user_agent
if cfg
else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
)
    try:
        # Guard the cfg.scraper access so it is never read when cfg is
        # None; "bs" is assumed here as the fallback scraper key.
        scraper = Scraper(urls, user_agent, cfg.scraper if cfg else "bs")
        scraped_data = scraper.run()
        # Collect any image URLs found on each scraped page
        for item in scraped_data:
            if 'image_urls' in item:
                images.extend(item['image_urls'])
except Exception as e:
print(f"{Fore.RED}Error in scrape_urls: {e}{Style.RESET_ALL}")
return scraped_data, images


async def filter_urls(urls: List[str], config: Config) -> List[str]:
    """
    Filter URLs based on configuration settings.

    Args:
        urls (List[str]): List of URLs to filter.
        config (Config): Configuration object.

    Returns:
        List[str]: Filtered list of URLs.
    """
    filtered_urls = []
    for url in urls:
        # Keep the URL unless it mentions an excluded domain. Plain
        # substring matching is deliberately simple; a stricter,
        # host-based check is sketched below.
        if not any(excluded in url for excluded in config.excluded_domains):
            filtered_urls.append(url)
    return filtered_urls
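

# A stricter alternative to the substring check above (a minimal sketch;
# this helper is hypothetical and not wired into filter_urls). It assumes
# excluded entries are bare domains such as "example.com".
def _is_excluded_domain(url: str, excluded_domains: List[str]) -> bool:
    """Return True if the URL's host equals, or is a subdomain of, any
    excluded domain."""
    from urllib.parse import urlparse

    host = urlparse(url).netloc.lower().split(':')[0]  # drop any port
    return any(
        host == domain or host.endswith('.' + domain)
        for domain in excluded_domains
    )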


async def extract_main_content(html_content: str) -> str:
    """
    Extract the main content from HTML.

    Args:
        html_content (str): Raw HTML content.

    Returns:
        str: Extracted main content.
    """
    # Placeholder: extraction is not implemented yet, so the raw HTML is
    # returned unchanged. One option is a parser such as BeautifulSoup;
    # see the sketch below.
    return html_content
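

# A minimal sketch of what the extraction step could look like with
# BeautifulSoup (hypothetical helper; assumes the beautifulsoup4 package
# is installed). It drops non-content tags and returns the visible text.
def _extract_text_bs4(html_content: str) -> str:
    """Illustrative only: strip script/style tags, return the page text."""
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_content, 'html.parser')
    for tag in soup(['script', 'style', 'noscript']):
        tag.decompose()  # remove the tag and its contents from the tree
    return soup.get_text(separator='\n', strip=True)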


async def process_scraped_data(
    scraped_data: List[Dict[str, Any]], config: Config
) -> List[Dict[str, Any]]:
    """
    Process the scraped data to extract and clean the main content.

    Args:
        scraped_data (List[Dict[str, Any]]): List of dictionaries containing
            scraped data.
        config (Config): Configuration object.

    Returns:
        List[Dict[str, Any]]: Processed scraped data.
    """
    processed_data = []
    for item in scraped_data:
        # .get() passes through entries that lack a status key instead of
        # raising KeyError; failed items are forwarded unchanged.
        if item.get('status') == 'success':
            main_content = await extract_main_content(item['content'])
            processed_data.append({
                'url': item['url'],
                'content': main_content,
                'status': 'success'
            })
        else:
            processed_data.append(item)
    return processed_data
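

# How the helpers above compose into a pipeline (illustrative only; assumes
# a Config exposing `scraper`, `user_agent`, and `excluded_domains`, as the
# functions above require):
#
#     import asyncio
#
#     async def main(cfg: Config) -> None:
#         urls = await filter_urls(['https://example.com/article'], cfg)
#         scraped, images = scrape_urls(urls, cfg)
#         processed = await process_scraped_data(scraped, cfg)
#
#     asyncio.run(main(Config()))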