import asyncio
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Union
from urllib.parse import urlparse, urlunparse

import aiohttp
import feedparser

from config import load_feeds, load_api_keys, SETTINGS

logger = logging.getLogger(__name__)

class NewsFetcher:

    def __init__(self):
        self.feeds = load_feeds()
        self.api_keys = load_api_keys()
        self.session = None

    async def __aenter__(self):
        self.session = aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=SETTINGS["request_timeout"]),
            headers={"User-Agent": "NewsBot/1.0"}
        )
        return self

    async def __aexit__(self, *exc):
        await self.session.close()
        self.session = None

    async def fetch(self, url: str, source_type: str = "rss") -> Union[str, Dict, None]:
        retries = 0
        while retries < SETTINGS["max_retries"]:
            try:
                async with self.session.get(url) as response:
                    if response.status == 200:
                        return await response.json() if source_type == "newsapi" else await response.text()
                    logger.warning(f"Error fetching {url}: status {response.status}")
            except Exception as e:
                logger.error(f"Connection error: {str(e)}")
            # Count the attempt for both non-200 responses and exceptions,
            # then back off briefly before retrying.
            retries += 1
            await asyncio.sleep(5)
        return None

    async def process_rss(self, feed_config: Dict, processed_links: set) -> List[Dict]:
        content = await self.fetch(feed_config["url"])
        if not content:
            return []

        feed = feedparser.parse(content)
        articles = []
        cutoff_time = datetime.now() - timedelta(hours=24)

        for entry in feed.entries[:SETTINGS["max_articles"]]:
            try:
                # feedparser may omit published_parsed or set it to None.
                if getattr(entry, "published_parsed", None):
                    pub_date = datetime(*entry.published_parsed[:6])
                else:
                    pub_date = datetime.now()
                if pub_date < cutoff_time:
                    continue

                article = {
                    "title": entry.title,
                    "link": self.normalize_url(entry.link),
                    "source": feed_config["name"],
                    "description": entry.get("summary", "")[:500],
                    "published": pub_date,
                    "category": feed_config.get("category", "general")
                }

                if article["link"] not in processed_links:
                    articles.append(article)
                    processed_links.add(article["link"])
            except Exception as e:
                logger.error(f"Error processing RSS entry: {str(e)}")
        return articles

    async def get_news_api(self, feed_config: Dict, processed_links: set) -> List[Dict]:
        api_key = self.api_keys.get("newsapi")
        if not api_key:
            logger.error("NewsAPI key is missing!")
            return []

        url = (
            f"https://newsapi.org/v2/everything?q={feed_config['query']}"
            f"&pageSize={SETTINGS['newsapi_page_size']}&apiKey={api_key}"
        )
        data = await self.fetch(url, "newsapi")
        if not data:
            return []

        articles = []
        cutoff_time = datetime.now() - timedelta(hours=24)

        for article in data.get("articles", []):
            try:
                # NewsAPI timestamps look like "2024-01-01T12:00:00Z"; strip the
                # trailing "Z" so datetime.fromisoformat() accepts them.
                pub_date = datetime.fromisoformat(article["publishedAt"].rstrip("Z"))
                if pub_date < cutoff_time:
                    continue

                entry = {
                    "title": article["title"],
                    "link": self.normalize_url(article["url"]),
                    "source": feed_config["name"],
                    "description": (article.get("description") or "")[:500],
                    "published": pub_date,
                    "category": feed_config.get("category", "general")
                }

                if entry["link"] not in processed_links:
                    articles.append(entry)
                    processed_links.add(entry["link"])
            except Exception as e:
                logger.error(f"Error processing NewsAPI article: {str(e)}")
        return articles

    def normalize_url(self, url: str) -> str:
        # Drop query string and fragment so the same article URL is deduplicated
        # regardless of tracking parameters.
        parsed = urlparse(url)
        return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
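

# Usage sketch (an assumption, not part of the original module): it illustrates
# the intended flow with the async context manager above. The feed dicts are
# assumed to carry the keys the methods access ("url" or "query", "name",
# "category"); the "type" key used to choose the fetch path is a hypothetical
# convention, not confirmed by config.py.
async def main():
    processed_links: set = set()
    async with NewsFetcher() as fetcher:
        for feed in fetcher.feeds:
            if feed.get("type") == "newsapi":
                articles = await fetcher.get_news_api(feed, processed_links)
            else:
                articles = await fetcher.process_rss(feed, processed_links)
            for article in articles:
                logger.info(f"{article['source']}: {article['title']}")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    asyncio.run(main())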