# fetch_news.py
import asyncio
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Union
from urllib.parse import urlparse, urlunparse

import aiohttp
import feedparser

from config import load_feeds, load_api_keys, SETTINGS

logger = logging.getLogger(__name__)

class NewsFetcher:
    def __init__(self):
        self.feeds = load_feeds()
        self.api_keys = load_api_keys()
        self.session = None

    async def __aenter__(self):
        self.session = aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=SETTINGS["request_timeout"]),
            headers={"User-Agent": "NewsBot/1.0"},
        )
        return self

    async def __aexit__(self, *exc):
        await self.session.close()
        self.session = None

    async def fetch(self, url: str, source_type: str = "rss") -> Union[str, Dict, None]:
        """Fetch a URL with retries; returns parsed JSON for NewsAPI, raw text for RSS."""
        retries = 0
        while retries < SETTINGS["max_retries"]:
            try:
                async with self.session.get(url) as response:
                    if response.status == 200:
                        return await response.json() if source_type == "newsapi" else await response.text()
                    logger.warning(f"Failed to fetch {url}: status {response.status}")
            except Exception as e:
                logger.error(f"Connection error: {e}")
            retries += 1
            await asyncio.sleep(5)
        return None

    async def process_rss(self, feed_config: Dict, processed_links: set) -> List[Dict]:
        """Parse an RSS feed and return articles published within the last 24 hours."""
        content = await self.fetch(feed_config["url"])
        if not content:
            return []
        feed = feedparser.parse(content)
        articles = []
        cutoff_time = datetime.now() - timedelta(hours=24)
        for entry in feed.entries[:SETTINGS["max_articles"]]:
            try:
                # published_parsed may be missing or None; fall back to "now".
                pub_date = (
                    datetime(*entry.published_parsed[:6])
                    if getattr(entry, "published_parsed", None)
                    else datetime.now()
                )
                if pub_date < cutoff_time:
                    continue
                article = {
                    "title": entry.title,
                    "link": self.normalize_url(entry.link),
                    "source": feed_config["name"],
                    "description": entry.get("summary", "")[:500],
                    "published": pub_date,
                    "category": feed_config.get("category", "general"),
                }
                # Deduplicate across feeds via the shared processed_links set.
                if article["link"] not in processed_links:
                    articles.append(article)
                    processed_links.add(article["link"])
            except Exception as e:
                logger.error(f"Error processing article: {e}")
        return articles

    async def get_news_api(self, feed_config: Dict, processed_links: set) -> List[Dict]:
        """Query the NewsAPI /v2/everything endpoint and return recent articles."""
        api_key = self.api_keys.get("newsapi")
        if not api_key:
            logger.error("NewsAPI key is missing!")
            return []
        url = (
            "https://newsapi.org/v2/everything"
            f"?q={feed_config['query']}"
            f"&pageSize={SETTINGS['newsapi_page_size']}"
            f"&apiKey={api_key}"
        )
        data = await self.fetch(url, "newsapi")
        if not data:
            return []
        articles = []
        cutoff_time = datetime.now() - timedelta(hours=24)
        for article in data.get("articles", []):
            try:
                # NewsAPI timestamps are ISO-8601 ending in "Z"; strip it for fromisoformat().
                pub_date = datetime.fromisoformat(article["publishedAt"].rstrip("Z"))
                if pub_date < cutoff_time:
                    continue
                entry = {
                    "title": article["title"],
                    "link": self.normalize_url(article["url"]),
                    "source": feed_config["name"],
                    # description can be null in NewsAPI responses; guard before slicing.
                    "description": (article.get("description") or "")[:500],
                    "published": pub_date,
                    "category": feed_config.get("category", "general"),
                }
                if entry["link"] not in processed_links:
                    articles.append(entry)
                    processed_links.add(entry["link"])
            except Exception as e:
                logger.error(f"Error processing NewsAPI article: {e}")
        return articles

    def normalize_url(self, url: str) -> str:
        """Strip query string, params, and fragment so duplicate links compare equal."""
        parsed = urlparse(url)
        return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
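

# Minimal usage sketch (not part of the original module): drives the fetcher over the
# configured feeds. The "type" key used to tell NewsAPI feeds apart from RSS feeds is an
# assumption about the structure returned by config.load_feeds(); adjust to your config.
async def _demo() -> None:
    processed_links: set = set()
    async with NewsFetcher() as fetcher:
        for feed in fetcher.feeds:
            if feed.get("type") == "newsapi":  # assumed key; RSS is the default path
                articles = await fetcher.get_news_api(feed, processed_links)
            else:
                articles = await fetcher.process_rss(feed, processed_links)
            for article in articles:
                print(f"[{article['source']}] {article['title']} -> {article['link']}")


if __name__ == "__main__":
    asyncio.run(_demo())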