# fetch_news.py
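"""Asynchronous news fetcher.

Collects recent articles (last 24 hours) from configured RSS feeds and from
the NewsAPI "everything" endpoint, deduplicating them by normalized URL.
"""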
import asyncio
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Union
from urllib.parse import urlparse, urlunparse

import aiohttp
import feedparser

from config import load_feeds, load_api_keys, SETTINGS

logger = logging.getLogger(__name__)

class NewsFetcher:
    """Fetches news articles from RSS feeds and the NewsAPI endpoint."""

    def __init__(self):
        self.feeds = load_feeds()
        self.api_keys = load_api_keys()
        self.session = None

    async def __aenter__(self):
        # Open a shared HTTP session for the lifetime of the context manager.
        self.session = aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=SETTINGS["request_timeout"]),
            headers={"User-Agent": "NewsBot/1.0"},
        )
        return self

    async def __aexit__(self, *exc):
        await self.session.close()
        self.session = None

    async def fetch(self, url: str, source_type: str = "rss") -> Union[str, Dict, None]:
        """Fetch a URL with retries; returns parsed JSON for NewsAPI, raw text for RSS."""
        retries = 0
        while retries < SETTINGS["max_retries"]:
            try:
                async with self.session.get(url) as response:
                    if response.status == 200:
                        return await response.json() if source_type == "newsapi" else await response.text()
                    logger.warning(f"Error fetching {url}: status {response.status}")
            except Exception as e:
                logger.error(f"Connection error: {str(e)}")
            retries += 1
            await asyncio.sleep(5)
        return None

    async def process_rss(self, feed_config: Dict, processed_links: set) -> List[Dict]:
        """Parse an RSS feed and return articles published within the last 24 hours."""
        content = await self.fetch(feed_config["url"])
        if not content:
            return []
        feed = feedparser.parse(content)
        articles = []
        cutoff_time = datetime.now() - timedelta(hours=24)
        for entry in feed.entries[:SETTINGS["max_articles"]]:
            try:
                pub_date = datetime(*entry.published_parsed[:6]) if hasattr(entry, 'published_parsed') else datetime.now()
                if pub_date < cutoff_time:
                    continue
                article = {
                    "title": entry.title,
                    "link": self.normalize_url(entry.link),
                    "source": feed_config["name"],
                    "description": entry.get("summary", "")[:500],
                    "published": pub_date,
                    "category": feed_config.get("category", "general"),
                }
                if article["link"] not in processed_links:
                    articles.append(article)
                    processed_links.add(article["link"])
            except Exception as e:
                logger.error(f"Error processing article: {str(e)}")
        return articles

    async def get_news_api(self, feed_config: Dict, processed_links: set) -> List[Dict]:
        """Query NewsAPI for the configured search term and return recent articles."""
        api_key = self.api_keys.get("newsapi")
        if not api_key:
            logger.error("NewsAPI key is missing!")
            return []
        url = (
            f"https://newsapi.org/v2/everything?q={feed_config['query']}"
            f"&pageSize={SETTINGS['newsapi_page_size']}&apiKey={api_key}"
        )
        data = await self.fetch(url, "newsapi")
        if not data:
            return []
        articles = []
        cutoff_time = datetime.now() - timedelta(hours=24)
        for article in data.get("articles", []):
            try:
                # NewsAPI returns ISO-8601 timestamps with a trailing "Z".
                pub_date = datetime.fromisoformat(article["publishedAt"].rstrip("Z"))
                if pub_date < cutoff_time:
                    continue
                entry = {
                    "title": article["title"],
                    "link": self.normalize_url(article["url"]),
                    "source": feed_config["name"],
                    # NewsAPI may return null descriptions, so fall back to "".
                    "description": (article.get("description") or "")[:500],
                    "published": pub_date,
                    "category": feed_config.get("category", "general"),
                }
                if entry["link"] not in processed_links:
                    articles.append(entry)
                    processed_links.add(entry["link"])
            except Exception as e:
                logger.error(f"Error processing NewsAPI article: {str(e)}")
        return articles

    def normalize_url(self, url: str) -> str:
        # Strip query string and fragment so duplicate links compare equal.
        parsed = urlparse(url)
        return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
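
# Example usage: a minimal sketch, assuming each entry returned by load_feeds()
# carries a "type" key ("rss" or "newsapi") alongside the "url"/"name"/"query"
# fields referenced above; _demo is a hypothetical helper, not part of the module.
async def _demo():
    processed_links: set = set()
    async with NewsFetcher() as fetcher:
        for feed in fetcher.feeds:
            if feed.get("type") == "newsapi":
                articles = await fetcher.get_news_api(feed, processed_links)
            else:
                articles = await fetcher.process_rss(feed, processed_links)
            for article in articles:
                print(f'{article["published"]:%Y-%m-%d %H:%M} [{article["source"]}] {article["title"]}')


if __name__ == "__main__":
    asyncio.run(_demo())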