# fetch_news.py
import aiohttp
import feedparser
import asyncio
from datetime import datetime, timedelta
from config import load_feeds, load_api_keys, SETTINGS
import logging
from urllib.parse import urlparse, urlunparse, quote_plus
from typing import Dict, List, Union

logger = logging.getLogger(__name__)

class NewsFetcher:
    """Asynchronous fetcher that collects recent articles from RSS feeds and NewsAPI."""

    def __init__(self):
        self.feeds = load_feeds()
        self.api_keys = load_api_keys()
        self.session = None

    async def __aenter__(self):
        self.session = aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=SETTINGS["request_timeout"]),
            headers={"User-Agent": "NewsBot/1.0"}
        )
        return self

    async def __aexit__(self, *exc):
        await self.session.close()
        self.session = None

    async def fetch(self, url: str, source_type: str = "rss") -> Union[str, Dict, None]:
        """Fetch a URL with retries; returns parsed JSON for NewsAPI sources, raw text otherwise."""
        retries = 0
        while retries < SETTINGS["max_retries"]:
            try:
                async with self.session.get(url) as response:
                    if response.status == 200:
                        return await response.json() if source_type == "newsapi" else await response.text()
                    logger.warning(f"Error fetching {url}: status {response.status}")
            except Exception as e:
                logger.error(f"Connection error: {str(e)}")
            # Count failed attempts for both HTTP errors and exceptions so the loop terminates.
            retries += 1
            await asyncio.sleep(5)
        return None

    async def process_rss(self, feed_config: Dict, processed_links: set) -> List[Dict]:
        """Parse an RSS feed and return unseen articles published within the last 24 hours."""
        content = await self.fetch(feed_config["url"])
        if not content:
            return []

        feed = feedparser.parse(content)
        articles = []
        cutoff_time = datetime.now() - timedelta(hours=24)

        for entry in feed.entries[:SETTINGS["max_articles"]]:
            try:
                # feedparser may omit published_parsed or set it to None; fall back to "now".
                published_parsed = entry.get("published_parsed")
                pub_date = datetime(*published_parsed[:6]) if published_parsed else datetime.now()
                if pub_date < cutoff_time:
                    continue

                article = {
                    "title": entry.title,
                    "link": self.normalize_url(entry.link),
                    "source": feed_config["name"],
                    "description": entry.get("summary", "")[:500],
                    "published": pub_date,
                    "category": feed_config.get("category", "general")
                }

                if article["link"] not in processed_links:
                    articles.append(article)
                    processed_links.add(article["link"])
            except Exception as e:
                logger.error(f"Fehler beim Verarbeiten des Artikels: {str(e)}")
        return articles

    async def get_news_api(self, feed_config: Dict, processed_links: set) -> List[Dict]:
        """Query NewsAPI and return unseen articles published within the last 24 hours."""
        api_key = self.api_keys.get("newsapi")
        if not api_key:
            logger.error("NewsAPI key is missing!")
            return []

        url = f"https://newsapi.org/v2/everything?q={feed_config['query']}&pageSize={SETTINGS['newsapi_page_size']}&apiKey={api_key}"
        data = await self.fetch(url, "newsapi")
        if not data:
            return []

        articles = []
        cutoff_time = datetime.now() - timedelta(hours=24)

        for article in data.get("articles", []):
            try:
                pub_date = datetime.fromisoformat(article["publishedAt"].rstrip("Z"))
                if pub_date < cutoff_time:
                    continue

                entry = {
                    "title": article["title"],
                    "link": self.normalize_url(article["url"]),
                    "source": feed_config["name"],
                    "description": article.get("description", "")[:500],
                    "published": pub_date,
                    "category": feed_config.get("category", "general")
                }

                if entry["link"] not in processed_links:
                    articles.append(entry)
                    processed_links.add(entry["link"])
            except Exception as e:
                logger.error(f"Fehler bei NewsAPI-Artikel: {str(e)}")
        return articles

    def normalize_url(self, url: str) -> str:
        """Strip query string and fragment so duplicate links compare equal."""
        parsed = urlparse(url)
        return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
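

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of driving NewsFetcher end to end. It assumes load_feeds()
# returns dicts with a "type" key ("rss" or "newsapi") alongside "url"/"query",
# "name", and "category"; adjust to the actual schema in config.py.
async def collect_all_news() -> List[Dict]:
    processed_links: set = set()
    collected: List[Dict] = []
    async with NewsFetcher() as fetcher:
        for feed in fetcher.feeds:
            if feed.get("type") == "newsapi":
                collected.extend(await fetcher.get_news_api(feed, processed_links))
            else:
                collected.extend(await fetcher.process_rss(feed, processed_links))
    return collected


if __name__ == "__main__":
    articles = asyncio.run(collect_all_news())
    for article in articles:
        print(f"[{article['source']}] {article['title']} -> {article['link']}")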