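"""FastAPI router that fetches the latest movie and TV news from the IMDb GraphQL API,
stores items not yet present in the Supabase `news_extraction` table, and returns the
combined list sorted by date (newest first)."""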
import os
import re
import httpx
from typing import List, Dict
from bs4 import BeautifulSoup
from fastapi import APIRouter, HTTPException

router = APIRouter()

# 🎯 IMDb GraphQL
GRAPHQL_URL = "https://api.graphql.imdb.com"
HEADERS = {"Content-Type": "application/json"}

QUERY = """
query GetNews($first: Int!) {
  movieNews: news(first: $first, category: MOVIE) {
    edges {
      node {
        id
        articleTitle { plainText }
        externalUrl
        date
        text { plaidHtml }
        image { url }
      }
    }
  }
  tvNews: news(first: $first, category: TV) {
    edges {
      node {
        id
        articleTitle { plainText }
        externalUrl
        date
        text { plaidHtml }
        image { url }
      }
    }
  }
}
"""

# 🔧 Supabase Config
SUPABASE_URL = "https://iiwbixdrrhejkthxygak.supabase.co"
SUPABASE_KEY = os.getenv("SUPA_KEY")
SUPABASE_ROLE_KEY = os.getenv("SUPA_SERVICE_KEY")

if not SUPABASE_KEY or not SUPABASE_ROLE_KEY:
    raise ValueError("❌ SUPA_KEY or SUPA_SERVICE_KEY not set in environment!")

SUPABASE_HEADERS = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}",
    "Content-Type": "application/json"
}

SUPABASE_ROLE_HEADERS = {
    "apikey": SUPABASE_ROLE_KEY,
    "Authorization": f"Bearer {SUPABASE_ROLE_KEY}",
    "Content-Type": "application/json"
}
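# Note: the anon key (SUPABASE_HEADERS) is used for the read query below, while the
# service-role key (SUPABASE_ROLE_HEADERS) is used for the bulk insert; the service
# role bypasses Supabase Row Level Security.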

# 🧼 HTML Cleanup
def clean_html(raw_html: str) -> str:
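    """Strip HTML tags from `raw_html` and normalize whitespace and punctuation spacing."""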
    text = BeautifulSoup(raw_html or "", "html.parser").get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+([.,;:!?])", r"\1", text)
    text = re.sub(r"\(\s+", "(", text)
    text = re.sub(r"\s+\)", ")", text)
    text = re.sub(r"\[\s+", "[", text)
    text = re.sub(r"\s+\]", "]", text)
    text = re.sub(r"\{\s+", "{", text)
    text = re.sub(r"\s+\}", "}", text)
    return text.strip()

# 🚀 Main endpoint
@router.get("/news")
async def get_news(first: int = 20) -> List[Dict]:
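    """Fetch up to `first` movie and `first` TV news items from IMDb, persist the ones
    not yet in Supabase, and return the full fetched list sorted by date."""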
    payload = {
        "query": QUERY,
        "variables": {"first": first}
    }

    async with httpx.AsyncClient(timeout=10.0) as client:
        # Fetch news from IMDb
        response = await client.post(GRAPHQL_URL, headers=HEADERS, json=payload)

        if response.status_code != 200:
            raise HTTPException(status_code=502, detail="Error accessing the IMDb API")

        data = response.json().get("data")
        if not data:
            raise HTTPException(status_code=500, detail="Invalid response from the IMDb API")

        combined = []

        for category_key in ["movieNews", "tvNews"]:
            for edge in data.get(category_key, {}).get("edges", []):
                node = edge.get("node", {})
                combined.append({
                    "news_id": node.get("id"),
                    # Nested fields may come back as null in the GraphQL payload, so guard with `or {}`
                    "title": (node.get("articleTitle") or {}).get("plainText"),
                    "url": node.get("externalUrl"),
                    "date": node.get("date"),
                    "text": clean_html((node.get("text") or {}).get("plaidHtml")),
                    "image": (node.get("image") or {}).get("url"),
                    "category": category_key.replace("News", "").upper()  # MOVIE or TV
                })

        # 📌 Check which IDs already exist in Supabase
        all_ids = [item["news_id"] for item in combined]

        existing_ids = []
        ids_chunks = [all_ids[i:i + 1000] for i in range(0, len(all_ids), 1000)]  # avoid overly long URLs

        for chunk in ids_chunks:
            query_ids = ",".join([f"\"{nid}\"" for nid in chunk])
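            # PostgREST "in" filter: ?news_id=in.("id1","id2",...) matches any of the listed IDs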
            url = f"{SUPABASE_URL}/rest/v1/news_extraction?select=news_id&news_id=in.({query_ids})"
            r = await client.get(url, headers=SUPABASE_HEADERS)
            if r.status_code == 200:
                existing_ids.extend([item["news_id"] for item in r.json()])

        # 🔎 Keep only the news items not yet stored
        new_entries = [item for item in combined if item["news_id"] not in existing_ids]

        # 🧾 Insert the new news items (in bulk)
        if new_entries:
            insert_url = f"{SUPABASE_URL}/rest/v1/news_extraction"
            await client.post(insert_url, headers=SUPABASE_ROLE_HEADERS, json=new_entries)

        # 🔃 Sort by date (newest first); items with no date sort last
        combined.sort(key=lambda x: x.get("date") or "", reverse=True)
        return combined
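
# Example wiring (sketch; the module path `app.routers.news` is an assumption):
#
#   from fastapi import FastAPI
#   from app.routers.news import router as news_router
#
#   app = FastAPI()
#   app.include_router(news_router)
#
# The endpoint is then available at GET /news?first=20.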