Update routers/getnews.py

routers/getnews.py  (CHANGED, +74 -37)
@@ -1,11 +1,13 @@
+import os
+import re
 import httpx
 from typing import List, Dict
 from bs4 import BeautifulSoup
-import
+from fastapi import APIRouter, HTTPException
 
 router = APIRouter()
 
+# 🎯 IMDb GraphQL
 GRAPHQL_URL = "https://api.graphql.imdb.com"
 HEADERS = {"Content-Type": "application/json"}
 
@@ -38,59 +40,94 @@ query GetNews($first: Int!) {
 }
 """
 
+# 🔧 Supabase Config
+SUPABASE_URL = "https://ussxqnifefkgkaumjann.supabase.co"
+SUPABASE_KEY = os.getenv("SUPA_KEY")
+SUPABASE_ROLE_KEY = os.getenv("SUPA_SERVICE_KEY")
+
+if not SUPABASE_KEY or not SUPABASE_ROLE_KEY:
+    raise ValueError("❌ SUPA_KEY or SUPA_SERVICE_KEY not set in environment!")
+
+SUPABASE_HEADERS = {
+    "apikey": SUPABASE_KEY,
+    "Authorization": f"Bearer {SUPABASE_KEY}",
+    "Content-Type": "application/json"
+}
+
+SUPABASE_ROLE_HEADERS = {
+    "apikey": SUPABASE_ROLE_KEY,
+    "Authorization": f"Bearer {SUPABASE_ROLE_KEY}",
+    "Content-Type": "application/json"
+}
+
+# 🧼 HTML Cleanup
 def clean_html(raw_html: str) -> str:
-    # Remove HTML tags
     text = BeautifulSoup(raw_html or "", "html.parser").get_text(separator=" ", strip=True)
-    # Remove repeated spaces, tabs, line breaks, etc.
     text = re.sub(r"\s+", " ", text)
-    # Remove spaces before punctuation
     text = re.sub(r"\s+([.,;:!?])", r"\1", text)
-    # Remove spaces after opening and before closing parentheses
     text = re.sub(r"\(\s+", "(", text)
     text = re.sub(r"\s+\)", ")", text)
-    # Remove unnecessary spaces inside brackets or braces, extend if needed
     text = re.sub(r"\[\s+", "[", text)
     text = re.sub(r"\s+\]", "]", text)
     text = re.sub(r"\{\s+", "{", text)
     text = re.sub(r"\s+\}", "}", text)
     return text.strip()
 
+# 🚀 Main endpoint
 @router.get("/news")
-async def get_news(first: int =
+async def get_news(first: int = 10) -> List[Dict]:
     payload = {
         "query": QUERY,
         "variables": {"first": first}
     }
 
     async with httpx.AsyncClient(timeout=10.0) as client:
+        # Fetch the news from IMDb
         response = await client.post(GRAPHQL_URL, headers=HEADERS, json=payload)
 
+        if response.status_code != 200:
+            raise HTTPException(status_code=502, detail="Erro ao acessar a API do IMDb")
+
+        data = response.json().get("data")
+        if not data:
+            raise HTTPException(status_code=500, detail="Resposta inválida da API")
+
+        combined = []
+
+        for category_key in ["movieNews", "tvNews"]:
+            for edge in data.get(category_key, {}).get("edges", []):
+                node = edge.get("node", {})
+                combined.append({
+                    "news_id": node.get("id"),
+                    "title": node.get("articleTitle", {}).get("plainText"),
+                    "url": node.get("externalUrl"),
+                    "date": node.get("date"),
+                    "text": clean_html(node.get("text", {}).get("plaidHtml")),
+                    "image": node.get("image", {}).get("url"),
+                    "category": category_key.replace("News", "").upper()
+                })
+
+        # 📌 Check which IDs already exist in Supabase
+        all_ids = [item["news_id"] for item in combined]
+
+        existing_ids = []
+        ids_chunks = [all_ids[i:i + 1000] for i in range(0, len(all_ids), 1000)]  # avoid an overly long URL
+
+        for chunk in ids_chunks:
+            query_ids = ",".join([f"\"{nid}\"" for nid in chunk])
+            url = f"{SUPABASE_URL}/rest/v1/news_extraction?select=news_id&news_id=in.({query_ids})"
+            r = await client.get(url, headers=SUPABASE_HEADERS)
+            if r.status_code == 200:
+                existing_ids.extend([item["news_id"] for item in r.json()])
+
+        # 🔎 Keep only the news items that are new
+        new_entries = [item for item in combined if item["news_id"] not in existing_ids]
+
+        # 🧾 Insert the new news items (in bulk)
+        if new_entries:
+            insert_url = f"{SUPABASE_URL}/rest/v1/news_extraction"
+            await client.post(insert_url, headers=SUPABASE_ROLE_HEADERS, json=new_entries)
+
+        # 🔃 Sort by date
+        combined.sort(key=lambda x: x.get("date"), reverse=True)
+        return combined
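
The QUERY constant itself sits in the collapsed region between the two hunks, so its text is not part of this diff. Judging only from the fields the handler reads, the GraphQL response it expects would be shaped roughly like the hypothetical example below (all values are invented for illustration).

# Hypothetical response shape implied by the parsing loop above; the actual
# QUERY text is not shown in this diff, and these values are made up.
example_data = {
    "movieNews": {
        "edges": [
            {
                "node": {
                    "id": "ni1234567",                                   # -> news_id
                    "articleTitle": {"plainText": "Example headline"},   # -> title
                    "externalUrl": "https://example.com/article",        # -> url
                    "date": "2024-01-01T00:00:00Z",                      # -> date
                    "text": {"plaidHtml": "<p>Example  body</p>"},       # -> text (cleaned)
                    "image": {"url": "https://example.com/image.jpg"},   # -> image
                }
            }
        ]
    },
    "tvNews": {"edges": []},
}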
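As a quick illustration of what clean_html does (the input string here is an assumed example, not taken from the IMDb feed): BeautifulSoup strips the tags, then the regex passes collapse whitespace and tighten spacing around punctuation and brackets.

# Assumes clean_html from routers/getnews.py (above) is in scope.
raw = "<p>The  film  premiered ( worldwide ) on <b>Friday</b> .</p>"
print(clean_html(raw))
# -> "The film premiered (worldwide) on Friday."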
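The dedupe step batches IDs into chunks of 1000 and asks the Supabase REST layer which of them already exist via an in.() filter; with hypothetical IDs, the URL built for one chunk would look like this.

# Hypothetical IDs; the table (news_extraction) and column (news_id) come from the code above.
SUPABASE_URL = "https://ussxqnifefkgkaumjann.supabase.co"
chunk = ["ni1000001", "ni1000002"]
query_ids = ",".join([f'"{nid}"' for nid in chunk])
url = f"{SUPABASE_URL}/rest/v1/news_extraction?select=news_id&news_id=in.({query_ids})"
print(url)
# -> https://ussxqnifefkgkaumjann.supabase.co/rest/v1/news_extraction?select=news_id&news_id=in.("ni1000001","ni1000002")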
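For completeness, a minimal sketch of how this router could be wired into an app; the main.py module name is an assumption, not part of this commit, and SUPA_KEY / SUPA_SERVICE_KEY must be set before import because of the module-level check.

# main.py — assumed wiring, not part of this commit.
from fastapi import FastAPI
from routers import getnews

app = FastAPI()
app.include_router(getnews.router)

# Run with:  uvicorn main:app --reload
# Then:      GET /news?first=10  returns the merged movie + TV news list.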