import os
import re
import httpx
from typing import List, Dict
from bs4 import BeautifulSoup
from fastapi import APIRouter, HTTPException
router = APIRouter()
# 🎯 IMDb GraphQL
GRAPHQL_URL = "https://api.graphql.imdb.com"
HEADERS = {"Content-Type": "application/json"}
QUERY = """
query GetNews($first: Int!) {
  movieNews: news(first: $first, category: MOVIE) {
    edges {
      node {
        id
        articleTitle { plainText }
        externalUrl
        date
        text { plaidHtml }
        image { url }
      }
    }
  }
  tvNews: news(first: $first, category: TV) {
    edges {
      node {
        id
        articleTitle { plainText }
        externalUrl
        date
        text { plaidHtml }
        image { url }
      }
    }
  }
}
"""
# 🔧 Supabase Config
SUPABASE_URL = "https://iiwbixdrrhejkthxygak.supabase.co"
SUPABASE_KEY = os.getenv("SUPA_KEY")
SUPABASE_ROLE_KEY = os.getenv("SUPA_SERVICE_KEY")
if not SUPABASE_KEY or not SUPABASE_ROLE_KEY:
    raise ValueError("❌ SUPA_KEY or SUPA_SERVICE_KEY not set in environment!")
SUPABASE_HEADERS = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}",
    "Content-Type": "application/json"
}
SUPABASE_ROLE_HEADERS = {
    "apikey": SUPABASE_ROLE_KEY,
    "Authorization": f"Bearer {SUPABASE_ROLE_KEY}",
    "Content-Type": "application/json"
}
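# The anon-key headers are used for the read-only duplicate check below; the
# service-role headers are used for the insert, since the service-role key
# bypasses row-level security on the news_extraction table.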
# 🧼 HTML Cleanup
def clean_html(raw_html: str) -> str:
    text = BeautifulSoup(raw_html or "", "html.parser").get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+([.,;:!?])", r"\1", text)
    text = re.sub(r"\(\s+", "(", text)
    text = re.sub(r"\s+\)", ")", text)
    text = re.sub(r"\[\s+", "[", text)
    text = re.sub(r"\s+\]", "]", text)
    text = re.sub(r"\{\s+", "{", text)
    text = re.sub(r"\s+\}", "}", text)
    return text.strip()
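# Illustrative example (not in the original code): the cleanup above turns
#   clean_html("<p>Great  movie ,  really !</p>")
# into "Great movie, really!" — tags removed, whitespace collapsed, and the
# stray space before punctuation dropped.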
# 🚀 Main endpoint
@router.get("/news")
async def get_news(first: int = 20) -> List[Dict]:
    payload = {
        "query": QUERY,
        "variables": {"first": first}
    }
    async with httpx.AsyncClient(timeout=10.0) as client:
        # Fetch news from IMDb
        response = await client.post(GRAPHQL_URL, headers=HEADERS, json=payload)
        if response.status_code != 200:
            raise HTTPException(status_code=502, detail="Error reaching the IMDb API")
        data = response.json().get("data")
        if not data:
            raise HTTPException(status_code=500, detail="Invalid response from the IMDb API")
        combined = []
        for category_key in ["movieNews", "tvNews"]:
            for edge in data.get(category_key, {}).get("edges", []):
                node = edge.get("node", {})
                # "or {}" guards against explicit nulls in the GraphQL payload
                combined.append({
                    "news_id": node.get("id"),
                    "title": (node.get("articleTitle") or {}).get("plainText"),
                    "url": node.get("externalUrl"),
                    "date": node.get("date"),
                    "text": clean_html((node.get("text") or {}).get("plaidHtml")),
                    "image": (node.get("image") or {}).get("url"),
                    "category": category_key.replace("News", "").upper()
                })
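        # Each combined item carries: news_id, title, url, date, text (cleaned),
        # image, and category ("MOVIE" or "TV").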
        # 📌 Check which IDs already exist in Supabase
        all_ids = [item["news_id"] for item in combined]
        existing_ids = []
        ids_chunks = [all_ids[i:i + 1000] for i in range(0, len(all_ids), 1000)]  # keeps the request URL from getting too long
        for chunk in ids_chunks:
            query_ids = ",".join([f"\"{nid}\"" for nid in chunk])
            url = f"{SUPABASE_URL}/rest/v1/news_extraction?select=news_id&news_id=in.({query_ids})"
            r = await client.get(url, headers=SUPABASE_HEADERS)
            if r.status_code == 200:
                existing_ids.extend([item["news_id"] for item in r.json()])
        # 🔎 Keep only the news items that are not stored yet
        new_entries = [item for item in combined if item["news_id"] not in existing_ids]
        # 🧾 Insert the new items (batched)
        if new_entries:
            insert_url = f"{SUPABASE_URL}/rest/v1/news_extraction"
            await client.post(insert_url, headers=SUPABASE_ROLE_HEADERS, json=new_entries)
        # 🔃 Sort by date, newest first; items without a date sort last
        combined.sort(key=lambda x: x.get("date") or "", reverse=True)
        return combined
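# A minimal wiring sketch (an assumption, not part of this module): if this file
# lives at, say, routes/news.py, the router can be mounted like this:
#
#   from fastapi import FastAPI
#   from routes.news import router as news_router
#
#   app = FastAPI()
#   app.include_router(news_router)
#
# GET /news?first=20 would then return the combined MOVIE and TV items.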