# routers/getnews.py
import re
from typing import List, Dict

import httpx
from bs4 import BeautifulSoup
from fastapi import APIRouter, HTTPException

router = APIRouter()

GRAPHQL_URL = "https://api.graphql.imdb.com"
HEADERS = {"Content-Type": "application/json"}
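
# Single GraphQL query that fetches the latest MOVIE and TV news in one request.
# `plaidHtml` carries the article body as HTML, which clean_html() below reduces to plain text.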
QUERY = """
query GetNews($first: Int!) {
  movieNews: news(first: $first, category: MOVIE) {
    edges {
      node {
        id
        articleTitle { plainText }
        externalUrl
        date
        text { plaidHtml }
        image { url }
      }
    }
  }
  tvNews: news(first: $first, category: TV) {
    edges {
      node {
        id
        articleTitle { plainText }
        externalUrl
        date
        text { plaidHtml }
        image { url }
      }
    }
  }
}
"""
def clean_html(raw_html: str) -> str:
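    """Strip HTML tags and normalize whitespace and punctuation spacing.

    Illustrative example:
        clean_html("<p>Hello ,  world ( test )</p>") -> "Hello, world (test)"
    """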
    # Strip HTML tags, keeping only the text content
    text = BeautifulSoup(raw_html or "", "html.parser").get_text(separator=" ", strip=True)
    # Collapse runs of whitespace (spaces, tabs, line breaks, etc.) into a single space
    text = re.sub(r"\s+", " ", text)
    # Remove spaces before punctuation
    text = re.sub(r"\s+([.,;:!?])", r"\1", text)
    # Remove spaces after opening and before closing parentheses
    text = re.sub(r"\(\s+", "(", text)
    text = re.sub(r"\s+\)", ")", text)
    # Same for square brackets and curly braces
    text = re.sub(r"\[\s+", "[", text)
    text = re.sub(r"\s+\]", "]", text)
    text = re.sub(r"\{\s+", "{", text)
    text = re.sub(r"\s+\}", "}", text)
    return text.strip()


@router.get("/news")
async def get_news(first: int = 15) -> List[Dict]:
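    """Fetch the latest MOVIE and TV news from IMDb's GraphQL API, merged into one list sorted newest first."""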
    payload = {
        "query": QUERY,
        "variables": {"first": first}
    }
    async with httpx.AsyncClient(timeout=10.0) as client:
        response = await client.post(GRAPHQL_URL, headers=HEADERS, json=payload)

    if response.status_code != 200:
        raise HTTPException(status_code=502, detail="Error reaching the IMDb API")

    data = response.json().get("data")
    if not data:
        raise HTTPException(status_code=500, detail="Invalid response from the IMDb API")
    combined = []
    for category_key in ["movieNews", "tvNews"]:
        for edge in data.get(category_key, {}).get("edges", []):
            node = edge.get("node", {})
            # `or {}` guards against fields that come back as explicit nulls
            combined.append({
                "id": node.get("id"),
                "title": (node.get("articleTitle") or {}).get("plainText"),
                "url": node.get("externalUrl"),
                "date": node.get("date"),
                "text": clean_html((node.get("text") or {}).get("plaidHtml")),
                "image": (node.get("image") or {}).get("url"),
                "category": category_key.replace("News", "").upper()
            })
    # Newest first; fall back to an empty string so a missing date cannot break the sort
    combined.sort(key=lambda x: x.get("date") or "", reverse=True)
    return combined
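
# Minimal usage sketch (assumes this module lives in a `routers` package next to a
# top-level FastAPI app; file and variable names below are illustrative):
#
#     from fastapi import FastAPI
#     from routers.getnews import router as news_router
#
#     app = FastAPI()
#     app.include_router(news_router)
#
# GET /news?first=10 then returns up to 20 items (10 per category), newest first.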