Update routers/getnews.py

routers/getnews.py  (CHANGED, +74 -37)
@@ -1,11 +1,13 @@
+import os
+import re
 import httpx
 from typing import List, Dict
 from bs4 import BeautifulSoup
-import
+from fastapi import APIRouter, HTTPException
 
 router = APIRouter()
 
+# 🎯 IMDb GraphQL
 GRAPHQL_URL = "https://api.graphql.imdb.com"
 HEADERS = {"Content-Type": "application/json"}
 
@@ -38,59 +40,94 @@ query GetNews($first: Int!) {
 }
 """
 
+# 🔧 Supabase Config
+SUPABASE_URL = "https://ussxqnifefkgkaumjann.supabase.co"
+SUPABASE_KEY = os.getenv("SUPA_KEY")
+SUPABASE_ROLE_KEY = os.getenv("SUPA_SERVICE_KEY")
+
+if not SUPABASE_KEY or not SUPABASE_ROLE_KEY:
+    raise ValueError("❌ SUPA_KEY or SUPA_SERVICE_KEY not set in environment!")
+
+SUPABASE_HEADERS = {
+    "apikey": SUPABASE_KEY,
+    "Authorization": f"Bearer {SUPABASE_KEY}",
+    "Content-Type": "application/json"
+}
+
+SUPABASE_ROLE_HEADERS = {
+    "apikey": SUPABASE_ROLE_KEY,
+    "Authorization": f"Bearer {SUPABASE_ROLE_KEY}",
+    "Content-Type": "application/json"
+}
+
+# 🧼 HTML Cleanup
 def clean_html(raw_html: str) -> str:
-    # Remove HTML tags
     text = BeautifulSoup(raw_html or "", "html.parser").get_text(separator=" ", strip=True)
-    # Remove repeated spaces, tabs, line breaks, etc.
     text = re.sub(r"\s+", " ", text)
-    # Remove spaces before punctuation
     text = re.sub(r"\s+([.,;:!?])", r"\1", text)
-    # Remove spaces after opening and before closing parentheses
     text = re.sub(r"\(\s+", "(", text)
     text = re.sub(r"\s+\)", ")", text)
-    # Remove unnecessary spaces inside brackets or braces, extend if needed
     text = re.sub(r"\[\s+", "[", text)
     text = re.sub(r"\s+\]", "]", text)
     text = re.sub(r"\{\s+", "{", text)
     text = re.sub(r"\s+\}", "}", text)
     return text.strip()
 
+# 🚀 Main endpoint
 @router.get("/news")
-async def get_news(first: int =
+async def get_news(first: int = 10) -> List[Dict]:
     payload = {
         "query": QUERY,
         "variables": {"first": first}
     }
 
     async with httpx.AsyncClient(timeout=10.0) as client:
+        # Fetch the news from IMDb
         response = await client.post(GRAPHQL_URL, headers=HEADERS, json=payload)
 
+        if response.status_code != 200:
+            raise HTTPException(status_code=502, detail="Erro ao acessar a API do IMDb")
+
+        data = response.json().get("data")
+        if not data:
+            raise HTTPException(status_code=500, detail="Resposta inválida da API")
+
+        combined = []
+
+        for category_key in ["movieNews", "tvNews"]:
+            for edge in data.get(category_key, {}).get("edges", []):
+                node = edge.get("node", {})
+                combined.append({
+                    "news_id": node.get("id"),
+                    "title": node.get("articleTitle", {}).get("plainText"),
+                    "url": node.get("externalUrl"),
+                    "date": node.get("date"),
+                    "text": clean_html(node.get("text", {}).get("plaidHtml")),
+                    "image": node.get("image", {}).get("url"),
+                    "category": category_key.replace("News", "").upper()
+                })
+
+        # 📌 Check which IDs already exist in Supabase
+        all_ids = [item["news_id"] for item in combined]
+
+        existing_ids = []
+        ids_chunks = [all_ids[i:i + 1000] for i in range(0, len(all_ids), 1000)]  # avoid an overly long URL
+
+        for chunk in ids_chunks:
+            query_ids = ",".join([f"\"{nid}\"" for nid in chunk])
+            url = f"{SUPABASE_URL}/rest/v1/news_extraction?select=news_id&news_id=in.({query_ids})"
+            r = await client.get(url, headers=SUPABASE_HEADERS)
+            if r.status_code == 200:
+                existing_ids.extend([item["news_id"] for item in r.json()])
+
+        # 🔎 Keep only the news items that are new
+        new_entries = [item for item in combined if item["news_id"] not in existing_ids]
+
+        # 🧾 Insert the new news items (in bulk)
+        if new_entries:
+            insert_url = f"{SUPABASE_URL}/rest/v1/news_extraction"
+            await client.post(insert_url, headers=SUPABASE_ROLE_HEADERS, json=new_entries)
+
+        # 🔃 Sort by date
+        combined.sort(key=lambda x: x.get("date"), reverse=True)
+        return combined
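
The QUERY constant itself sits in the collapsed region between the two hunks, so its text is not part of this diff. Judging only from the fields the handler reads, the GraphQL response it expects would be shaped roughly like the hypothetical example below (all values are invented for illustration).

# Hypothetical response shape implied by the parsing loop above; the actual
# QUERY text is not shown in this diff, and these values are made up.
example_data = {
    "movieNews": {
        "edges": [
            {
                "node": {
                    "id": "ni1234567",                                   # -> news_id
                    "articleTitle": {"plainText": "Example headline"},   # -> title
                    "externalUrl": "https://example.com/article",        # -> url
                    "date": "2024-01-01T00:00:00Z",                      # -> date
                    "text": {"plaidHtml": "<p>Example  body</p>"},       # -> text (cleaned)
                    "image": {"url": "https://example.com/image.jpg"},   # -> image
                }
            }
        ]
    },
    "tvNews": {"edges": []},
}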
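As a quick illustration of what clean_html does (the input string here is an assumed example, not taken from the IMDb feed): BeautifulSoup strips the tags, then the regex passes collapse whitespace and tighten spacing around punctuation and brackets.

# Assumes clean_html from routers/getnews.py (above) is in scope.
raw = "<p>The  film  premiered ( worldwide ) on <b>Friday</b> .</p>"
print(clean_html(raw))
# -> "The film premiered (worldwide) on Friday."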
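The dedupe step batches IDs into chunks of 1000 and asks the Supabase REST layer which of them already exist via an in.() filter; with hypothetical IDs, the URL built for one chunk would look like this.

# Hypothetical IDs; the table (news_extraction) and column (news_id) come from the code above.
SUPABASE_URL = "https://ussxqnifefkgkaumjann.supabase.co"
chunk = ["ni1000001", "ni1000002"]
query_ids = ",".join([f'"{nid}"' for nid in chunk])
url = f"{SUPABASE_URL}/rest/v1/news_extraction?select=news_id&news_id=in.({query_ids})"
print(url)
# -> https://ussxqnifefkgkaumjann.supabase.co/rest/v1/news_extraction?select=news_id&news_id=in.("ni1000001","ni1000002")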
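For completeness, a minimal sketch of how this router could be wired into an app; the main.py module name is an assumption, not part of this commit, and SUPA_KEY / SUPA_SERVICE_KEY must be set before import because of the module-level check.

# main.py — assumed wiring, not part of this commit.
from fastapi import FastAPI
from routers import getnews

app = FastAPI()
app.include_router(getnews.router)

# Run with:  uvicorn main:app --reload
# Then:      GET /news?first=10  returns the merged movie + TV news list.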