habulaj committed on
Commit ef2ea4e · verified · 1 Parent(s): 8fce3cf

Update routers/getnews.py

Files changed (1)
  1. routers/getnews.py +74 -37
routers/getnews.py CHANGED
@@ -1,11 +1,13 @@
- from fastapi import APIRouter, HTTPException
+ import os
+ import re
  import httpx
  from typing import List, Dict
  from bs4 import BeautifulSoup
- import re
+ from fastapi import APIRouter, HTTPException

  router = APIRouter()

+ # 🎯 IMDb GraphQL
  GRAPHQL_URL = "https://api.graphql.imdb.com"
  HEADERS = {"Content-Type": "application/json"}

@@ -38,59 +40,94 @@ query GetNews($first: Int!) {
  }
  """

+ # 🔧 Supabase Config
+ SUPABASE_URL = "https://ussxqnifefkgkaumjann.supabase.co"
+ SUPABASE_KEY = os.getenv("SUPA_KEY")
+ SUPABASE_ROLE_KEY = os.getenv("SUPA_SERVICE_KEY")
+
+ if not SUPABASE_KEY or not SUPABASE_ROLE_KEY:
+     raise ValueError("❌ SUPA_KEY or SUPA_SERVICE_KEY not set in environment!")
+
+ SUPABASE_HEADERS = {
+     "apikey": SUPABASE_KEY,
+     "Authorization": f"Bearer {SUPABASE_KEY}",
+     "Content-Type": "application/json"
+ }
+
+ SUPABASE_ROLE_HEADERS = {
+     "apikey": SUPABASE_ROLE_KEY,
+     "Authorization": f"Bearer {SUPABASE_ROLE_KEY}",
+     "Content-Type": "application/json"
+ }
+
+ # 🧼 HTML Cleanup
  def clean_html(raw_html: str) -> str:
-     # Strip HTML tags
      text = BeautifulSoup(raw_html or "", "html.parser").get_text(separator=" ", strip=True)
-
-     # Collapse repeated spaces, tabs, line breaks, etc.
      text = re.sub(r"\s+", " ", text)
-
-     # Remove spaces before punctuation
      text = re.sub(r"\s+([.,;:!?])", r"\1", text)
-
-     # Remove spaces after opening and before closing parentheses
      text = re.sub(r"\(\s+", "(", text)
      text = re.sub(r"\s+\)", ")", text)
-
-     # Remove unneeded spaces inside brackets or braces (extend if desired)
      text = re.sub(r"\[\s+", "[", text)
      text = re.sub(r"\s+\]", "]", text)
      text = re.sub(r"\{\s+", "{", text)
      text = re.sub(r"\s+\}", "}", text)
-
      return text.strip()

+ # 🚀 Main endpoint
  @router.get("/news")
- async def get_news(first: int = 5) -> List[Dict]:
+ async def get_news(first: int = 10) -> List[Dict]:
      payload = {
          "query": QUERY,
          "variables": {"first": first}
      }

      async with httpx.AsyncClient(timeout=10.0) as client:
+         # Fetch the news from IMDb
          response = await client.post(GRAPHQL_URL, headers=HEADERS, json=payload)

-     if response.status_code != 200:
-         raise HTTPException(status_code=502, detail="Error accessing the IMDb API")
-
-     data = response.json().get("data")
-     if not data:
-         raise HTTPException(status_code=500, detail="Invalid API response")
-
-     combined = []
-
-     for category_key in ["movieNews", "tvNews"]:
-         for edge in data.get(category_key, {}).get("edges", []):
-             node = edge.get("node", {})
-             combined.append({
-                 "id": node.get("id"),
-                 "title": node.get("articleTitle", {}).get("plainText"),
-                 "url": node.get("externalUrl"),
-                 "date": node.get("date"),
-                 "text": clean_html(node.get("text", {}).get("plaidHtml")),
-                 "image": node.get("image", {}).get("url"),
-                 "category": category_key.replace("News", "").upper()
-             })
-
-     combined.sort(key=lambda x: x.get("date"), reverse=True)
-     return combined
+         if response.status_code != 200:
+             raise HTTPException(status_code=502, detail="Error accessing the IMDb API")
+
+         data = response.json().get("data")
+         if not data:
+             raise HTTPException(status_code=500, detail="Invalid API response")
+
+         combined = []
+
+         for category_key in ["movieNews", "tvNews"]:
+             for edge in data.get(category_key, {}).get("edges", []):
+                 node = edge.get("node", {})
+                 combined.append({
+                     "news_id": node.get("id"),
+                     "title": node.get("articleTitle", {}).get("plainText"),
+                     "url": node.get("externalUrl"),
+                     "date": node.get("date"),
+                     "text": clean_html(node.get("text", {}).get("plaidHtml")),
+                     "image": node.get("image", {}).get("url"),
+                     "category": category_key.replace("News", "").upper()
+                 })
+
+         # 📌 Check which IDs already exist in Supabase
+         all_ids = [item["news_id"] for item in combined]
+
+         existing_ids = []
+         ids_chunks = [all_ids[i:i + 1000] for i in range(0, len(all_ids), 1000)]  # keep the request URL from growing too large
+
+         for chunk in ids_chunks:
+             query_ids = ",".join([f"\"{nid}\"" for nid in chunk])
+             url = f"{SUPABASE_URL}/rest/v1/news_extraction?select=news_id&news_id=in.({query_ids})"
+             r = await client.get(url, headers=SUPABASE_HEADERS)
+             if r.status_code == 200:
+                 existing_ids.extend([item["news_id"] for item in r.json()])
+
+         # 🔎 Keep only the news items not stored yet
+         new_entries = [item for item in combined if item["news_id"] not in existing_ids]
+
+         # 🧾 Insert the new items (in one batch)
+         if new_entries:
+             insert_url = f"{SUPABASE_URL}/rest/v1/news_extraction"
+             await client.post(insert_url, headers=SUPABASE_ROLE_HEADERS, json=new_entries)
+
+         # 🔃 Sort by date
+         combined.sort(key=lambda x: x.get("date"), reverse=True)
+         return combined
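
For reference, a minimal sketch of how the updated endpoint could be exercised locally. This snippet is not part of the commit: the app setup and TestClient call are assumptions, SUPA_KEY and SUPA_SERVICE_KEY must be set in the environment, and running it will hit the live IMDb GraphQL API and the Supabase project configured above.

```python
# Hypothetical local check (not in this commit): mount the router on a
# FastAPI app and call GET /news once.
from fastapi import FastAPI
from fastapi.testclient import TestClient

from routers.getnews import router

app = FastAPI()
app.include_router(router)

client = TestClient(app)

# Fetch the 3 most recent movie/TV news items.
resp = client.get("/news", params={"first": 3})
resp.raise_for_status()
for item in resp.json():
    print(item["date"], item["category"], item["title"])
```

Because the handler looks up existing news_id values in news_extraction before inserting, repeated calls should only add rows for articles that have not been stored yet.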