Merlintxu committed (verified)
Commit 5982a2e · 1 Parent(s): a3047a6

Update seo_analyzer.py

Files changed (1)
  1. seo_analyzer.py +211 -285
seo_analyzer.py CHANGED
@@ -1,13 +1,12 @@
 import os
-import logging
 import re
 import requests
-import hashlib
 import PyPDF2
 import numpy as np
 import pandas as pd
 from io import BytesIO
-from typing import List, Dict, Any, Tuple
 from urllib.parse import urlparse, urljoin
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
@@ -21,347 +20,274 @@ from sentence_transformers import SentenceTransformer, util
 import torch
 import spacy
 import matplotlib.pyplot as plt
-
 from utils import sanitize_filename

-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)


 class SEOSpaceAnalyzer:
-    def __init__(self, max_urls: int = 20, max_workers: int = 4) -> None:
         self.max_urls = max_urls
         self.max_workers = max_workers
         self.session = self._configure_session()
         self.models = self._load_models()
         self.base_dir = Path("content_storage")
         self.base_dir.mkdir(parents=True, exist_ok=True)
-        self.current_analysis: Dict[str, Any] = {}
-
-    def _load_models(self) -> Dict[str, Any]:
-        try:
-            device = 0 if torch.cuda.is_available() else -1
-            logger.info("Cargando modelos NLP...")
-            models = {
-                'summarizer': pipeline("summarization", model="facebook/bart-large-cnn", device=device),
-                'ner': pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple", device=device),
-                'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
-                'spacy': spacy.load("es_core_news_lg")
-            }
-            logger.info("Modelos cargados correctamente.")
-            return models
-        except Exception as e:
-            logger.error(f"Error cargando modelos: {e}")
-            raise

-    def _configure_session(self) -> requests.Session:
         session = requests.Session()
-        retry = Retry(
-            total=3,
-            backoff_factor=1,
-            status_forcelist=[500, 502, 503, 504],
-            allowed_methods=['GET', 'HEAD']
-        )
-        adapter = HTTPAdapter(max_retries=retry)
-        session.mount('http://', adapter)
-        session.mount('https://', adapter)
         session.headers.update({
-            'User-Agent': 'Mozilla/5.0 (compatible; SEOBot/1.0)',
-            'Accept-Language': 'es-ES,es;q=0.9'
         })
         return session

-    def analyze_sitemap(self, sitemap_url: str) -> Tuple[Dict, List[str], Dict, Dict, List[Dict], Dict, Dict]:
-        try:
-            urls = self._parse_sitemap(sitemap_url)
-            if not urls:
-                return {"error": "No se pudieron extraer URLs del sitemap"}, [], {}, {}, [], {}, {}

-            results: List[Dict] = []
-            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-                futures = {executor.submit(self._process_url, url): url for url in urls[:self.max_urls]}
-                for future in as_completed(futures):
-                    url = futures[future]
-                    try:
-                        res = future.result()
-                        results.append(res)
-                        logger.info(f"Procesado: {url}")
-                    except Exception as e:
-                        logger.error(f"Error procesando {url}: {e}")
-                        results.append({'url': url, 'status': 'error', 'error': str(e)})

-            summaries, entities = self._apply_nlp(results)
-            similarities = self._compute_semantic_similarity(results)

-            self.current_analysis = {
-                'stats': self._calculate_stats(results),
-                'content_analysis': self._analyze_content(results),
-                'links': self._analyze_links(results),
-                'recommendations': self._generate_seo_recommendations(results),
-                'details': results,
-                'summaries': summaries,
-                'entities': entities,
-                'similarities': similarities,
-                'timestamp': datetime.now().isoformat()
-            }
-            a = self.current_analysis
-            return a['stats'], a['recommendations'], a['content_analysis'], a['links'], a['details'], a['summaries'], a['similarities']
-        except Exception as e:
-            logger.error(f"Error en análisis: {e}")
-            return {"error": str(e)}, [], {}, {}, [], {}, {}

     def _process_url(self, url: str) -> Dict:
         try:
-            response = self.session.get(url, timeout=15)
-            response.raise_for_status()
-            content_type = response.headers.get('Content-Type', '')
-            result: Dict[str, Any] = {'url': url, 'status': 'success'}
-            if 'application/pdf' in content_type:
-                result.update(self._process_pdf(response.content))
-            elif 'text/html' in content_type:
-                result.update(self._process_html(response.text, url))
-            else:
-                result.update({'type': 'unknown', 'content': '', 'word_count': 0})
-            self._save_content(url, response.content)
-            return result
-        except requests.exceptions.Timeout as e:
-            return {'url': url, 'status': 'error', 'error': "Timeout"}
-        except requests.exceptions.HTTPError as e:
-            return {'url': url, 'status': 'error', 'error': "HTTP Error"}
         except Exception as e:
-            return {'url': url, 'status': 'error', 'error': str(e)}

-    def _process_html(self, html: str, base_url: str) -> Dict:
-        soup = BeautifulSoup(html, 'html.parser')
-        clean_text = self._clean_text(soup.get_text())
         return {
-            'type': 'html',
-            'content': clean_text,
-            'word_count': len(clean_text.split()),
-            'metadata': self._extract_metadata(soup),
-            'links': self._extract_links(soup, base_url)
         }

-    def _process_pdf(self, content: bytes) -> Dict:
         try:
-            text = ""
-            with BytesIO(content) as pdf_file:
-                reader = PyPDF2.PdfReader(pdf_file)
-                for page in reader.pages:
-                    extracted = page.extract_text()
-                    text += extracted if extracted else ""
-            clean_text = self._clean_text(text)
             return {
-                'type': 'pdf',
-                'content': clean_text,
-                'word_count': len(clean_text.split()),
-                'page_count': len(reader.pages)
             }
         except Exception as e:
-            return {'type': 'pdf', 'error': str(e)}
-
-    def _clean_text(self, text: str) -> str:
-        if not text:
-            return ""
-        text = re.sub(r'\s+', ' ', text)
-        return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()

-    def _extract_metadata(self, soup: BeautifulSoup) -> Dict:
-        metadata = {'title': '', 'description': '', 'keywords': [], 'og': {}}
-        if soup.title and soup.title.string:
-            metadata['title'] = soup.title.string.strip()[:200]
-        for meta in soup.find_all('meta'):
-            name = meta.get('name', '').lower()
-            prop = meta.get('property', '').lower()
-            content = meta.get('content', '')
-            if name == 'description':
-                metadata['description'] = content[:300]
-            elif name == 'keywords':
-                metadata['keywords'] = [kw.strip() for kw in content.split(',') if kw.strip()]
-            elif prop.startswith('og:'):
-                metadata['og'][prop[3:]] = content
-        return metadata

-    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
-        links: List[Dict] = []
-        base_netloc = urlparse(base_url).netloc
-        for tag in soup.find_all('a', href=True):
-            try:
-                href = tag['href'].strip()
-                if not href or href.startswith('javascript:'):
-                    continue
-                full_url = urljoin(base_url, href)
-                parsed = urlparse(full_url)
-                links.append({
-                    'url': full_url,
-                    'type': 'internal' if parsed.netloc == base_netloc else 'external',
-                    'anchor': self._clean_text(tag.get_text())[:100],
-                    'file_type': self._get_file_type(parsed.path)
-                })
-            except:
-                continue
         return links

-    def _get_file_type(self, path: str) -> str:
-        ext = Path(path).suffix.lower()
-        return ext[1:] if ext else 'html'
-
     def _parse_sitemap(self, sitemap_url: str) -> List[str]:
         try:
-            response = self.session.get(sitemap_url, timeout=10)
-            response.raise_for_status()
-            if 'xml' not in response.headers.get('Content-Type', ''):
-                return []
-            soup = BeautifulSoup(response.text, 'lxml-xml')
-            urls: List[str] = []
-            if soup.find('sitemapindex'):
-                for sitemap in soup.find_all('loc'):
-                    url = sitemap.text.strip()
-                    if url.endswith('.xml'):
-                        urls.extend(self._parse_sitemap(url))
-            else:
-                urls = [loc.text.strip() for loc in soup.find_all('loc')]
-            return list({url for url in urls if url.startswith('http')})
         except:
             return []

-    def _save_content(self, url: str, content: bytes) -> None:
-        try:
-            parsed = urlparse(url)
-            domain_dir = self.base_dir / parsed.netloc
-            raw_path = parsed.path.lstrip('/')
-            if not raw_path or raw_path.endswith('/'):
-                raw_path = os.path.join(raw_path, 'index.html') if raw_path else 'index.html'
-            safe_path = sanitize_filename(raw_path)
-            save_path = domain_dir / safe_path
-            save_path.parent.mkdir(parents=True, exist_ok=True)
-            with open(save_path, 'wb') as f:
-                f.write(content)
-        except:
-            pass
-
-    def _apply_nlp(self, results: List[Dict]) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
-        summaries = {}
-        entities = {}
         for r in results:
-            if r.get('status') != 'success' or not r.get('content'):
-                continue
-            content = r['content']
-            if len(content.split()) > 300:
-                try:
-                    summary = self.models['summarizer'](content[:1024], max_length=100, min_length=30, do_sample=False)[0]['summary_text']
-                    summaries[r['url']] = summary
-                except:
-                    pass
             try:
-                ents = self.models['ner'](content[:1000])
-                entities[r['url']] = list(set([e['word'] for e in ents if e['entity_group'] in ['PER', 'ORG', 'LOC']]))
             except:
-                pass
         return summaries, entities

-    def _compute_semantic_similarity(self, results: List[Dict]) -> Dict[str, List[Dict]]:
-        contents = [(r['url'], r['content']) for r in results if r.get('status') == 'success' and r.get('content')]
-        if len(contents) < 2:
-            return {}
-        try:
-            urls, texts = zip(*contents)
-            embeddings = self.models['semantic'].encode(texts, convert_to_tensor=True)
-            sim_matrix = util.pytorch_cos_sim(embeddings, embeddings)
-            similarity_dict = {}
-            for i, url in enumerate(urls):
-                scores = list(sim_matrix[i])
-                top_indices = sorted(range(len(scores)), key=lambda j: scores[j], reverse=True)
-                top_similar = [
-                    {"url": urls[j], "score": float(scores[j])}
-                    for j in top_indices if j != i and float(scores[j]) > 0.5
-                ][:3]
-                similarity_dict[url] = top_similar
-            return similarity_dict
-        except:
-            return {}

-    def _calculate_stats(self, results: List[Dict]) -> Dict:
-        successful = [r for r in results if r.get('status') == 'success']
-        content_types = [r.get('type', 'unknown') for r in successful]
-        avg_word_count = round(np.mean([r.get('word_count', 0) for r in successful]) if successful else 0, 1)
         return {
-            'total_urls': len(results),
-            'successful': len(successful),
-            'failed': len(results) - len(successful),
-            'content_types': pd.Series(content_types).value_counts().to_dict(),
-            'avg_word_count': avg_word_count,
-            'failed_urls': [r['url'] for r in results if r.get('status') != 'success']
         }

-    def _analyze_content(self, results: List[Dict]) -> Dict:
-        successful = [r for r in results if r.get('status') == 'success' and r.get('content')]
-        texts = [r['content'] for r in successful if len(r['content'].split()) > 10]
         if not texts:
-            return {'top_keywords': [], 'content_samples': []}
-        try:
-            stop_words = list(self.models['spacy'].Defaults.stop_words)
-            vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50, ngram_range=(1, 2))
-            tfidf = vectorizer.fit_transform(texts)
-            feature_names = vectorizer.get_feature_names_out()
-            sorted_indices = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[-10:]
-            top_keywords = feature_names[sorted_indices][::-1].tolist()
-        except:
-            top_keywords = []
-        samples = [{'url': r['url'], 'sample': r['content'][:500] + '...' if len(r['content']) > 500 else r['content']} for r in successful[:3]]
-        return {'top_keywords': top_keywords, 'content_samples': samples}

-    def _analyze_links(self, results: List[Dict]) -> Dict:
         all_links = []
-        for result in results:
-            if result.get('links'):
-                all_links.extend(result['links'])
         if not all_links:
-            return {'internal_links': {}, 'external_domains': {}, 'common_anchors': {}, 'file_types': {}}
         df = pd.DataFrame(all_links)
         return {
-            'internal_links': df[df['type'] == 'internal']['url'].value_counts().head(20).to_dict(),
-            'external_domains': df[df['type'] == 'external']['url'].apply(lambda x: urlparse(x).netloc).value_counts().head(10).to_dict(),
-            'common_anchors': df['anchor'].value_counts().head(10).to_dict(),
-            'file_types': df['file_type'].value_counts().to_dict()
         }

-    def _generate_seo_recommendations(self, results: List[Dict]) -> List[str]:
-        successful = [r for r in results if r.get('status') == 'success']
-        if not successful:
-            return ["No se pudo analizar ningún contenido exitosamente"]
         recs = []
-        missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
-        if missing_titles:
-            recs.append(f"📌 Añadir títulos a {missing_titles} páginas")
-        short_descriptions = sum(1 for r in successful if not r.get('metadata', {}).get('description'))
-        if short_descriptions:
-            recs.append(f"📌 Añadir meta descripciones a {short_descriptions} páginas")
-        short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
-        if short_content:
-            recs.append(f"📝 Ampliar contenido en {short_content} páginas (menos de 300 palabras)")
-        all_links = [link for r in results for link in r.get('links', [])]
-        if all_links:
-            df_links = pd.DataFrame(all_links)
-            internal_links = df_links[df_links['type'] == 'internal']
-            if len(internal_links) > 100:
-                recs.append(f"🔗 Optimizar estructura de enlaces internos ({len(internal_links)} enlaces)")
-        return recs if recs else ["✅ No se detectaron problemas críticos de SEO"]

-    def plot_internal_links(self, links_data: Dict) -> Any:
-        internal_links = links_data.get('internal_links', {})
         fig, ax = plt.subplots()
-        if not internal_links:
-            ax.text(0.5, 0.5, 'No hay enlaces internos', ha='center', va='center', transform=ax.transAxes)
-            ax.axis('off')
-        else:
-            names = list(internal_links.keys())
-            counts = list(internal_links.values())
-            ax.barh(names, counts)
-            ax.set_xlabel("Cantidad de enlaces")
-            ax.set_title("Top 20 Enlaces Internos")
-        plt.tight_layout()
         return fig

 import os
 import re
+import logging
 import requests
 import PyPDF2
 import numpy as np
 import pandas as pd
 from io import BytesIO
+from typing import List, Dict, Tuple
 from urllib.parse import urlparse, urljoin
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup

 import torch
 import spacy
 import matplotlib.pyplot as plt
 from utils import sanitize_filename

+# Palabras no permitidas en SEO financiero/bancario
+PROHIBITED_TERMS = [
+    "gratis", "garantizado", "rentabilidad asegurada", "sin compromiso",
+    "resultados inmediatos", "cero riesgo", "sin letra pequeña"
+]

+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 class SEOSpaceAnalyzer:
+    def __init__(self, max_urls: int = 20, max_workers: int = 4):
         self.max_urls = max_urls
         self.max_workers = max_workers
         self.session = self._configure_session()
         self.models = self._load_models()
         self.base_dir = Path("content_storage")
         self.base_dir.mkdir(parents=True, exist_ok=True)
+        self.current_analysis: Dict = {}

+    def _configure_session(self):
         session = requests.Session()
+        retry = Retry(total=3, backoff_factor=1,
+                      status_forcelist=[500, 502, 503, 504],
+                      allowed_methods=["GET"])
+        session.mount("http://", HTTPAdapter(max_retries=retry))
+        session.mount("https://", HTTPAdapter(max_retries=retry))
         session.headers.update({
+            "User-Agent": "SEOAnalyzer/1.0",
+            "Accept-Language": "es-ES,es;q=0.9"
         })
         return session

+    def _load_models(self):
+        device = 0 if torch.cuda.is_available() else -1
+        return {
+            "spacy": spacy.load("es_core_news_lg"),
+            "summarizer": pipeline("summarization", model="facebook/bart-large-cnn", device=device),
+            "ner": pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple", device=device),
+            "semantic": SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2"),
+            "zeroshot": pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+        }

+    def analyze_sitemap(self, sitemap_url: str) -> Tuple:
+        urls = self._parse_sitemap(sitemap_url)
+        if not urls:
+            return {"error": "No se pudieron extraer URLs"}, [], {}, {}, [], {}, {}, {}

+        results = []
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            futures = {executor.submit(self._process_url, url): url for url in urls[:self.max_urls]}
+            for future in as_completed(futures):
+                try:
+                    results.append(future.result())
+                except Exception as e:
+                    results.append({"url": futures[future], "status": "error", "error": str(e)})

+        summaries, entities = self._apply_nlp(results)
+        similarities = self._compute_similarity(results)
+        flags = self._flag_prohibited_terms(results)
+        topics = self._classify_topics(results)
+        seo_tags = self._generate_seo_tags(results, summaries, topics, flags)
+
+        self.current_analysis = {
+            "stats": self._calculate_stats(results),
+            "content_analysis": self._analyze_content(results),
+            "links": self._analyze_links(results),
+            "recommendations": self._generate_recommendations(results),
+            "details": results,
+            "summaries": summaries,
+            "entities": entities,
+            "similarities": similarities,
+            "flags": flags,
+            "topics": topics,
+            "seo_tags": seo_tags,
+            "timestamp": datetime.now().isoformat()
+        }
+
+        a = self.current_analysis
+        return (
+            a["stats"], a["recommendations"], a["content_analysis"],
+            a["links"], a["details"], a["summaries"],
+            a["similarities"], a["seo_tags"]
+        )

     def _process_url(self, url: str) -> Dict:
         try:
+            response = self.session.get(url, timeout=10)
+            content_type = response.headers.get("Content-Type", "")
+            if "application/pdf" in content_type:
+                return self._process_pdf(url, response.content)
+            return self._process_html(url, response.text)
         except Exception as e:
+            return {"url": url, "status": "error", "error": str(e)}

+    def _process_html(self, url: str, html: str) -> Dict:
+        soup = BeautifulSoup(html, "html.parser")
+        text = re.sub(r"\s+", " ", soup.get_text())
         return {
+            "url": url,
+            "type": "html",
+            "status": "success",
+            "content": text,
+            "word_count": len(text.split()),
+            "metadata": self._extract_metadata(soup),
+            "links": self._extract_links(soup, url)
         }

+    def _process_pdf(self, url: str, content: bytes) -> Dict:
         try:
+            reader = PyPDF2.PdfReader(BytesIO(content))
+            text = "".join(p.extract_text() or "" for p in reader.pages)
             return {
+                "url": url,
+                "type": "pdf",
+                "status": "success",
+                "content": text,
+                "word_count": len(text.split()),
+                "page_count": len(reader.pages)
             }
         except Exception as e:
+            return {"url": url, "status": "error", "error": str(e)}

+    def _extract_metadata(self, soup) -> Dict:
+        meta = {"title": "", "description": ""}
+        if soup.title:
+            meta["title"] = soup.title.string.strip()
+        for tag in soup.find_all("meta"):
+            if tag.get("name") == "description":
+                meta["description"] = tag.get("content", "")
+        return meta

+    def _extract_links(self, soup, base_url) -> List[Dict]:
+        links = []
+        base_domain = urlparse(base_url).netloc
+        for tag in soup.find_all("a", href=True):
+            href = tag["href"]
+            full_url = urljoin(base_url, href)
+            netloc = urlparse(full_url).netloc
+            links.append({
+                "url": full_url,
+                "type": "internal" if netloc == base_domain else "external",
+                "anchor": tag.get_text(strip=True)
+            })
         return links

     def _parse_sitemap(self, sitemap_url: str) -> List[str]:
         try:
+            r = self.session.get(sitemap_url)
+            soup = BeautifulSoup(r.text, "lxml-xml")
+            return [loc.text for loc in soup.find_all("loc")]
         except:
             return []

+    def _apply_nlp(self, results) -> Tuple[Dict, Dict]:
+        summaries, entities = {}, {}
         for r in results:
+            if r.get("status") != "success" or not r.get("content"): continue
+            text = r["content"][:1024]
             try:
+                summaries[r["url"]] = self.models["summarizer"](text, max_length=100, min_length=30)[0]["summary_text"]
+                ents = self.models["ner"](text)
+                entities[r["url"]] = list({e["word"] for e in ents if e["score"] > 0.8})
             except:
+                continue
         return summaries, entities

+    def _compute_similarity(self, results) -> Dict[str, List[Dict]]:
+        docs = [(r["url"], r["content"]) for r in results if r.get("status") == "success" and r.get("content")]
+        if len(docs) < 2: return {}
+        urls, texts = zip(*docs)
+        emb = self.models["semantic"].encode(texts, convert_to_tensor=True)
+        sim = util.pytorch_cos_sim(emb, emb)
+        return {
+            urls[i]: [{"url": urls[j], "score": float(sim[i][j])}
+                      for j in np.argsort(-sim[i]) if i != j][:3]
+            for i in range(len(urls))
+        }

+    def _flag_prohibited_terms(self, results) -> Dict[str, List[str]]:
+        flags = {}
+        for r in results:
+            found = [term for term in PROHIBITED_TERMS if term in r.get("content", "").lower()]
+            if found:
+                flags[r["url"]] = found
+        return flags
+
+    def _classify_topics(self, results) -> Dict[str, List[str]]:
+        labels = [
+            "hipotecas", "préstamos", "cuentas", "tarjetas",
+            "seguros", "inversión", "educación financiera"
+        ]
+        topics = {}
+        for r in results:
+            if r.get("status") != "success": continue
+            try:
+                res = self.models["zeroshot"](r["content"][:1000], candidate_labels=labels, multi_label=True)
+                topics[r["url"]] = [l for l, s in zip(res["labels"], res["scores"]) if s > 0.5]
+            except:
+                continue
+        return topics
+
+    def _generate_seo_tags(self, results, summaries, topics, flags) -> Dict[str, Dict]:
+        seo_tags = {}
+        for r in results:
+            url = r["url"]
+            base = summaries.get(url, r.get("content", "")[:300])
+            topic = topics.get(url, ["contenido"])[0]
+            try:
+                prompt = f"Genera un título SEO formal y una meta descripción para contenido sobre {topic}: {base}"
+                output = self.models["summarizer"](prompt, max_length=60, min_length=20)[0]["summary_text"]
+                title, desc = output.split(".")[0], output
+            except:
+                title, desc = "", ""
+            seo_tags[url] = {
+                "title": title,
+                "meta_description": desc,
+                "flags": flags.get(url, [])
+            }
+        return seo_tags
+
+    def _calculate_stats(self, results):
+        success = [r for r in results if r.get("status") == "success"]
         return {
+            "total": len(results),
+            "success": len(success),
+            "failed": len(results) - len(success),
+            "avg_words": round(np.mean([r.get("word_count", 0) for r in success]), 1)
         }

+    def _analyze_content(self, results):
+        texts = [r["content"] for r in results if r.get("status") == "success" and r.get("content")]
         if not texts:
+            return {}
+        tfidf = TfidfVectorizer(max_features=20, stop_words=list(self.models["spacy"].Defaults.stop_words))
+        tfidf.fit(texts)
+        top = tfidf.get_feature_names_out().tolist()
+        return {"top_keywords": top, "samples": texts[:3]}

+    def _analyze_links(self, results):
         all_links = []
+        for r in results:
+            all_links.extend(r.get("links", []))
         if not all_links:
+            return {}
         df = pd.DataFrame(all_links)
         return {
+            "internal_links": df[df["type"] == "internal"]["url"].value_counts().head(10).to_dict(),
+            "external_links": df[df["type"] == "external"]["url"].value_counts().head(10).to_dict()
         }

+    def _generate_recommendations(self, results):
         recs = []
+        if any(r.get("word_count", 0) < 300 for r in results):
+            recs.append("✍️ Algunos contenidos son demasiado breves (<300 palabras)")
+        if any("gratis" in r.get("content", "").lower() for r in results):
+            recs.append("⚠️ Detectado uso de lenguaje no permitido")
+        return recs or ["✅ Todo parece correcto"]

+    def plot_internal_links(self, links: Dict):
+        if not links or not links.get("internal_links"):
+            fig, ax = plt.subplots()
+            ax.text(0.5, 0.5, "No hay enlaces internos", ha="center")
+            return fig
+        top = links["internal_links"]
         fig, ax = plt.subplots()
+        ax.barh(list(top.keys()), list(top.values()))
+        ax.set_title("Top Enlaces Internos")
+        plt.tight_layout()
         return fig
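
For reference, a minimal usage sketch of the updated interface is shown below. It is not part of this commit: it assumes the file is importable as seo_analyzer, that utils.sanitize_filename and the referenced Hugging Face/spaCy models are available, and it uses a placeholder sitemap URL. It only exercises what the diff shows: analyze_sitemap now returns eight values (seo_tags was added as the last element), and plot_internal_links consumes the links dictionary.

# Hypothetical driver script (assumptions noted above); not part of this commit.
from seo_analyzer import SEOSpaceAnalyzer

analyzer = SEOSpaceAnalyzer(max_urls=10, max_workers=2)

# analyze_sitemap() returns 8 values in this version; seo_tags is the new last element.
(stats, recommendations, content_analysis, links,
 details, summaries, similarities, seo_tags) = analyzer.analyze_sitemap(
    "https://example.com/sitemap.xml"  # placeholder sitemap URL
)

print(stats)            # {"total": ..., "success": ..., "failed": ..., "avg_words": ...}
print(recommendations)  # e.g. warnings about short content or prohibited wording
for url, tags in seo_tags.items():
    print(url, tags["title"], tags["flags"])

fig = analyzer.plot_internal_links(links)  # matplotlib Figure of top internal links
fig.savefig("internal_links.png")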