Merlintxu committed
Commit bb43f76 · verified · 1 Parent(s): 6564f20

Update app.py

Files changed (1)
  1. app.py +218 -161
app.py CHANGED
@@ -8,7 +8,7 @@ import PyPDF2
import numpy as np
import pandas as pd
from io import BytesIO
- from typing import List, Dict, Optional, Tuple
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
@@ -24,7 +24,6 @@ import torch
import subprocess
import sys
import spacy
- import logging
import gradio as gr
import matplotlib.pyplot as plt

@@ -35,30 +34,53 @@ logging.basicConfig(
)
logger = logging.getLogger(__name__)

class SEOSpaceAnalyzer:
-     def __init__(self):
        self.session = self._configure_session()
        self.models = self._load_models()
        self.base_dir = Path("content_storage")
        self.base_dir.mkdir(parents=True, exist_ok=True)
-         self.current_analysis = {}
-
-     def _load_models(self) -> Dict:
-         """Carga modelos optimizados para Hugging Face"""
        try:
            device = 0 if torch.cuda.is_available() else -1
-             return {
                'summarizer': pipeline("summarization", model="facebook/bart-large-cnn", device=device),
                'ner': pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple", device=device),
                'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
                'spacy': spacy.load("es_core_news_lg")
            }
        except Exception as e:
-             logger.error(f"Error loading models: {e}")
            raise
-
    def _configure_session(self) -> requests.Session:
-         """Configura sesión HTTP con reintentos"""
        session = requests.Session()
        retry = Retry(
            total=3,
@@ -74,25 +96,33 @@ class SEOSpaceAnalyzer:
            'Accept-Language': 'es-ES,es;q=0.9'
        })
        return session
-
    def analyze_sitemap(self, sitemap_url: str) -> Tuple[Dict, List[str], Dict, Dict]:
-         """Analiza un sitemap completo y devuelve componentes por separado"""
        try:
            urls = self._parse_sitemap(sitemap_url)
            if not urls:
                return {"error": "No se pudieron extraer URLs del sitemap"}, [], {}, {}
-
-             results = []
-             with ThreadPoolExecutor(max_workers=4) as executor:
-                 futures = {executor.submit(self._process_url, url): url for url in urls[:20]}  # Limitar para demo
                for future in as_completed(futures):
                    try:
-                         results.append(future.result())
                    except Exception as e:
-                         url = futures[future]
-                         logger.error(f"Error processing {url}: {e}")
                        results.append({'url': url, 'status': 'error', 'error': str(e)})
-
            self.current_analysis = {
                'stats': self._calculate_stats(results),
                'content_analysis': self._analyze_content(results),
@@ -100,43 +130,42 @@ class SEOSpaceAnalyzer:
                'recommendations': self._generate_seo_recommendations(results),
                'timestamp': datetime.now().isoformat()
            }
-
-             return (
-                 self.current_analysis['stats'],
-                 self.current_analysis['recommendations'],
-                 self.current_analysis['content_analysis'],
-                 self.current_analysis['links']
-             )
-
        except Exception as e:
-             logger.error(f"Error en análisis: {str(e)}")
            return {"error": str(e)}, [], {}, {}
-
    def _process_url(self, url: str) -> Dict:
-         """Procesa una URL individual"""
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
-
            content_type = response.headers.get('Content-Type', '')
-             result = {'url': url, 'status': 'success'}
-
            if 'application/pdf' in content_type:
                result.update(self._process_pdf(response.content))
            elif 'text/html' in content_type:
                result.update(self._process_html(response.text, url))
-
            self._save_content(url, response.content)
            return result
        except requests.exceptions.RequestException as e:
            logger.warning(f"Error procesando {url}: {str(e)}")
            return {'url': url, 'status': 'error', 'error': str(e)}
-
    def _process_html(self, html: str, base_url: str) -> Dict:
-         """Procesa contenido HTML"""
        soup = BeautifulSoup(html, 'html.parser')
        clean_text = self._clean_text(soup.get_text())
-
        return {
            'type': 'html',
            'content': clean_text,
@@ -144,16 +173,16 @@ class SEOSpaceAnalyzer:
            'links': self._extract_links(soup, base_url),
            'metadata': self._extract_metadata(soup)
        }
-
    def _process_pdf(self, content: bytes) -> Dict:
-         """Procesa documentos PDF"""
        try:
            text = ""
            with BytesIO(content) as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                for page in reader.pages:
-                     text += page.extract_text() or ""  # Handle None return
-
            clean_text = self._clean_text(text)
            return {
                'type': 'pdf',
@@ -162,30 +191,28 @@ class SEOSpaceAnalyzer:
                'page_count': len(reader.pages)
            }
        except PyPDF2.PdfReadError as e:
-             logger.error(f"Error reading PDF: {e}")
            return {'type': 'pdf', 'error': str(e)}

    def _clean_text(self, text: str) -> str:
-         """Limpieza avanzada de texto"""
        if not text:
            return ""
        text = re.sub(r'\s+', ' ', text)
        return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
-
    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
-         """Extrae y clasifica enlaces"""
-         links = []
        base_netloc = urlparse(base_url).netloc
-
        for tag in soup.find_all('a', href=True):
            try:
                href = tag['href'].strip()
                if not href or href.startswith('javascript:'):
                    continue
-
                full_url = urljoin(base_url, href)
                parsed = urlparse(full_url)
-
                links.append({
                    'url': full_url,
                    'type': 'internal' if parsed.netloc == base_netloc else 'external',
@@ -193,55 +220,54 @@ class SEOSpaceAnalyzer:
                    'file_type': self._get_file_type(parsed.path)
                })
            except Exception as e:
-                 logger.warning(f"Error processing link {tag.get('href')}: {e}")
                continue
        return links
-
    def _get_file_type(self, path: str) -> str:
-         """Determina tipo de archivo por extensión"""
        ext = Path(path).suffix.lower()
        return ext[1:] if ext else 'html'
-
    def _extract_metadata(self, soup: BeautifulSoup) -> Dict:
-         """Extrae metadatos SEO"""
-         metadata = {
            'title': '',
            'description': '',
            'keywords': [],
            'og': {}
        }
-
        if soup.title and soup.title.string:
            metadata['title'] = soup.title.string.strip()[:200]
-
        for meta in soup.find_all('meta'):
            name = meta.get('name', '').lower()
            property_ = meta.get('property', '').lower()
            content = meta.get('content', '')
-
            if name == 'description':
                metadata['description'] = content[:300]
            elif name == 'keywords':
                metadata['keywords'] = [kw.strip() for kw in content.split(',') if kw.strip()]
            elif property_.startswith('og:'):
                metadata['og'][property_[3:]] = content
-
        return metadata
-
    def _parse_sitemap(self, sitemap_url: str) -> List[str]:
-         """Parsea sitemap XML básico"""
        try:
            response = self.session.get(sitemap_url, timeout=10)
            response.raise_for_status()
-
            if 'xml' not in response.headers.get('Content-Type', ''):
                logger.warning(f"El sitemap no parece ser XML: {sitemap_url}")
                return []
-
-             urls = []
-             soup = BeautifulSoup(response.text, 'lxml-xml')  # Usar parser XML específico
-
-             # Handle sitemap index
            if soup.find('sitemapindex'):
                for sitemap in soup.find_all('loc'):
                    url = sitemap.text.strip()
@@ -249,80 +275,92 @@ class SEOSpaceAnalyzer:
                    urls.extend(self._parse_sitemap(url))
            else:
                urls = [loc.text.strip() for loc in soup.find_all('loc')]
-
-             return list(set(url for url in urls if url.startswith('http')))
        except Exception as e:
-             logger.error(f"Error parsing sitemap {sitemap_url}: {e}")
            return []
-
    def _save_content(self, url: str, content: bytes) -> None:
-         """Almacena el contenido descargado"""
        try:
            parsed = urlparse(url)
            domain_dir = self.base_dir / parsed.netloc
            path = parsed.path.lstrip('/')
-
            if not path or path.endswith('/'):
-                 path = path + 'index.html'
-
-             save_path = domain_dir / path
            save_path.parent.mkdir(parents=True, exist_ok=True)
-
            with open(save_path, 'wb') as f:
                f.write(content)
        except Exception as e:
-             logger.error(f"Error saving content for {url}: {e}")

    def _calculate_stats(self, results: List[Dict]) -> Dict:
-         """Calcula estadísticas básicas"""
        successful = [r for r in results if r.get('status') == 'success']
-
        return {
            'total_urls': len(results),
            'successful': len(successful),
            'failed': len(results) - len(successful),
-             'content_types': pd.Series([r.get('type', 'unknown') for r in successful]).value_counts().to_dict(),
-             'avg_word_count': round(np.mean([r.get('word_count', 0) for r in successful]), 1),
            'failed_urls': [r['url'] for r in results if r.get('status') != 'success']
        }
-
    def _analyze_content(self, results: List[Dict]) -> Dict:
-         """Analiza contenido con NLP"""
        successful = [r for r in results if r.get('status') == 'success' and r.get('content')]
-         texts = [r['content'] for r in successful if len(r['content'].split()) > 10]  # Filtrar contenido muy corto
-
        if not texts:
            return {'top_keywords': [], 'content_samples': []}
-
-         # Análisis de temas principales
        try:
            stop_words = list(self.models['spacy'].Defaults.stop_words)
-             vectorizer = TfidfVectorizer(
-                 stop_words=stop_words,
-                 max_features=50,
-                 ngram_range=(1, 2)
-             )
            tfidf = vectorizer.fit_transform(texts)
            feature_names = vectorizer.get_feature_names_out()
-             sorted_indices = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[-10:]  # Top 10 índices
-             top_keywords = feature_names[sorted_indices][::-1].tolist()  # Orden descendente
        except Exception as e:
-             logger.error(f"Error en análisis TF-IDF: {str(e)}")
            top_keywords = []
-
        return {
            'top_keywords': top_keywords,
-             'content_samples': [{'url': r['url'], 'sample': r['content'][:500] + '...'}
-                                 for r in successful[:3]]  # Muestras de contenido
        }
-
    def _analyze_links(self, results: List[Dict]) -> Dict:
-         """Analiza estructura de enlaces"""
        all_links = []
        for result in results:
            if result.get('links'):
                all_links.extend(result['links'])
-
        if not all_links:
            return {
                'internal_links': {},
@@ -330,9 +368,7 @@ class SEOSpaceAnalyzer:
                'common_anchors': {},
                'file_types': {}
            }
-
        df = pd.DataFrame(all_links)
-
        return {
            'internal_links': df[df['type'] == 'internal']['url'].value_counts().head(20).to_dict(),
            'external_domains': df[df['type'] == 'external']['url']
@@ -341,43 +377,59 @@ class SEOSpaceAnalyzer:
            'common_anchors': df['anchor'].value_counts().head(10).to_dict(),
            'file_types': df['file_type'].value_counts().to_dict()
        }
-
    def _generate_seo_recommendations(self, results: List[Dict]) -> List[str]:
-         """Genera recomendaciones SEO"""
        successful = [r for r in results if r.get('status') == 'success']
        if not successful:
            return ["No se pudo analizar ningún contenido exitosamente"]
-
        recs = []
-
-         # Revisar metadatos
        missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
        if missing_titles:
            recs.append(f"📌 Añadir títulos a {missing_titles} páginas")
-
-         short_descriptions = sum(1 for r in successful
-                                  if not r.get('metadata', {}).get('description'))
        if short_descriptions:
            recs.append(f"📌 Añadir meta descripciones a {short_descriptions} páginas")
-
-         # Revisar contenido corto
        short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
        if short_content:
            recs.append(f"📝 Ampliar contenido en {short_content} páginas (menos de 300 palabras)")
-
-         # Analizar enlaces
        all_links = [link for r in results for link in r.get('links', [])]
        if all_links:
            df_links = pd.DataFrame(all_links)
            internal_links = df_links[df_links['type'] == 'internal']
-             if len(internal_links) > 100:  # Umbral arbitrario
                recs.append(f"🔗 Optimizar estructura de enlaces internos ({len(internal_links)} enlaces)")
-
        return recs if recs else ["✅ No se detectaron problemas críticos de SEO"]

- def create_interface():
    analyzer = SEOSpaceAnalyzer()
-
    with gr.Blocks(title="SEO Analyzer Pro", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🕵️ SEO Analyzer Pro
@@ -385,84 +437,88 @@ def create_interface():

        Sube la URL de un sitemap.xml para analizar todo el sitio web.
        """)
-
        with gr.Row():
            with gr.Column():
-                 sitemap_input = gr.Textbox(
-                     label="URL del Sitemap",
-                     placeholder="https://ejemplo.com/sitemap.xml",
-                     interactive=True
-                 )
                analyze_btn = gr.Button("Analizar Sitio", variant="primary")
-
                with gr.Row():
                    clear_btn = gr.Button("Limpiar")
                    download_btn = gr.Button("Descargar Reporte", variant="secondary")
-
            with gr.Column():
                status_output = gr.Textbox(label="Estado del Análisis", interactive=False)
                progress_bar = gr.Progress()
-
        with gr.Tabs():
            with gr.Tab("📊 Resumen"):
                stats_output = gr.JSON(label="Estadísticas Generales")
                recommendations_output = gr.JSON(label="Recomendaciones SEO")
-
            with gr.Tab("📝 Contenido"):
                content_output = gr.JSON(label="Análisis de Contenido")
                gr.Examples(
-                     examples=[
-                         {"content": "Ejemplo de análisis de contenido..."}
-                     ],
                    inputs=[content_output],
                    label="Ejemplos de Salida"
                )
-
            with gr.Tab("🔗 Enlaces"):
                links_output = gr.JSON(label="Análisis de Enlaces")
-                 with gr.Accordion("Visualización de Enlaces", open=False):
-                     links_plot = gr.Plot()
-
            with gr.Tab("📂 Documentos"):
                gr.Markdown("""
                ### Documentos Encontrados
                Los documentos descargados se guardan en la carpeta `content_storage/`
                """)
-                 # Reemplazado FileExplorer por Markdown informativo
-
-         # Event handlers
        analyze_btn.click(
            fn=analyzer.analyze_sitemap,
            inputs=sitemap_input,
            outputs=[stats_output, recommendations_output, content_output, links_output],
            show_progress=True
        )
-
        clear_btn.click(
-             fn=lambda: [None]*4,
            outputs=[stats_output, recommendations_output, content_output, links_output]
        )
-
-         # Para descargar el reporte, primero se debe generar
-         def generate_report():
-             if analyzer.current_analysis:
-                 report_path = "content_storage/seo_report.json"
-                 with open(report_path, 'w') as f:
-                     json.dump(analyzer.current_analysis, f, indent=2)
-                 return report_path
-             return None
-
        download_btn.click(
            fn=generate_report,
            outputs=gr.File(label="Descargar Reporte")
        )
-
        return interface
- def setup_spacy_model():
-     """Descarga el modelo de spaCy si no está instalado"""
    try:
        spacy.load("es_core_news_lg")
-         logger.info("Modelo spaCy 'es_core_news_lg' cargado correctamente")
    except OSError:
        logger.info("Descargando modelo spaCy 'es_core_news_lg'...")
        try:
@@ -472,17 +528,18 @@ def setup_spacy_model():
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
-             logger.info("Modelo descargado exitosamente")
        except subprocess.CalledProcessError as e:
            logger.error(f"Error al descargar modelo: {e.stderr.decode()}")
            raise RuntimeError("No se pudo descargar el modelo spaCy") from e
if __name__ == "__main__":
    setup_spacy_model()
-
    app = create_interface()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        share=False
-     )
 
app.py (new version)

import numpy as np
import pandas as pd
from io import BytesIO
+ from typing import List, Dict, Optional, Tuple, Any
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup

import subprocess
import sys
import spacy
import gradio as gr
import matplotlib.pyplot as plt

)
logger = logging.getLogger(__name__)

+
+ def sanitize_filename(filename: str) -> str:
+     """
+     Sanitiza el nombre de un archivo eliminando o reemplazando caracteres no permitidos.
+     """
+     filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
+     filename = re.sub(r'\s+', '_', filename)
+     return filename
+
+
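The sanitize_filename helper introduced above only rewrites characters, so a quick illustrative check (the file name is made up, and this assumes app.py is importable as a module with its dependencies installed):

```python
# Illustrative use of the sanitize_filename helper added in this commit.
from app import sanitize_filename  # assumes app.py is importable as `app`

print(sanitize_filename('notas de prensa/2024?.pdf'))
# -> 'notas_de_prensa_2024_.pdf': '/', '?' and spaces all become underscores
```

Because path separators are replaced as well, when _save_content later sanitizes a whole URL path, nested paths collapse into a single file name under the domain directory.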
class SEOSpaceAnalyzer:
+     """
+     Clase principal que encapsula la lógica para analizar un sitio web a partir de su sitemap.
+     """
+     def __init__(self, max_urls: int = 20, max_workers: int = 4) -> None:
+         """
+         Inicializa la sesión, carga los modelos y configura parámetros.
+         :param max_urls: Número máximo de URLs a procesar en un análisis.
+         :param max_workers: Número de hilos para la ejecución concurrente.
+         """
+         self.max_urls = max_urls
+         self.max_workers = max_workers
        self.session = self._configure_session()
        self.models = self._load_models()
        self.base_dir = Path("content_storage")
        self.base_dir.mkdir(parents=True, exist_ok=True)
+         self.current_analysis: Dict[str, Any] = {}
+
+     def _load_models(self) -> Dict[str, Any]:
+         """Carga modelos optimizados para Hugging Face y spaCy."""
        try:
            device = 0 if torch.cuda.is_available() else -1
+             logger.info("Cargando modelos NLP...")
+             models = {
                'summarizer': pipeline("summarization", model="facebook/bart-large-cnn", device=device),
                'ner': pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple", device=device),
                'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
                'spacy': spacy.load("es_core_news_lg")
            }
+             logger.info("Modelos cargados correctamente.")
+             return models
        except Exception as e:
+             logger.error(f"Error cargando modelos: {e}")
            raise
+
    def _configure_session(self) -> requests.Session:
+         """Configura una sesión HTTP con reintentos y headers personalizados."""
        session = requests.Session()
        retry = Retry(
            total=3,

            'Accept-Language': 'es-ES,es;q=0.9'
        })
        return session
+
    def analyze_sitemap(self, sitemap_url: str) -> Tuple[Dict, List[str], Dict, Dict]:
+         """
+         Analiza un sitemap completo, procesando URLs en paralelo y generando estadísticas, análisis de contenido, enlaces y recomendaciones SEO.
+         :param sitemap_url: URL del sitemap XML.
+         :return: Tuple con estadísticas, recomendaciones, análisis de contenido y análisis de enlaces.
+         """
        try:
+             logger.info(f"Parseando sitemap: {sitemap_url}")
            urls = self._parse_sitemap(sitemap_url)
            if not urls:
+                 logger.warning("No se pudieron extraer URLs del sitemap.")
                return {"error": "No se pudieron extraer URLs del sitemap"}, [], {}, {}
+
+             results: List[Dict] = []
+             with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                 futures = {executor.submit(self._process_url, url): url for url in urls[:self.max_urls]}
                for future in as_completed(futures):
+                     url = futures[future]
                    try:
+                         res = future.result()
+                         results.append(res)
+                         logger.info(f"Procesado: {url}")
                    except Exception as e:
+                         logger.error(f"Error procesando {url}: {e}")
                        results.append({'url': url, 'status': 'error', 'error': str(e)})
+
            self.current_analysis = {
                'stats': self._calculate_stats(results),
                'content_analysis': self._analyze_content(results),

                'recommendations': self._generate_seo_recommendations(results),
                'timestamp': datetime.now().isoformat()
            }
+             return (self.current_analysis['stats'],
+                     self.current_analysis['recommendations'],
+                     self.current_analysis['content_analysis'],
+                     self.current_analysis['links'])
        except Exception as e:
+             logger.error(f"Error en análisis: {e}")
            return {"error": str(e)}, [], {}, {}
+
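The reworked analyze_sitemap maps each submitted future back to its URL so that failures can still be attributed to the page that caused them. A minimal, self-contained sketch of that pattern (fetch is a stand-in for _process_url, not part of app.py):

```python
# Sketch of the futures-to-URL bookkeeping used in analyze_sitemap.
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch(url: str) -> dict:
    # Stand-in for SEOSpaceAnalyzer._process_url
    if "bad" in url:
        raise ValueError("simulated failure")
    return {"url": url, "status": "success"}

urls = ["https://example.com/a", "https://example.com/bad", "https://example.com/b"]
results = []
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(fetch, u): u for u in urls}
    for future in as_completed(futures):
        url = futures[future]  # recover the URL even when the task raised
        try:
            results.append(future.result())
        except Exception as e:
            results.append({"url": url, "status": "error", "error": str(e)})

print(results)
```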
    def _process_url(self, url: str) -> Dict:
+         """Procesa una URL individual y decide el método de procesamiento según el tipo de contenido."""
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            content_type = response.headers.get('Content-Type', '')
+             result: Dict[str, Any] = {'url': url, 'status': 'success'}
+
            if 'application/pdf' in content_type:
                result.update(self._process_pdf(response.content))
            elif 'text/html' in content_type:
                result.update(self._process_html(response.text, url))
+             else:
+                 result.update({'type': 'unknown', 'content': '', 'word_count': 0})
+
            self._save_content(url, response.content)
            return result
        except requests.exceptions.RequestException as e:
            logger.warning(f"Error procesando {url}: {str(e)}")
            return {'url': url, 'status': 'error', 'error': str(e)}
+         except Exception as e:
+             logger.error(f"Error inesperado en {url}: {str(e)}")
+             return {'url': url, 'status': 'error', 'error': str(e)}
+
    def _process_html(self, html: str, base_url: str) -> Dict:
+         """Procesa contenido HTML: extrae y limpia el texto, enlaces y metadatos."""
        soup = BeautifulSoup(html, 'html.parser')
        clean_text = self._clean_text(soup.get_text())
        return {
            'type': 'html',
            'content': clean_text,

            'links': self._extract_links(soup, base_url),
            'metadata': self._extract_metadata(soup)
        }
+
    def _process_pdf(self, content: bytes) -> Dict:
+         """Procesa documentos PDF extrayendo texto de cada página."""
        try:
            text = ""
            with BytesIO(content) as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                for page in reader.pages:
+                     extracted = page.extract_text()
+                     text += extracted if extracted else ""
            clean_text = self._clean_text(text)
            return {
                'type': 'pdf',

                'page_count': len(reader.pages)
            }
        except PyPDF2.PdfReadError as e:
+             logger.error(f"Error leyendo PDF: {e}")
            return {'type': 'pdf', 'error': str(e)}

    def _clean_text(self, text: str) -> str:
+         """Realiza la limpieza y normalización del texto."""
        if not text:
            return ""
        text = re.sub(r'\s+', ' ', text)
        return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
+
    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
+         """Extrae y clasifica enlaces presentes en el HTML."""
+         links: List[Dict] = []
        base_netloc = urlparse(base_url).netloc
+
        for tag in soup.find_all('a', href=True):
            try:
                href = tag['href'].strip()
                if not href or href.startswith('javascript:'):
                    continue
                full_url = urljoin(base_url, href)
                parsed = urlparse(full_url)
                links.append({
                    'url': full_url,
                    'type': 'internal' if parsed.netloc == base_netloc else 'external',

                    'file_type': self._get_file_type(parsed.path)
                })
            except Exception as e:
+                 logger.warning(f"Error procesando enlace {tag.get('href')}: {e}")
                continue
        return links
+
    def _get_file_type(self, path: str) -> str:
+         """Determina el tipo de archivo según la extensión encontrada en la URL."""
        ext = Path(path).suffix.lower()
        return ext[1:] if ext else 'html'
+
    def _extract_metadata(self, soup: BeautifulSoup) -> Dict:
+         """Extrae metadatos relevantes para SEO (título, descripción, keywords y etiquetas OpenGraph)."""
+         metadata: Dict[str, Any] = {
            'title': '',
            'description': '',
            'keywords': [],
            'og': {}
        }
        if soup.title and soup.title.string:
            metadata['title'] = soup.title.string.strip()[:200]
+
        for meta in soup.find_all('meta'):
            name = meta.get('name', '').lower()
            property_ = meta.get('property', '').lower()
            content = meta.get('content', '')
            if name == 'description':
                metadata['description'] = content[:300]
            elif name == 'keywords':
                metadata['keywords'] = [kw.strip() for kw in content.split(',') if kw.strip()]
            elif property_.startswith('og:'):
                metadata['og'][property_[3:]] = content
        return metadata
+
    def _parse_sitemap(self, sitemap_url: str) -> List[str]:
+         """
+         Parsea un sitemap XML e incluso maneja índices de sitemaps.
+         :return: Lista de URLs encontradas en el sitemap.
+         """
        try:
            response = self.session.get(sitemap_url, timeout=10)
            response.raise_for_status()
+
            if 'xml' not in response.headers.get('Content-Type', ''):
                logger.warning(f"El sitemap no parece ser XML: {sitemap_url}")
                return []
+
+             soup = BeautifulSoup(response.text, 'lxml-xml')
+             urls: List[str] = []
+             # Manejo de sitemap index
            if soup.find('sitemapindex'):
                for sitemap in soup.find_all('loc'):
                    url = sitemap.text.strip()

                    urls.extend(self._parse_sitemap(url))
            else:
                urls = [loc.text.strip() for loc in soup.find_all('loc')]
+             # Filtrar URLs que empiezan por http y eliminar duplicados
+             filtered_urls = list({url for url in urls if url.startswith('http')})
+             return filtered_urls
        except Exception as e:
+             logger.error(f"Error al parsear el sitemap {sitemap_url}: {e}")
            return []
+
    def _save_content(self, url: str, content: bytes) -> None:
+         """
+         Almacena el contenido descargado en una estructura organizada. Antes de escribir, verifica si ya existe el archivo.
+         """
        try:
            parsed = urlparse(url)
            domain_dir = self.base_dir / parsed.netloc
+             # Construir ruta a partir de la ruta URL
            path = parsed.path.lstrip('/')
            if not path or path.endswith('/'):
+                 path = os.path.join(path, 'index.html')
+             safe_path = sanitize_filename(path)
+             save_path = domain_dir / safe_path
            save_path.parent.mkdir(parents=True, exist_ok=True)
+
+             # Calcula hash del contenido y evita re-escribir si el archivo existe y es idéntico
+             new_hash = hashlib.md5(content).hexdigest()
+             if save_path.exists():
+                 with open(save_path, 'rb') as f:
+                     existing_content = f.read()
+                 existing_hash = hashlib.md5(existing_content).hexdigest()
+                 if new_hash == existing_hash:
+                     logger.debug(f"El contenido de {url} ya está guardado y es idéntico.")
+                     return
+
            with open(save_path, 'wb') as f:
                f.write(content)
+             logger.info(f"Contenido guardado en: {save_path}")
        except Exception as e:
+             logger.error(f"Error al guardar contenido para {url}: {e}")
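The saving logic now hashes the payload and skips the write when an identical file already exists. A stand-alone sketch of that check, assuming an MD5 comparison is sufficient for change detection (write_if_changed is an illustrative name, not part of app.py):

```python
# Sketch of the hash-based "skip identical content" check.
import hashlib
from pathlib import Path

def write_if_changed(save_path: Path, content: bytes) -> bool:
    """Return True if the file was (re)written, False if identical content already exists."""
    if save_path.exists() and hashlib.md5(save_path.read_bytes()).hexdigest() == hashlib.md5(content).hexdigest():
        return False
    save_path.parent.mkdir(parents=True, exist_ok=True)
    save_path.write_bytes(content)
    return True

print(write_if_changed(Path("content_storage/example.com/index.html"), b"<html>hola</html>"))
```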

    def _calculate_stats(self, results: List[Dict]) -> Dict:
+         """Calcula estadísticas básicas sobre el conjunto de resultados procesados."""
        successful = [r for r in results if r.get('status') == 'success']
+         content_types = [r.get('type', 'unknown') for r in successful]
+         avg_word_count = round(np.mean([r.get('word_count', 0) for r in successful]) if successful else 0, 1)
        return {
            'total_urls': len(results),
            'successful': len(successful),
            'failed': len(results) - len(successful),
+             'content_types': pd.Series(content_types).value_counts().to_dict(),
+             'avg_word_count': avg_word_count,
            'failed_urls': [r['url'] for r in results if r.get('status') != 'success']
        }
+
    def _analyze_content(self, results: List[Dict]) -> Dict:
+         """
+         Analiza el contenido extraído usando TF-IDF y muestra algunas muestras.
+         :return: Diccionario con keywords y ejemplos de contenido.
+         """
        successful = [r for r in results if r.get('status') == 'success' and r.get('content')]
+         texts = [r['content'] for r in successful if len(r['content'].split()) > 10]
        if not texts:
            return {'top_keywords': [], 'content_samples': []}
        try:
            stop_words = list(self.models['spacy'].Defaults.stop_words)
+             vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50, ngram_range=(1, 2))
            tfidf = vectorizer.fit_transform(texts)
            feature_names = vectorizer.get_feature_names_out()
+             sorted_indices = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[-10:]
+             top_keywords = feature_names[sorted_indices][::-1].tolist()
        except Exception as e:
+             logger.error(f"Error en análisis TF-IDF: {e}")
            top_keywords = []
        return {
            'top_keywords': top_keywords,
+             'content_samples': [{'url': r['url'], 'sample': (r['content'][:500] + '...') if len(r['content']) > 500 else r['content']}
+                                 for r in successful[:3]]
        }
+
  def _analyze_links(self, results: List[Dict]) -> Dict:
356
+ """
357
+ Analiza la estructura de enlaces en el contenido procesado.
358
+ :return: Estadísticas de enlaces internos, dominios externos, anclas y tipos de archivos.
359
+ """
360
  all_links = []
361
  for result in results:
362
  if result.get('links'):
363
  all_links.extend(result['links'])
 
364
  if not all_links:
365
  return {
366
  'internal_links': {},
 
368
  'common_anchors': {},
369
  'file_types': {}
370
  }
 
371
  df = pd.DataFrame(all_links)
 
372
  return {
373
  'internal_links': df[df['type'] == 'internal']['url'].value_counts().head(20).to_dict(),
374
  'external_domains': df[df['type'] == 'external']['url']
 
377
  'common_anchors': df['anchor'].value_counts().head(10).to_dict(),
378
  'file_types': df['file_type'].value_counts().to_dict()
379
  }
380
+
381
  def _generate_seo_recommendations(self, results: List[Dict]) -> List[str]:
382
+ """
383
+ Genera recomendaciones SEO basadas en metadatos, cantidad de contenido y estructura de enlaces.
384
+ :return: Lista de recomendaciones.
385
+ """
386
  successful = [r for r in results if r.get('status') == 'success']
387
  if not successful:
388
  return ["No se pudo analizar ningún contenido exitosamente"]
389
+
390
  recs = []
 
 
391
  missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
392
  if missing_titles:
393
  recs.append(f"📌 Añadir títulos a {missing_titles} páginas")
394
+ short_descriptions = sum(1 for r in successful if not r.get('metadata', {}).get('description'))
 
 
395
  if short_descriptions:
396
  recs.append(f"📌 Añadir meta descripciones a {short_descriptions} páginas")
 
 
397
  short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
398
  if short_content:
399
  recs.append(f"📝 Ampliar contenido en {short_content} páginas (menos de 300 palabras)")
400
+
 
401
  all_links = [link for r in results for link in r.get('links', [])]
402
  if all_links:
403
  df_links = pd.DataFrame(all_links)
404
  internal_links = df_links[df_links['type'] == 'internal']
405
+ if len(internal_links) > 100:
406
  recs.append(f"🔗 Optimizar estructura de enlaces internos ({len(internal_links)} enlaces)")
 
407
  return recs if recs else ["✅ No se detectaron problemas críticos de SEO"]
408
 
409
+ def _plot_internal_links(self, links_data: Dict) -> Optional[plt.Figure]:
410
+ """
411
+ Genera un gráfico de barras para la distribución de enlaces internos.
412
+ :param links_data: Diccionario con los enlaces internos.
413
+ :return: Figura de matplotlib o None si no hay datos.
414
+ """
415
+ internal_links = links_data.get('internal_links', {})
416
+ if not internal_links:
417
+ return None
418
+ fig, ax = plt.subplots()
419
+ names = list(internal_links.keys())
420
+ counts = list(internal_links.values())
421
+ ax.barh(names, counts)
422
+ ax.set_xlabel("Cantidad de enlaces")
423
+ ax.set_title("Top 20 Enlaces Internos")
424
+ plt.tight_layout()
425
+ return fig
426
+
427
+
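_plot_internal_links is plain matplotlib, so the same calls work outside the class with a hand-made internal_links payload (the URLs and counts below are invented):

```python
# Stand-alone sketch of the horizontal bar chart produced by _plot_internal_links.
import matplotlib
matplotlib.use("Agg")  # headless backend; the Gradio app renders the figure itself
import matplotlib.pyplot as plt

internal_links = {
    "https://example.com/": 12,
    "https://example.com/blog": 7,
    "https://example.com/contacto": 3,
}
fig, ax = plt.subplots()
ax.barh(list(internal_links.keys()), list(internal_links.values()))
ax.set_xlabel("Cantidad de enlaces")
ax.set_title("Top 20 Enlaces Internos")
plt.tight_layout()
fig.savefig("internal_links.png")
```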
+ def create_interface() -> gr.Blocks:
+     """
+     Crea la interfaz de usuario utilizando Gradio.
+     """
    analyzer = SEOSpaceAnalyzer()
    with gr.Blocks(title="SEO Analyzer Pro", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🕵️ SEO Analyzer Pro

        Sube la URL de un sitemap.xml para analizar todo el sitio web.
        """)
        with gr.Row():
            with gr.Column():
+                 sitemap_input = gr.Textbox(label="URL del Sitemap",
+                                            placeholder="https://ejemplo.com/sitemap.xml",
+                                            interactive=True)
                analyze_btn = gr.Button("Analizar Sitio", variant="primary")
                with gr.Row():
                    clear_btn = gr.Button("Limpiar")
                    download_btn = gr.Button("Descargar Reporte", variant="secondary")
+                     plot_btn = gr.Button("Visualizar Enlaces Internos", variant="secondary")
            with gr.Column():
                status_output = gr.Textbox(label="Estado del Análisis", interactive=False)
                progress_bar = gr.Progress()
+
        with gr.Tabs():
            with gr.Tab("📊 Resumen"):
                stats_output = gr.JSON(label="Estadísticas Generales")
                recommendations_output = gr.JSON(label="Recomendaciones SEO")
            with gr.Tab("📝 Contenido"):
                content_output = gr.JSON(label="Análisis de Contenido")
                gr.Examples(
+                     examples=[{"content": "Ejemplo de análisis de contenido..."}],
                    inputs=[content_output],
                    label="Ejemplos de Salida"
                )
            with gr.Tab("🔗 Enlaces"):
                links_output = gr.JSON(label="Análisis de Enlaces")
+                 links_plot = gr.Plot(label="Visualización de Enlaces Internos")
            with gr.Tab("📂 Documentos"):
                gr.Markdown("""
                ### Documentos Encontrados
                Los documentos descargados se guardan en la carpeta `content_storage/`
                """)
+
+         # Función que genera el reporte y lo guarda en disco
+         def generate_report() -> Optional[str]:
+             if analyzer.current_analysis:
+                 report_path = "content_storage/seo_report.json"
+                 try:
+                     with open(report_path, 'w', encoding='utf-8') as f:
+                         json.dump(analyzer.current_analysis, f, indent=2, ensure_ascii=False)
+                     return report_path
+                 except Exception as e:
+                     logger.error(f"Error generando reporte: {e}")
+                     return None
+             return None
+
+         # Callback para generar gráfico de enlaces internos a partir del análisis almacenado
+         def generate_internal_links_plot(links_json: Dict) -> Any:
+             fig = analyzer._plot_internal_links(links_json)
+             return fig if fig is not None else {}
+
+         # Asignación de acciones a botones y otros eventos
        analyze_btn.click(
            fn=analyzer.analyze_sitemap,
            inputs=sitemap_input,
            outputs=[stats_output, recommendations_output, content_output, links_output],
            show_progress=True
        )
        clear_btn.click(
+             fn=lambda: [None] * 4,
            outputs=[stats_output, recommendations_output, content_output, links_output]
        )
        download_btn.click(
            fn=generate_report,
            outputs=gr.File(label="Descargar Reporte")
        )
+         plot_btn.click(
+             fn=generate_internal_links_plot,
+             inputs=links_output,
+             outputs=links_plot
+         )
    return interface
+
+
+ def setup_spacy_model() -> None:
+     """
+     Verifica y descarga el modelo de spaCy 'es_core_news_lg' si no está instalado.
+     """
    try:
        spacy.load("es_core_news_lg")
+         logger.info("Modelo spaCy 'es_core_news_lg' cargado correctamente.")
    except OSError:
        logger.info("Descargando modelo spaCy 'es_core_news_lg'...")
        try:

                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
+             logger.info("Modelo descargado exitosamente.")
        except subprocess.CalledProcessError as e:
            logger.error(f"Error al descargar modelo: {e.stderr.decode()}")
            raise RuntimeError("No se pudo descargar el modelo spaCy") from e
+
+
if __name__ == "__main__":
    setup_spacy_model()
    app = create_interface()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        share=False
+     )
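For reference, the pieces touched by this commit can also be driven without the Gradio UI; a hedged sketch, assuming app.py is importable as a module and its model dependencies (torch, transformers, sentence-transformers, spaCy's es_core_news_lg) are installed:

```python
from app import SEOSpaceAnalyzer, setup_spacy_model

setup_spacy_model()                       # downloads es_core_news_lg on first use
analyzer = SEOSpaceAnalyzer(max_urls=5)   # constructor parameters added in this commit
stats, recommendations, content, links = analyzer.analyze_sitemap("https://example.com/sitemap.xml")
print(stats)
print(recommendations)
```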