Merlintxu committed on
Commit ede29cb · verified · 1 Parent(s): a8a2139

Create seo_analyzer.py

Files changed (1)
  1. seo_analyzer.py +374 -0
seo_analyzer.py ADDED
@@ -0,0 +1,374 @@
import os
import logging
import re
import requests
import hashlib
import PyPDF2
import numpy as np
import pandas as pd
from io import BytesIO
from typing import List, Dict, Any, Tuple
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
from pathlib import Path
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import torch
import subprocess
import sys
import spacy
import matplotlib.pyplot as plt

from utils import sanitize_filename

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class SEOSpaceAnalyzer:
    def __init__(self, max_urls: int = 20, max_workers: int = 4) -> None:
        """
        Initialize the HTTP session, load the NLP models and prepare the storage directory.

        Args:
            max_urls: Maximum number of URLs to process per analysis.
            max_workers: Number of threads for concurrent execution.
        """
        self.max_urls = max_urls
        self.max_workers = max_workers
        self.session = self._configure_session()
        self.models = self._load_models()
        self.base_dir = Path("content_storage")
        self.base_dir.mkdir(parents=True, exist_ok=True)
        self.current_analysis: Dict[str, Any] = {}

    def _load_models(self) -> Dict[str, Any]:
        """Load the Hugging Face and spaCy NLP models."""
        try:
            device = 0 if torch.cuda.is_available() else -1
            logger.info("Loading NLP models...")
            models = {
                'summarizer': pipeline("summarization", model="facebook/bart-large-cnn", device=device),
                'ner': pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple", device=device),
                'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
                'spacy': spacy.load("es_core_news_lg")
            }
            logger.info("Models loaded successfully.")
            return models
        except Exception as e:
            logger.error(f"Error loading models: {e}")
            raise

    def _configure_session(self) -> requests.Session:
        """Configure an HTTP session with retries and custom headers."""
        session = requests.Session()
        retry = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=['GET', 'HEAD']
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; SEOBot/1.0)',
            'Accept-Language': 'es-ES,es;q=0.9'
        })
        return session

    def analyze_sitemap(self, sitemap_url: str) -> Tuple[Dict, List[str], Dict, Dict, List[Dict]]:
        """
        Process the sitemap: extract URLs, analyze each page individually and return aggregated data.

        Args:
            sitemap_url: URL of the XML sitemap.

        Returns:
            A 5-element tuple:
                - General statistics (dict)
                - SEO recommendations (list of strings)
                - Aggregated content analysis (dict)
                - Link analysis (dict)
                - Per-URL details for each processed page (list of dicts)
        """
        try:
            urls = self._parse_sitemap(sitemap_url)
            if not urls:
                return {"error": "No URLs could be extracted from the sitemap"}, [], {}, {}, []
            results: List[Dict] = []
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                futures = {executor.submit(self._process_url, url): url for url in urls[:self.max_urls]}
                for future in as_completed(futures):
                    url = futures[future]
                    try:
                        res = future.result()
                        results.append(res)
                        logger.info(f"Processed: {url}")
                    except Exception as e:
                        logger.error(f"Error processing {url}: {e}")
                        results.append({'url': url, 'status': 'error', 'error': str(e)})

            self.current_analysis = {
                'stats': self._calculate_stats(results),
                'content_analysis': self._analyze_content(results),
                'links': self._analyze_links(results),
                'recommendations': self._generate_seo_recommendations(results),
                'details': results,
                'timestamp': datetime.now().isoformat()
            }
            analysis = self.current_analysis
            return analysis['stats'], analysis['recommendations'], analysis['content_analysis'], analysis['links'], analysis['details']
        except Exception as e:
            logger.error(f"Analysis error: {e}")
            return {"error": str(e)}, [], {}, {}, []

    def _process_url(self, url: str) -> Dict:
        """Process a single URL, extracting content, metadata and links."""
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            content_type = response.headers.get('Content-Type', '')
            result: Dict[str, Any] = {'url': url, 'status': 'success'}
            if 'application/pdf' in content_type:
                result.update(self._process_pdf(response.content))
            elif 'text/html' in content_type:
                result.update(self._process_html(response.text, url))
            else:
                result.update({'type': 'unknown', 'content': '', 'word_count': 0})
            self._save_content(url, response.content)
            return result
        except requests.exceptions.RequestException as e:
            logger.warning(f"Error processing {url}: {str(e)}")
            return {'url': url, 'status': 'error', 'error': str(e)}
        except Exception as e:
            logger.error(f"Unexpected error for {url}: {str(e)}")
            return {'url': url, 'status': 'error', 'error': str(e)}

    def _process_html(self, html: str, base_url: str) -> Dict:
        """Extract and clean the page's HTML content, metadata and links."""
        soup = BeautifulSoup(html, 'html.parser')
        clean_text = self._clean_text(soup.get_text())
        return {
            'type': 'html',
            'content': clean_text,
            'word_count': len(clean_text.split()),
            'metadata': self._extract_metadata(soup),
            'links': self._extract_links(soup, base_url)
        }

    def _process_pdf(self, content: bytes) -> Dict:
        """Extract text from a PDF document and compute basic statistics."""
        try:
            text = ""
            with BytesIO(content) as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                page_count = len(reader.pages)
                for page in reader.pages:
                    extracted = page.extract_text()
                    text += extracted if extracted else ""
            clean_text = self._clean_text(text)
            return {
                'type': 'pdf',
                'content': clean_text,
                'word_count': len(clean_text.split()),
                'page_count': page_count
            }
        except PyPDF2.errors.PdfReadError as e:
            logger.error(f"Error reading PDF: {e}")
            return {'type': 'pdf', 'error': str(e)}

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text by collapsing whitespace and removing special characters."""
        if not text:
            return ""
        text = re.sub(r'\s+', ' ', text)
        return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()

    def _extract_metadata(self, soup: BeautifulSoup) -> Dict:
        """Extract relevant metadata (title, description, keywords, Open Graph) from the page."""
        metadata = {'title': '', 'description': '', 'keywords': [], 'og': {}}
        if soup.title and soup.title.string:
            metadata['title'] = soup.title.string.strip()[:200]
        for meta in soup.find_all('meta'):
            name = meta.get('name', '').lower()
            prop = meta.get('property', '').lower()
            content = meta.get('content', '')
            if name == 'description':
                metadata['description'] = content[:300]
            elif name == 'keywords':
                metadata['keywords'] = [kw.strip() for kw in content.split(',') if kw.strip()]
            elif prop.startswith('og:'):
                metadata['og'][prop[3:]] = content
        return metadata

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Extract the page's links, distinguishing internal from external ones."""
        links: List[Dict] = []
        base_netloc = urlparse(base_url).netloc
        for tag in soup.find_all('a', href=True):
            try:
                href = tag['href'].strip()
                if not href or href.startswith('javascript:'):
                    continue
                full_url = urljoin(base_url, href)
                parsed = urlparse(full_url)
                links.append({
                    'url': full_url,
                    'type': 'internal' if parsed.netloc == base_netloc else 'external',
                    'anchor': self._clean_text(tag.get_text())[:100],
                    'file_type': self._get_file_type(parsed.path)
                })
            except Exception as e:
                logger.warning(f"Error processing link {tag.get('href')}: {e}")
                continue
        return links

    def _get_file_type(self, path: str) -> str:
        """Determine the file type from the URL path's extension."""
        ext = Path(path).suffix.lower()
        return ext[1:] if ext else 'html'

    def _parse_sitemap(self, sitemap_url: str) -> List[str]:
        """Parse an XML sitemap (and any sitemap indexes) to extract URLs."""
        try:
            response = self.session.get(sitemap_url, timeout=10)
            response.raise_for_status()
            if 'xml' not in response.headers.get('Content-Type', ''):
                logger.warning(f"The sitemap does not appear to be XML: {sitemap_url}")
                return []
            soup = BeautifulSoup(response.text, 'lxml-xml')
            urls: List[str] = []
            if soup.find('sitemapindex'):
                # Sitemap index: recurse into each child sitemap
                for sitemap in soup.find_all('loc'):
                    url = sitemap.text.strip()
                    if url.endswith('.xml'):
                        urls.extend(self._parse_sitemap(url))
            else:
                urls = [loc.text.strip() for loc in soup.find_all('loc')]
            filtered_urls = list({url for url in urls if url.startswith('http')})
            return filtered_urls
        except Exception as e:
            logger.error(f"Error parsing sitemap {sitemap_url}: {e}")
            return []

    def _save_content(self, url: str, content: bytes) -> None:
        """
        Save downloaded content in a directory structure organized by domain,
        sanitizing the file name and skipping the write when an identical file (by hash) already exists.
        """
        try:
            parsed = urlparse(url)
            domain_dir = self.base_dir / parsed.netloc
            path = parsed.path.lstrip('/')
            if not path or path.endswith('/'):
                path = os.path.join(path, 'index.html')
            safe_path = sanitize_filename(path)
            save_path = domain_dir / safe_path
            save_path.parent.mkdir(parents=True, exist_ok=True)
            new_hash = hashlib.md5(content).hexdigest()
            if save_path.exists():
                with open(save_path, 'rb') as f:
                    existing_content = f.read()
                existing_hash = hashlib.md5(existing_content).hexdigest()
                if new_hash == existing_hash:
                    logger.debug(f"Content for {url} is already saved.")
                    return
            with open(save_path, 'wb') as f:
                f.write(content)
            logger.info(f"Saved content to: {save_path}")
        except Exception as e:
            logger.error(f"Error saving content for {url}: {e}")

    def _calculate_stats(self, results: List[Dict]) -> Dict:
        """Compute overall statistics for the analysis."""
        successful = [r for r in results if r.get('status') == 'success']
        content_types = [r.get('type', 'unknown') for r in successful]
        avg_word_count = round(np.mean([r.get('word_count', 0) for r in successful]) if successful else 0, 1)
        return {
            'total_urls': len(results),
            'successful': len(successful),
            'failed': len(results) - len(successful),
            'content_types': pd.Series(content_types).value_counts().to_dict(),
            'avg_word_count': avg_word_count,
            'failed_urls': [r['url'] for r in results if r.get('status') != 'success']
        }

    def _analyze_content(self, results: List[Dict]) -> Dict:
        """Build an aggregated content analysis, using TF-IDF to extract the top keywords plus content samples."""
        successful = [r for r in results if r.get('status') == 'success' and r.get('content')]
        texts = [r['content'] for r in successful if len(r['content'].split()) > 10]
        if not texts:
            return {'top_keywords': [], 'content_samples': []}
        try:
            stop_words = list(self.models['spacy'].Defaults.stop_words)
            vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50, ngram_range=(1, 2))
            tfidf = vectorizer.fit_transform(texts)
            feature_names = vectorizer.get_feature_names_out()
            # Rank terms by their summed TF-IDF weight across all documents
            sorted_indices = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[-10:]
            top_keywords = feature_names[sorted_indices][::-1].tolist()
        except Exception as e:
            logger.error(f"TF-IDF analysis error: {e}")
            top_keywords = []
        samples = [
            {'url': r['url'], 'sample': (r['content'][:500] + '...') if len(r['content']) > 500 else r['content']}
            for r in successful[:3]
        ]
        return {'top_keywords': top_keywords, 'content_samples': samples}

    def _analyze_links(self, results: List[Dict]) -> Dict:
        """Build an analysis of internal links, external domains, anchor texts and file types."""
        all_links = []
        for result in results:
            if result.get('links'):
                all_links.extend(result['links'])
        if not all_links:
            return {'internal_links': {}, 'external_domains': {}, 'common_anchors': {}, 'file_types': {}}
        df = pd.DataFrame(all_links)
        return {
            'internal_links': df[df['type'] == 'internal']['url'].value_counts().head(20).to_dict(),
            'external_domains': df[df['type'] == 'external']['url'].apply(lambda x: urlparse(x).netloc).value_counts().head(10).to_dict(),
            'common_anchors': df['anchor'].value_counts().head(10).to_dict(),
            'file_types': df['file_type'].value_counts().to_dict()
        }

    def _generate_seo_recommendations(self, results: List[Dict]) -> List[str]:
        """Generate SEO recommendations based on the issues found during the analysis."""
        successful = [r for r in results if r.get('status') == 'success']
        if not successful:
            return ["No content could be analyzed successfully"]
        recs = []
        missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
        if missing_titles:
            recs.append(f"📌 Add titles to {missing_titles} pages")
        missing_descriptions = sum(1 for r in successful if not r.get('metadata', {}).get('description'))
        if missing_descriptions:
            recs.append(f"📌 Add meta descriptions to {missing_descriptions} pages")
        short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
        if short_content:
            recs.append(f"📝 Expand the content on {short_content} pages (fewer than 300 words)")
        all_links = [link for r in results for link in r.get('links', [])]
        if all_links:
            df_links = pd.DataFrame(all_links)
            internal_links = df_links[df_links['type'] == 'internal']
            if len(internal_links) > 100:
                recs.append(f"🔗 Optimize the internal link structure ({len(internal_links)} links)")
        return recs if recs else ["✅ No critical SEO issues detected"]

    def plot_internal_links(self, links_data: Dict) -> Any:
        """Render a horizontal bar chart of the top 20 internal links."""
        internal_links = links_data.get('internal_links', {})
        if not internal_links:
            return {}
        fig, ax = plt.subplots()
        names = list(internal_links.keys())
        counts = list(internal_links.values())
        ax.barh(names, counts)
        ax.set_xlabel("Number of links")
        ax.set_title("Top 20 Internal Links")
        plt.tight_layout()
        return fig
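
A brief usage sketch (not part of the commit) may help readers trying the analyzer outside this Space. It assumes the file is importable as seo_analyzer, that the sibling utils.py referenced by the import at the top exposes sanitize_filename (a hypothetical stand-in is outlined in the comments), and that the facebook/bart-large-cnn, dslim/bert-base-NER, paraphrase-multilingual-MiniLM-L12-v2 and es_core_news_lg models can be downloaded locally. The sitemap URL below is a placeholder, not a real endpoint from this project.

# sketch.py -- illustrative only, under the assumptions stated above
#
# Hypothetical stand-in for the utils.sanitize_filename dependency, which this
# commit does not include; it only needs to map a URL path to a
# filesystem-safe relative path, e.g.:
#
#     import re
#     def sanitize_filename(path: str) -> str:
#         return re.sub(r'[^A-Za-z0-9._/-]', '_', path)

from seo_analyzer import SEOSpaceAnalyzer

analyzer = SEOSpaceAnalyzer(max_urls=10, max_workers=2)

# analyze_sitemap returns the 5-tuple documented in its docstring
stats, recommendations, content_analysis, links, details = analyzer.analyze_sitemap(
    "https://example.com/sitemap.xml"  # placeholder sitemap URL
)

print(stats)                     # aggregate counts and average word count
for rec in recommendations:      # SEO recommendations as plain strings
    print(rec)

fig = analyzer.plot_internal_links(links)  # matplotlib Figure, or {} if no internal links were found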