Merlintxu committed · verified
Commit 3969e8a · Parent(s): 63fe26b

Update app.py

Files changed (1): app.py (+254, -109)

app.py CHANGED
@@ -8,7 +8,7 @@ import PyPDF2
8
  import numpy as np
9
  import pandas as pd
10
  from io import BytesIO
11
- from typing import List, Dict, Optional
12
  from urllib.parse import urlparse, urljoin
13
  from concurrent.futures import ThreadPoolExecutor, as_completed
14
  from bs4 import BeautifulSoup
@@ -18,7 +18,7 @@ from collections import defaultdict
18
  from sklearn.feature_extraction.text import TfidfVectorizer
19
  from requests.adapters import HTTPAdapter
20
  from urllib3.util.retry import Retry
21
- from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
22
  from sentence_transformers import SentenceTransformer
23
  import spacy
24
  import torch
@@ -27,7 +27,10 @@ import gradio as gr
27
  import matplotlib.pyplot as plt
28
 
29
  # Configuración de logging
30
- logging.basicConfig(level=logging.INFO)
31
  logger = logging.getLogger(__name__)
32
 
33
  class SEOSpaceAnalyzer:
@@ -35,18 +38,20 @@ class SEOSpaceAnalyzer:
35
  self.session = self._configure_session()
36
  self.models = self._load_models()
37
  self.base_dir = Path("content_storage")
38
- self.base_dir.mkdir(exist_ok=True)
39
  self.current_analysis = {}
40
 
41
- def _configure_session(self):
42
  """Configura sesión HTTP con reintentos"""
43
  session = requests.Session()
44
  retry = Retry(
45
  total=3,
46
  backoff_factor=1,
47
- status_forcelist=[500, 502, 503, 504]
48
  )
49
  adapter = HTTPAdapter(max_retries=retry)
50
  session.mount('https://', adapter)
51
  session.headers.update({
52
  'User-Agent': 'Mozilla/5.0 (compatible; SEOBot/1.0)',
@@ -54,21 +59,30 @@ class SEOSpaceAnalyzer:
54
  })
55
  return session
56
 
57
- def _load_models(self):
58
  """Carga modelos optimizados para Hugging Face"""
59
- device = 0 if torch.cuda.is_available() else -1
60
- return {
61
- 'summarizer': pipeline("summarization",
62
- model="facebook/bart-large-cnn",
63
- device=device),
64
- 'ner': pipeline("ner",
65
- model="dslim/bert-base-NER",
66
- device=device),
67
- 'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
68
- 'spacy': spacy.load("es_core_news_lg")
69
- }
70
-
71
- def analyze_sitemap(self, sitemap_url: str):
72
  """Analiza un sitemap completo"""
73
  try:
74
  urls = self._parse_sitemap(sitemap_url)
@@ -77,15 +91,21 @@ class SEOSpaceAnalyzer:
77
 
78
  results = []
79
  with ThreadPoolExecutor(max_workers=4) as executor:
80
- futures = [executor.submit(self._process_url, url) for url in urls[:50]] # Limitar para demo
81
  for future in as_completed(futures):
82
- results.append(future.result())
83
 
84
  self.current_analysis = {
85
  'stats': self._calculate_stats(results),
86
  'content_analysis': self._analyze_content(results),
87
  'links': self._analyze_links(results),
88
- 'recommendations': self._generate_seo_recommendations(results)
89
  }
90
 
91
  return self.current_analysis
@@ -94,10 +114,10 @@ class SEOSpaceAnalyzer:
94
  logger.error(f"Error en análisis: {str(e)}")
95
  return {"error": str(e)}
96
 
97
- def _process_url(self, url: str):
98
  """Procesa una URL individual"""
99
  try:
100
- response = self.session.get(url, timeout=10)
101
  response.raise_for_status()
102
 
103
  content_type = response.headers.get('Content-Type', '')
@@ -108,14 +128,15 @@ class SEOSpaceAnalyzer:
108
  elif 'text/html' in content_type:
109
  result.update(self._process_html(response.text, url))
110
 
111
  return result
112
- except Exception as e:
113
  logger.warning(f"Error procesando {url}: {str(e)}")
114
  return {'url': url, 'status': 'error', 'error': str(e)}
115
 
116
- def _process_html(self, html: str, base_url: str):
117
  """Procesa contenido HTML"""
118
- soup = BeautifulSoup(html, 'lxml')
119
  clean_text = self._clean_text(soup.get_text())
120
 
121
  return {
@@ -126,90 +147,135 @@ class SEOSpaceAnalyzer:
126
  'metadata': self._extract_metadata(soup)
127
  }
128
 
129
- def _process_pdf(self, content: bytes):
130
  """Procesa documentos PDF"""
131
- text = ""
132
- with BytesIO(content) as pdf_file:
133
- reader = PyPDF2.PdfReader(pdf_file)
134
- for page in reader.pages:
135
- text += page.extract_text()
136
-
137
- clean_text = self._clean_text(text)
138
- return {
139
- 'type': 'pdf',
140
- 'content': clean_text,
141
- 'word_count': len(clean_text.split()),
142
- 'page_count': len(reader.pages)
143
- }
144
-
145
- def _clean_text(self, text: str):
146
  """Limpieza avanzada de texto"""
147
  text = re.sub(r'\s+', ' ', text)
148
  return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
149
 
150
- def _extract_links(self, soup: BeautifulSoup, base_url: str):
151
  """Extrae y clasifica enlaces"""
152
  links = []
153
  for tag in soup.find_all('a', href=True):
154
  try:
155
- full_url = urljoin(base_url, tag['href'])
156
  parsed = urlparse(full_url)
157
 
158
  links.append({
159
  'url': full_url,
160
- 'type': 'internal' if parsed.netloc == urlparse(base_url).netloc else 'external',
161
- 'anchor': self._clean_text(tag.text)[:100],
162
  'file_type': self._get_file_type(parsed.path)
163
  })
164
- except:
165
  continue
166
  return links
167
 
168
- def _get_file_type(self, path: str):
169
  """Determina tipo de archivo por extensión"""
170
  ext = Path(path).suffix.lower()
171
  return ext[1:] if ext else 'html'
172
 
173
- def _extract_metadata(self, soup: BeautifulSoup):
174
  """Extrae metadatos SEO"""
175
- metadata = {'title': '', 'description': '', 'keywords': []}
176
 
177
- # Título
178
- if soup.title:
179
- metadata['title'] = soup.title.string.strip()
180
 
181
- # Meta tags
182
  for meta in soup.find_all('meta'):
183
- if meta.get('name') == 'description':
184
- metadata['description'] = meta.get('content', '')[:500]
185
- elif meta.get('name') == 'keywords':
186
- metadata['keywords'] = [kw.strip() for kw in meta.get('content', '').split(',')]
187
 
188
  return metadata
189
 
190
- def _parse_sitemap(self, sitemap_url: str):
191
  """Parsea sitemap XML básico"""
192
  try:
193
- response = self.session.get(sitemap_url)
194
  response.raise_for_status()
195
 
196
  urls = []
197
- soup = BeautifulSoup(response.text, 'lxml')
198
 
199
- # Sitemap index
200
- for loc in soup.find_all('loc'):
201
- url = loc.text.strip()
202
- if url.endswith('.xml') and url != sitemap_url:
203
- urls.extend(self._parse_sitemap(url))
204
- else:
205
- urls.append(url)
206
 
207
- return list(set(urls))
208
  except Exception as e:
209
- logger.error(f"Error parsing sitemap: {str(e)}")
210
  return []
211
 
212
- def _calculate_stats(self, results: List[Dict]):
213
  """Calcula estadísticas básicas"""
214
  successful = [r for r in results if r.get('status') == 'success']
215
 
@@ -218,28 +284,40 @@ class SEOSpaceAnalyzer:
218
  'successful': len(successful),
219
  'failed': len(results) - len(successful),
220
  'content_types': pd.Series([r.get('type', 'unknown') for r in successful]).value_counts().to_dict(),
221
- 'avg_word_count': np.mean([r.get('word_count', 0) for r in successful])
222
  }
223
 
224
- def _analyze_content(self, results: List[Dict]):
225
  """Analiza contenido con NLP"""
226
- successful = [r for r in results if r.get('status') == 'success']
227
- texts = [r.get('content', '') for r in successful]
228
 
229
  # Análisis de temas principales
230
- vectorizer = TfidfVectorizer(stop_words=list(spacy.lang.es.stop_words.STOP_WORDS))
231
  try:
232
  tfidf = vectorizer.fit_transform(texts)
233
- top_keywords = vectorizer.get_feature_names_out()[np.argsort(tfidf.sum(axis=0).A1][-10:][::-1]
234
- except:
235
  top_keywords = []
236
 
237
  return {
238
- 'top_keywords': list(top_keywords),
239
- 'content_samples': [t[:500] + '...' for t in texts[:3]] # Muestras de contenido
240
  }
241
 
242
- def _analyze_links(self, results: List[Dict]):
243
  """Analiza estructura de enlaces"""
244
  all_links = []
245
  for result in results:
@@ -247,78 +325,145 @@ class SEOSpaceAnalyzer:
247
  all_links.extend(result['links'])
248
 
249
  if not all_links:
250
- return {}
251
 
252
  df = pd.DataFrame(all_links)
253
  return {
254
- 'internal_links': df[df['type'] == 'internal']['url'].value_counts().to_dict(),
255
- 'external_domains': df[df['type'] == 'external']['url'].apply(lambda x: urlparse(x).netloc).value_counts().to_dict(),
256
- 'common_anchors': df['anchor'].value_counts().head(10).to_dict()
257
  }
258
 
259
- def _generate_seo_recommendations(self, results: List[Dict]):
260
  """Genera recomendaciones SEO"""
261
  successful = [r for r in results if r.get('status') == 'success']
262
 
263
  recs = []
264
 
265
  # Revisar metadatos
266
  missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
267
  if missing_titles:
268
- recs.append(f"Añadir títulos a {missing_titles} páginas")
269
 
270
  # Revisar contenido corto
271
  short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
272
  if short_content:
273
- recs.append(f"Ampliar contenido en {short_content} páginas (menos de 300 palabras)")
274
 
275
- return recs if recs else ["No se detectaron problemas críticos de SEO"]
276
 
277
- # Interfaz Gradio
278
  def create_interface():
279
  analyzer = SEOSpaceAnalyzer()
280
 
281
  with gr.Blocks(title="SEO Analyzer Pro", theme=gr.themes.Soft()) as interface:
282
  gr.Markdown("""
283
  # 🕵️ SEO Analyzer Pro
284
- *Analizador SEO avanzado con modelos de lenguaje*
285
  """)
286
 
287
  with gr.Row():
288
  with gr.Column():
289
- sitemap_url = gr.Textbox(
290
  label="URL del Sitemap",
291
  placeholder="https://ejemplo.com/sitemap.xml",
292
  interactive=True
293
  )
294
- analyze_btn = gr.Button("Analizar", variant="primary")
295
 
296
  with gr.Column():
297
- status = gr.Textbox(label="Estado", interactive=False)
298
 
299
  with gr.Tabs():
300
- with gr.Tab("Resumen"):
301
- stats = gr.JSON(label="Estadísticas")
302
- recommendations = gr.JSON(label="Recomendaciones SEO")
303
 
304
- with gr.Tab("Contenido"):
305
- content_analysis = gr.JSON(label="Análisis de Contenido")
306
- content_samples = gr.JSON(label="Muestras de Contenido")
307
 
308
- with gr.Tab("Enlaces"):
309
- links_analysis = gr.JSON(label="Análisis de Enlaces")
310
- links_plot = gr.Plot()
311
 
312
  # Event handlers
313
  analyze_btn.click(
314
  fn=analyzer.analyze_sitemap,
315
- inputs=sitemap_url,
316
- outputs=[stats, recommendations, content_analysis, links_analysis],
317
- api_name="analyze"
318
  )
319
 
320
  return interface
321
 
322
  if __name__ == "__main__":
323
  app = create_interface()
324
- app.launch(server_name="0.0.0.0", server_port=7860)
 
8
  import numpy as np
9
  import pandas as pd
10
  from io import BytesIO
11
+ from typing import List, Dict, Optional, Tuple
12
  from urllib.parse import urlparse, urljoin
13
  from concurrent.futures import ThreadPoolExecutor, as_completed
14
  from bs4 import BeautifulSoup
 
18
  from sklearn.feature_extraction.text import TfidfVectorizer
19
  from requests.adapters import HTTPAdapter
20
  from urllib3.util.retry import Retry
21
+ from transformers import pipeline
22
  from sentence_transformers import SentenceTransformer
23
  import spacy
24
  import torch
 
27
  import matplotlib.pyplot as plt
28
 
29
  # Configuración de logging
30
+ logging.basicConfig(
31
+ level=logging.INFO,
32
+ format='%(asctime)s - %(levelname)s - %(message)s'
33
+ )
34
  logger = logging.getLogger(__name__)
35
 
36
  class SEOSpaceAnalyzer:
 
38
  self.session = self._configure_session()
39
  self.models = self._load_models()
40
  self.base_dir = Path("content_storage")
41
+ self.base_dir.mkdir(parents=True, exist_ok=True)
42
  self.current_analysis = {}
43
 
44
+ def _configure_session(self) -> requests.Session:
45
  """Configura sesión HTTP con reintentos"""
46
  session = requests.Session()
47
  retry = Retry(
48
  total=3,
49
  backoff_factor=1,
50
+ status_forcelist=[500, 502, 503, 504],
51
+ allowed_methods=['GET', 'HEAD']
52
  )
53
  adapter = HTTPAdapter(max_retries=retry)
54
+ session.mount('http://', adapter)
55
  session.mount('https://', adapter)
56
  session.headers.update({
57
  'User-Agent': 'Mozilla/5.0 (compatible; SEOBot/1.0)',
 
59
  })
60
  return session
61
 
62
+ def _load_models(self) -> Dict:
63
  """Carga modelos optimizados para Hugging Face"""
64
+ try:
65
+ device = 0 if torch.cuda.is_available() else -1
66
+ return {
67
+ 'summarizer': pipeline(
68
+ "summarization",
69
+ model="facebook/bart-large-cnn",
70
+ device=device
71
+ ),
72
+ 'ner': pipeline(
73
+ "ner",
74
+ model="dslim/bert-base-NER",
75
+ aggregation_strategy="simple",
76
+ device=device
77
+ ),
78
+ 'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
79
+ 'spacy': spacy.load("es_core_news_lg")
80
+ }
81
+ except Exception as e:
82
+ logger.error(f"Error loading models: {e}")
83
+ raise
84
+
85
+ def analyze_sitemap(self, sitemap_url: str) -> Dict:
86
  """Analiza un sitemap completo"""
87
  try:
88
  urls = self._parse_sitemap(sitemap_url)
 
91
 
92
  results = []
93
  with ThreadPoolExecutor(max_workers=4) as executor:
94
+ futures = {executor.submit(self._process_url, url): url for url in urls[:20]}  # Limitar para demo
95
  for future in as_completed(futures):
96
+ try:
97
+ results.append(future.result())
98
+ except Exception as e:
99
+ url = futures[future]
100
+ logger.error(f"Error processing {url}: {e}")
101
+ results.append({'url': url, 'status': 'error', 'error': str(e)})
102
 
103
  self.current_analysis = {
104
  'stats': self._calculate_stats(results),
105
  'content_analysis': self._analyze_content(results),
106
  'links': self._analyze_links(results),
107
+ 'recommendations': self._generate_seo_recommendations(results),
108
+ 'timestamp': datetime.now().isoformat()
109
  }
110
 
111
  return self.current_analysis
 
114
  logger.error(f"Error en análisis: {str(e)}")
115
  return {"error": str(e)}
116
 
117
+ def _process_url(self, url: str) -> Dict:
118
  """Procesa una URL individual"""
119
  try:
120
+ response = self.session.get(url, timeout=15)
121
  response.raise_for_status()
122
 
123
  content_type = response.headers.get('Content-Type', '')
 
128
  elif 'text/html' in content_type:
129
  result.update(self._process_html(response.text, url))
130
 
131
+ self._save_content(url, response.content)
132
  return result
133
+ except requests.exceptions.RequestException as e:
134
  logger.warning(f"Error procesando {url}: {str(e)}")
135
  return {'url': url, 'status': 'error', 'error': str(e)}
136
 
137
+ def _process_html(self, html: str, base_url: str) -> Dict:
138
  """Procesa contenido HTML"""
139
+ soup = BeautifulSoup(html, 'html.parser')
140
  clean_text = self._clean_text(soup.get_text())
141
 
142
  return {
 
147
  'metadata': self._extract_metadata(soup)
148
  }
149
 
150
+ def _process_pdf(self, content: bytes) -> Dict:
151
  """Procesa documentos PDF"""
152
+ try:
153
+ text = ""
154
+ with BytesIO(content) as pdf_file:
155
+ reader = PyPDF2.PdfReader(pdf_file)
156
+ for page in reader.pages:
157
+ text += page.extract_text() or "" # Handle None return
158
+
159
+ clean_text = self._clean_text(text)
160
+ return {
161
+ 'type': 'pdf',
162
+ 'content': clean_text,
163
+ 'word_count': len(clean_text.split()),
164
+ 'page_count': len(reader.pages)
165
+ }
166
+ except PyPDF2.PdfReadError as e:
167
+ logger.error(f"Error reading PDF: {e}")
168
+ return {'type': 'pdf', 'error': str(e)}
169
+
170
+ def _clean_text(self, text: str) -> str:
171
  """Limpieza avanzada de texto"""
172
+ if not text:
173
+ return ""
174
  text = re.sub(r'\s+', ' ', text)
175
  return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
176
 
177
+ def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
178
  """Extrae y clasifica enlaces"""
179
  links = []
180
+ base_netloc = urlparse(base_url).netloc
181
+
182
  for tag in soup.find_all('a', href=True):
183
  try:
184
+ href = tag['href'].strip()
185
+ if not href or href.startswith('javascript:'):
186
+ continue
187
+
188
+ full_url = urljoin(base_url, href)
189
  parsed = urlparse(full_url)
190
 
191
  links.append({
192
  'url': full_url,
193
+ 'type': 'internal' if parsed.netloc == base_netloc else 'external',
194
+ 'anchor': self._clean_text(tag.get_text())[:100],
195
  'file_type': self._get_file_type(parsed.path)
196
  })
197
+ except Exception as e:
198
+ logger.warning(f"Error processing link {tag.get('href')}: {e}")
199
  continue
200
  return links
201
 
202
+ def _get_file_type(self, path: str) -> str:
203
  """Determina tipo de archivo por extensión"""
204
  ext = Path(path).suffix.lower()
205
  return ext[1:] if ext else 'html'
206
 
207
+ def _extract_metadata(self, soup: BeautifulSoup) -> Dict:
208
  """Extrae metadatos SEO"""
209
+ metadata = {
210
+ 'title': '',
211
+ 'description': '',
212
+ 'keywords': [],
213
+ 'og': {}
214
+ }
215
 
216
+ if soup.title and soup.title.string:
217
+ metadata['title'] = soup.title.string.strip()[:200]
218
 
219
  for meta in soup.find_all('meta'):
220
+ name = meta.get('name', '').lower()
221
+ property_ = meta.get('property', '').lower()
222
+ content = meta.get('content', '')
223
+
224
+ if name == 'description':
225
+ metadata['description'] = content[:300]
226
+ elif name == 'keywords':
227
+ metadata['keywords'] = [kw.strip() for kw in content.split(',') if kw.strip()]
228
+ elif property_.startswith('og:'):
229
+ metadata['og'][property_[3:]] = content
230
 
231
  return metadata
232
 
233
+ def _parse_sitemap(self, sitemap_url: str) -> List[str]:
234
  """Parsea sitemap XML básico"""
235
  try:
236
+ response = self.session.get(sitemap_url, timeout=10)
237
  response.raise_for_status()
238
 
239
+ if 'xml' not in response.headers.get('Content-Type', ''):
240
+ logger.warning(f"El sitemap no parece ser XML: {sitemap_url}")
241
+ return []
242
+
243
  urls = []
244
+ soup = BeautifulSoup(response.text, 'lxml-xml') # Usar parser XML específico
245
 
246
+ # Handle sitemap index
247
+ if soup.find('sitemapindex'):
248
+ for sitemap in soup.find_all('loc'):
249
+ url = sitemap.text.strip()
250
+ if url.endswith('.xml'):
251
+ urls.extend(self._parse_sitemap(url))
252
+ else:
253
+ urls = [loc.text.strip() for loc in soup.find_all('loc')]
254
 
255
+ return list(set(url for url in urls if url.startswith('http')))
256
  except Exception as e:
257
+ logger.error(f"Error parsing sitemap {sitemap_url}: {e}")
258
  return []
259
 
260
+ def _save_content(self, url: str, content: bytes) -> None:
261
+ """Almacena el contenido descargado"""
262
+ try:
263
+ parsed = urlparse(url)
264
+ domain_dir = self.base_dir / parsed.netloc
265
+ path = parsed.path.lstrip('/')
266
+
267
+ if not path or path.endswith('/'):
268
+ path = path + 'index.html'
269
+
270
+ save_path = domain_dir / path
271
+ save_path.parent.mkdir(parents=True, exist_ok=True)
272
+
273
+ with open(save_path, 'wb') as f:
274
+ f.write(content)
275
+ except Exception as e:
276
+ logger.error(f"Error saving content for {url}: {e}")
277
+
278
+ def _calculate_stats(self, results: List[Dict]) -> Dict:
279
  """Calcula estadísticas básicas"""
280
  successful = [r for r in results if r.get('status') == 'success']
281
 
 
284
  'successful': len(successful),
285
  'failed': len(results) - len(successful),
286
  'content_types': pd.Series([r.get('type', 'unknown') for r in successful]).value_counts().to_dict(),
287
+ 'avg_word_count': round(np.mean([r.get('word_count', 0) for r in successful]), 1),
288
+ 'failed_urls': [r['url'] for r in results if r.get('status') != 'success']
289
  }
290
 
291
+ def _analyze_content(self, results: List[Dict]) -> Dict:
292
  """Analiza contenido con NLP"""
293
+ successful = [r for r in results if r.get('status') == 'success' and r.get('content')]
294
+ texts = [r['content'] for r in successful if len(r['content'].split()) > 10] # Filtrar contenido muy corto
295
+
296
+ if not texts:
297
+ return {'top_keywords': [], 'content_samples': []}
298
 
299
  # Análisis de temas principales
300
  try:
301
+ vectorizer = TfidfVectorizer(
302
+ stop_words=list(spacy.lang.es.stop_words.STOP_WORDS),
303
+ max_features=50,
304
+ ngram_range=(1, 2)
305
+ )
306
  tfidf = vectorizer.fit_transform(texts)
307
+ feature_names = vectorizer.get_feature_names_out()
308
+ sorted_indices = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[-10:] # Top 10 índices
309
+ top_keywords = feature_names[sorted_indices][::-1].tolist() # Orden descendente
310
+ except Exception as e:
311
+ logger.error(f"Error en análisis TF-IDF: {str(e)}")
312
  top_keywords = []
313
 
314
  return {
315
+ 'top_keywords': top_keywords,
316
+ 'content_samples': [{'url': r['url'], 'sample': r['content'][:500] + '...'}
317
+ for r in successful[:3]] # Muestras de contenido
318
  }
319
 
320
+ def _analyze_links(self, results: List[Dict]) -> Dict:
321
  """Analiza estructura de enlaces"""
322
  all_links = []
323
  for result in results:
 
325
  all_links.extend(result['links'])
326
 
327
  if not all_links:
328
+ return {
329
+ 'internal_links': {},
330
+ 'external_domains': {},
331
+ 'common_anchors': {},
332
+ 'file_types': {}
333
+ }
334
 
335
  df = pd.DataFrame(all_links)
336
+
337
  return {
338
+ 'internal_links': df[df['type'] == 'internal']['url'].value_counts().head(20).to_dict(),
339
+ 'external_domains': df[df['type'] == 'external']['url']
340
+ .apply(lambda x: urlparse(x).netloc)
341
+ .value_counts().head(10).to_dict(),
342
+ 'common_anchors': df['anchor'].value_counts().head(10).to_dict(),
343
+ 'file_types': df['file_type'].value_counts().to_dict()
344
  }
345
 
346
+ def _generate_seo_recommendations(self, results: List[Dict]) -> List[str]:
347
  """Genera recomendaciones SEO"""
348
  successful = [r for r in results if r.get('status') == 'success']
349
+ if not successful:
350
+ return ["No se pudo analizar ningún contenido exitosamente"]
351
 
352
  recs = []
353
 
354
  # Revisar metadatos
355
  missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
356
  if missing_titles:
357
+ recs.append(f"📌 Añadir títulos a {missing_titles} páginas")
358
+
359
+ short_descriptions = sum(1 for r in successful
360
+ if not r.get('metadata', {}).get('description'))
361
+ if short_descriptions:
362
+ recs.append(f"📌 Añadir meta descripciones a {short_descriptions} páginas")
363
 
364
  # Revisar contenido corto
365
  short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
366
  if short_content:
367
+ recs.append(f"📝 Ampliar contenido en {short_content} páginas (menos de 300 palabras)")
368
+
369
+ # Analizar enlaces
370
+ all_links = [link for r in results for link in r.get('links', [])]
371
+ if all_links:
372
+ df_links = pd.DataFrame(all_links)
373
+ broken_links = sum(1 for link in all_links if link['type'] == 'internal')
374
+ if broken_links > 5: # Umbral arbitrario
375
+ recs.append(f"🔗 Revisar {broken_links} enlaces internos (posibles rotos)")
376
 
377
+ return recs if recs else ["No se detectaron problemas críticos de SEO"]
378
 
379
  def create_interface():
380
  analyzer = SEOSpaceAnalyzer()
381
 
382
  with gr.Blocks(title="SEO Analyzer Pro", theme=gr.themes.Soft()) as interface:
383
  gr.Markdown("""
384
  # 🕵️ SEO Analyzer Pro
385
+ **Analizador SEO avanzado con modelos de lenguaje**
386
+
387
+ Sube la URL de un sitemap.xml para analizar todo el sitio web.
388
  """)
389
 
390
  with gr.Row():
391
  with gr.Column():
392
+ sitemap_input = gr.Textbox(
393
  label="URL del Sitemap",
394
  placeholder="https://ejemplo.com/sitemap.xml",
395
  interactive=True
396
  )
397
+ analyze_btn = gr.Button("Analizar Sitio", variant="primary")
398
+
399
+ with gr.Row():
400
+ clear_btn = gr.Button("Limpiar")
401
+ download_btn = gr.Button("Descargar Reporte", variant="secondary")
402
 
403
  with gr.Column():
404
+ status_output = gr.Textbox(label="Estado del Análisis", interactive=False)
405
+ progress_bar = gr.Progress()
406
 
407
  with gr.Tabs():
408
+ with gr.Tab("📊 Resumen"):
409
+ stats_output = gr.JSON(label="Estadísticas Generales")
410
+ recommendations_output = gr.JSON(label="Recomendaciones SEO")
411
+
412
+ with gr.Tab("📝 Contenido"):
413
+ content_output = gr.JSON(label="Análisis de Contenido")
414
+ gr.Examples(
415
+ examples=[
416
+ {"content": "Ejemplo de análisis de contenido..."}
417
+ ],
418
+ inputs=[content_output],
419
+ label="Ejemplos de Salida"
420
+ )
421
 
422
+ with gr.Tab("🔗 Enlaces"):
423
+ links_output = gr.JSON(label="Análisis de Enlaces")
424
+ with gr.Accordion("Visualización de Enlaces", open=False):
425
+ links_plot = gr.Plot()
426
 
427
+ with gr.Tab("📂 Documentos"):
428
+ gr.Markdown("""
429
+ ### Documentos Encontrados
430
+ Los documentos descargados se guardan en la carpeta `content_storage/`
431
+ """)
432
+ file_explorer = gr.FileExplorer(glob="content_storage/**/*")
433
 
434
  # Event handlers
435
  analyze_btn.click(
436
  fn=analyzer.analyze_sitemap,
437
+ inputs=sitemap_input,
438
+ outputs=[stats_output, recommendations_output, content_output, links_output],
439
+ show_progress=True
440
+ )
441
+
442
+ clear_btn.click(
443
+ fn=lambda: [None]*4,
444
+ outputs=[stats_output, recommendations_output, content_output, links_output]
445
+ )
446
+
447
+ download_btn.click(
448
+ fn=lambda: gr.File(value="content_storage/seo_report.json"),
449
+ outputs=gr.File()
450
  )
451
 
452
  return interface
453
 
454
  if __name__ == "__main__":
455
+ # Verificar modelos antes de iniciar
456
+ try:
457
+ spacy.load("es_core_news_lg")
458
+ except OSError:
459
+ logger.error("Modelo spaCy 'es_core_news_lg' no encontrado. Ejecute:")
460
+ logger.error("python -m spacy download es_core_news_lg")
461
+ exit(1)
462
+
463
  app = create_interface()
464
+ app.launch(
465
+ server_name="0.0.0.0",
466
+ server_port=7860,
467
+ show_error=True,
468
+ share=False
469
+ )
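
Usage note: a minimal, hypothetical smoke test of the updated SEOSpaceAnalyzer, run headless without the Gradio UI. It assumes app.py is importable from the working directory, that the spaCy model es_core_news_lg and the Hugging Face models are available (they download on first use), and that the sitemap URL below is only a placeholder.

    # Hypothetical headless check of the refactored analyzer (not part of app.py).
    from app import SEOSpaceAnalyzer

    analyzer = SEOSpaceAnalyzer()
    report = analyzer.analyze_sitemap("https://ejemplo.com/sitemap.xml")  # placeholder sitemap URL

    if "error" in report:
        print("Analysis failed:", report["error"])
    else:
        print(report["stats"])            # URL counts, content types, average word count
        print(report["recommendations"])  # generated SEO recommendations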