Merlintxu committed on
Commit 63fe26b · verified · 1 Parent(s): b5c209f

Update app.py

Files changed (1)
  1. app.py +197 -149
app.py CHANGED
@@ -15,38 +15,18 @@ from bs4 import BeautifulSoup
 from pathlib import Path
 from datetime import datetime
 from collections import defaultdict
-
-import gradio as gr
-import matplotlib.pyplot as plt
-from sklearn.feature_extractio
-import json
-import logging
-import re
-import requests
-import hashlib
-import PyPDF2
-import numpy as np
-import pandas as pd
-from io import BytesIO
-from typing import List, Dict, Optional
-from urllib.parse import urlparse, urljoin
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from bs4 import BeautifulSoup
-from pathlib import Path
-from datetime import datetime
-from collections import defaultdict
-
-import gradio as gr
-import matplotlib.pyplot as plt
 from sklearn.feature_extraction.text import TfidfVectorizer
 from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry
+from urllib3.util.retry import Retry
 from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
 from sentence_transformers import SentenceTransformer
 import spacy
 import torch
 
-# Configuración inicial
+import gradio as gr
+import matplotlib.pyplot as plt
+
+# Configuración de logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -55,12 +35,11 @@ class SEOSpaceAnalyzer:
         self.session = self._configure_session()
         self.models = self._load_models()
         self.base_dir = Path("content_storage")
-        self.link_analysis = defaultdict(list)
-        self.documents = []
+        self.base_dir.mkdir(exist_ok=True)
         self.current_analysis = {}
 
     def _configure_session(self):
-        """Configuración avanzada de sesión HTTP con reintentos"""
+        """Configura sesión HTTP con reintentos"""
         session = requests.Session()
         retry = Retry(
             total=3,
@@ -76,201 +55,270 @@ class SEOSpaceAnalyzer:
         return session
 
     def _load_models(self):
-        """Carga modelos de Hugging Face optimizados"""
+        """Carga modelos optimizados para Hugging Face"""
         device = 0 if torch.cuda.is_available() else -1
         return {
             'summarizer': pipeline("summarization",
                                    model="facebook/bart-large-cnn",
                                    device=device),
             'ner': pipeline("ner",
-                            model="dslim/bert-base-NER",
-                            aggregation_strategy="simple",
-                            device=device),
-            'qa': pipeline("question-answering",
-                           model="deepset/roberta-base-squad2",
-                           device=device),
+                            model="dslim/bert-base-NER",
+                            device=device),
             'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
             'spacy': spacy.load("es_core_news_lg")
         }
 
-    def _process_url(self, url):
-        """Procesa una URL y extrae su contenido"""
+    def analyze_sitemap(self, sitemap_url: str):
+        """Analiza un sitemap completo"""
+        try:
+            urls = self._parse_sitemap(sitemap_url)
+            if not urls:
+                return {"error": "No se pudieron extraer URLs del sitemap"}
+
+            results = []
+            with ThreadPoolExecutor(max_workers=4) as executor:
+                futures = [executor.submit(self._process_url, url) for url in urls[:50]]  # Limitar para demo
+                for future in as_completed(futures):
+                    results.append(future.result())
+
+            self.current_analysis = {
+                'stats': self._calculate_stats(results),
+                'content_analysis': self._analyze_content(results),
+                'links': self._analyze_links(results),
+                'recommendations': self._generate_seo_recommendations(results)
+            }
+
+            return self.current_analysis
+
+        except Exception as e:
+            logger.error(f"Error en análisis: {str(e)}")
+            return {"error": str(e)}
+
+    def _process_url(self, url: str):
+        """Procesa una URL individual"""
         try:
-            response = self.session.get(url, timeout=15)
+            response = self.session.get(url, timeout=10)
             response.raise_for_status()
 
             content_type = response.headers.get('Content-Type', '')
-            result = {'url': url, 'links': []}
+            result = {'url': url, 'status': 'success'}
 
             if 'application/pdf' in content_type:
                 result.update(self._process_pdf(response.content))
             elif 'text/html' in content_type:
                 result.update(self._process_html(response.text, url))
-
-            self._save_content(url, response.content)
-            return result
 
+            return result
         except Exception as e:
-            logger.error(f"Error procesando {url}: {str(e)}")
-            return {'url': url, 'error': str(e)}
+            logger.warning(f"Error procesando {url}: {str(e)}")
+            return {'url': url, 'status': 'error', 'error': str(e)}
 
-    def _process_html(self, html, base_url):
+    def _process_html(self, html: str, base_url: str):
         """Procesa contenido HTML"""
         soup = BeautifulSoup(html, 'lxml')
+        clean_text = self._clean_text(soup.get_text())
+
         return {
-            'content': self._clean_text(soup.get_text()),
             'type': 'html',
-            'metadata': self._extract_metadata(soup),
-            'links': self._extract_links(soup, base_url)
+            'content': clean_text,
+            'word_count': len(clean_text.split()),
+            'links': self._extract_links(soup, base_url),
+            'metadata': self._extract_metadata(soup)
         }
 
-    def _process_pdf(self, content):
+    def _process_pdf(self, content: bytes):
         """Procesa documentos PDF"""
         text = ""
         with BytesIO(content) as pdf_file:
             reader = PyPDF2.PdfReader(pdf_file)
             for page in reader.pages:
                 text += page.extract_text()
-
+
+        clean_text = self._clean_text(text)
         return {
-            'content': self._clean_text(text),
             'type': 'pdf',
-            'metadata': {'pages': len(reader.pages)}
+            'content': clean_text,
+            'word_count': len(clean_text.split()),
+            'page_count': len(reader.pages)
         }
 
-    def _extract_links(self, soup, base_url):
+    def _clean_text(self, text: str):
+        """Limpieza avanzada de texto"""
+        text = re.sub(r'\s+', ' ', text)
+        return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
+
+    def _extract_links(self, soup: BeautifulSoup, base_url: str):
         """Extrae y clasifica enlaces"""
         links = []
         for tag in soup.find_all('a', href=True):
-            href = tag['href']
-            full_url = urljoin(base_url, href)
-            link_type = 'internal' if urlparse(full_url).netloc == urlparse(base_url).netloc else 'external'
-
-            links.append({
-                'url': full_url,
-                'type': link_type,
-                'anchor': self._clean_text(tag.text),
-                'file_type': self._get_file_type(href)
-            })
+            try:
+                full_url = urljoin(base_url, tag['href'])
+                parsed = urlparse(full_url)
+
+                links.append({
+                    'url': full_url,
+                    'type': 'internal' if parsed.netloc == urlparse(base_url).netloc else 'external',
+                    'anchor': self._clean_text(tag.text)[:100],
+                    'file_type': self._get_file_type(parsed.path)
+                })
+            except:
+                continue
         return links
 
-    def _get_file_type(self, url):
-        """Determina el tipo de archivo por extensión"""
-        ext = Path(urlparse(url).path).suffix.lower()
+    def _get_file_type(self, path: str):
+        """Determina tipo de archivo por extensión"""
+        ext = Path(path).suffix.lower()
         return ext[1:] if ext else 'html'
 
-    def _clean_text(self, text):
-        """Limpieza avanzada de texto"""
-        text = re.sub(r'\s+', ' ', text)
-        return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
-
-    def _save_content(self, url, content):
-        """Almacena el contenido descargado"""
-        path = urlparse(url).path.lstrip('/')
-        save_path = self.base_dir / urlparse(url).netloc / path
-        save_path.parent.mkdir(parents=True, exist_ok=True)
-
-        with open(save_path.with_suffix(self._get_file_type(url)), 'wb') as f:
-            f.write(content)
-
-    def analyze_sitemap(self, sitemap_url):
-        """Analiza todo el sitemap y genera reportes"""
-        urls = self._parse_sitemap(sitemap_url)
-        results = []
-
-        with ThreadPoolExecutor(max_workers=4) as executor:
-            futures = [executor.submit(self._process_url, url) for url in urls]
-            for future in as_completed(futures):
-                results.append(future.result())
-                progress(len(results)/len(urls))
-
-        self.current_analysis = {
-            'basic_stats': self._calculate_stats(results),
-            'content_analysis': self._analyze_content(results),
-            'link_analysis': self._analyze_links(results),
-            'seo_recommendations': self._generate_recommendations(results)
-        }
-
-        return self.current_analysis
-
-    def _parse_sitemap(self, sitemap_url):
-        """Parsea sitemaps XML incluyendo sitemaps indexados"""
-        # Implementación de parsing de sitemap (similar a versiones anteriores)
-        return []
+    def _extract_metadata(self, soup: BeautifulSoup):
+        """Extrae metadatos SEO"""
+        metadata = {'title': '', 'description': '', 'keywords': []}
+
+        # Título
+        if soup.title:
+            metadata['title'] = soup.title.string.strip()
+
+        # Meta tags
+        for meta in soup.find_all('meta'):
+            if meta.get('name') == 'description':
+                metadata['description'] = meta.get('content', '')[:500]
+            elif meta.get('name') == 'keywords':
+                metadata['keywords'] = [kw.strip() for kw in meta.get('content', '').split(',')]
+
+        return metadata
+
+    def _parse_sitemap(self, sitemap_url: str):
+        """Parsea sitemap XML básico"""
+        try:
+            response = self.session.get(sitemap_url)
+            response.raise_for_status()
+
+            urls = []
+            soup = BeautifulSoup(response.text, 'lxml')
+
+            # Sitemap index
+            for loc in soup.find_all('loc'):
+                url = loc.text.strip()
+                if url.endswith('.xml') and url != sitemap_url:
+                    urls.extend(self._parse_sitemap(url))
+                else:
+                    urls.append(url)
+
+            return list(set(urls))
+        except Exception as e:
+            logger.error(f"Error parsing sitemap: {str(e)}")
+            return []
 
-    def _calculate_stats(self, results):
-        """Calcula estadísticas básicas del análisis"""
+    def _calculate_stats(self, results: List[Dict]):
+        """Calcula estadísticas básicas"""
+        successful = [r for r in results if r.get('status') == 'success']
+
         return {
             'total_urls': len(results),
-            'content_types': pd.Series([r.get('type', 'unknown') for r in results]).value_counts().to_dict(),
-            'avg_content_length': np.mean([len(r.get('content', '')) for r in results])
+            'successful': len(successful),
+            'failed': len(results) - len(successful),
+            'content_types': pd.Series([r.get('type', 'unknown') for r in successful]).value_counts().to_dict(),
+            'avg_word_count': np.mean([r.get('word_count', 0) for r in successful])
         }
 
-    def create_report(self):
-        """Crea un reporte descargable en múltiples formatos"""
-        report = {
-            'timestamp': datetime.now().isoformat(),
-            'analysis': self.current_analysis
-        }
-
-        # Guardar en JSON
-        json_path = self.base_dir / 'seo_report.json'
-        with open(json_path, 'w') as f:
-            json.dump(report, f)
-
-        # Crear CSV con enlaces
-        df = pd.DataFrame([link for result in self.current_analysis['link_analysis'] for link in result['links']])
-        csv_path = self.base_dir / 'links_analysis.csv'
-        df.to_csv(csv_path, index=False)
-
-        return [str(json_path), str(csv_path)]
-
-    def create_visualization(self):
-        """Genera visualizaciones interactivas"""
-        fig, ax = plt.subplots()
-        pd.Series(self.current_analysis['basic_stats']['content_types']).plot.pie(
-            ax=ax,
-            title='Distribución de Tipos de Contenido',
-            ylabel=''
-        )
-        return fig
+    def _analyze_content(self, results: List[Dict]):
+        """Analiza contenido con NLP"""
+        successful = [r for r in results if r.get('status') == 'success']
+        texts = [r.get('content', '') for r in successful]
+
+        # Análisis de temas principales
+        vectorizer = TfidfVectorizer(stop_words=list(spacy.lang.es.stop_words.STOP_WORDS))
+        try:
+            tfidf = vectorizer.fit_transform(texts)
+            top_keywords = vectorizer.get_feature_names_out()[np.argsort(tfidf.sum(axis=0).A1)][-10:][::-1]
+        except:
+            top_keywords = []
+
+        return {
+            'top_keywords': list(top_keywords),
+            'content_samples': [t[:500] + '...' for t in texts[:3]]  # Muestras de contenido
+        }
+
+    def _analyze_links(self, results: List[Dict]):
+        """Analiza estructura de enlaces"""
+        all_links = []
+        for result in results:
+            if result.get('links'):
+                all_links.extend(result['links'])
+
+        if not all_links:
+            return {}
+
+        df = pd.DataFrame(all_links)
+        return {
+            'internal_links': df[df['type'] == 'internal']['url'].value_counts().to_dict(),
+            'external_domains': df[df['type'] == 'external']['url'].apply(lambda x: urlparse(x).netloc).value_counts().to_dict(),
+            'common_anchors': df['anchor'].value_counts().head(10).to_dict()
+        }
+
+    def _generate_seo_recommendations(self, results: List[Dict]):
+        """Genera recomendaciones SEO"""
+        successful = [r for r in results if r.get('status') == 'success']
+
+        recs = []
+
+        # Revisar metadatos
+        missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
+        if missing_titles:
+            recs.append(f"Añadir títulos a {missing_titles} páginas")
+
+        # Revisar contenido corto
+        short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
+        if short_content:
+            recs.append(f"Ampliar contenido en {short_content} páginas (menos de 300 palabras)")
+
+        return recs if recs else ["No se detectaron problemas críticos de SEO"]
 
-# Interface Gradio
+# Interfaz Gradio
 def create_interface():
     analyzer = SEOSpaceAnalyzer()
 
     with gr.Blocks(title="SEO Analyzer Pro", theme=gr.themes.Soft()) as interface:
-        gr.Markdown("# 🕵️ SEO Analyzer Pro")
+        gr.Markdown("""
+        # 🕵️ SEO Analyzer Pro
+        *Analizador SEO avanzado con modelos de lenguaje*
+        """)
 
         with gr.Row():
-            sitemap_url = gr.Textbox(label="URL del Sitemap", placeholder="https://www.ing.es/ennaranja/sitemap.xml")
-            analyze_btn = gr.Button("Analizar", variant="primary")
+            with gr.Column():
+                sitemap_url = gr.Textbox(
+                    label="URL del Sitemap",
+                    placeholder="https://ejemplo.com/sitemap.xml",
+                    interactive=True
+                )
+                analyze_btn = gr.Button("Analizar", variant="primary")
+
+            with gr.Column():
+                status = gr.Textbox(label="Estado", interactive=False)
 
-        with gr.Tab("Resultados"):
-            json_output = gr.JSON(label="Análisis Completo")
-            plot_output = gr.Plot(label="Visualización")
-
-        with gr.Tab("Enlaces"):
-            internal_links = gr.Dataframe(label="Enlaces Internos")
-            external_links = gr.Dataframe(label="Enlaces Externos")
-
-        with gr.Tab("Descargas"):
-            report_download = gr.Files(label="Descargar Reporte")
-            download_btn = gr.Button("Generar Reporte", variant="secondary")
+        with gr.Tabs():
+            with gr.Tab("Resumen"):
+                stats = gr.JSON(label="Estadísticas")
+                recommendations = gr.JSON(label="Recomendaciones SEO")
+
+            with gr.Tab("Contenido"):
+                content_analysis = gr.JSON(label="Análisis de Contenido")
+                content_samples = gr.JSON(label="Muestras de Contenido")
+
+            with gr.Tab("Enlaces"):
+                links_analysis = gr.JSON(label="Análisis de Enlaces")
+                links_plot = gr.Plot()
 
+        # Event handlers
         analyze_btn.click(
             fn=analyzer.analyze_sitemap,
             inputs=sitemap_url,
-            outputs=[json_output, plot_output, internal_links, external_links]
-        )
-
-        download_btn.click(
-            fn=analyzer.create_report,
-            outputs=report_download
+            outputs=[stats, recommendations, content_analysis, links_analysis],
+            api_name="analyze"
         )
 
     return interface
 
 if __name__ == "__main__":
-    interface = create_interface()
-    interface.launch(server_name="0.0.0.0", server_port=7860)
+    app = create_interface()
+    app.launch(server_name="0.0.0.0", server_port=7860)
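
For a quick local check of the updated analyzer outside the Gradio UI, a minimal sketch along these lines should work, assuming the new app.py is importable from the working directory and its dependencies (including the es_core_news_lg spaCy model) are installed; the sitemap URL below is a placeholder:

# Minimal smoke-test sketch (assumption: the updated app.py above is on the import path
# and its heavy dependencies are installed; the URL is a placeholder, not from the commit).
from app import SEOSpaceAnalyzer

analyzer = SEOSpaceAnalyzer()  # loads the Hugging Face and spaCy models at init
result = analyzer.analyze_sitemap("https://example.com/sitemap.xml")

if "error" in result:
    print("Analysis failed:", result["error"])
else:
    print("Stats:", result["stats"])                       # totals, content types, avg word count
    print("Recommendations:", result["recommendations"])   # output of _generate_seo_recommendations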