Merlintxu committed
Commit 6c55771 · verified · Parent: a1463f9

Update app.py

Files changed (1): app.py (+32 −62)
app.py CHANGED
@@ -32,27 +32,7 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
-def setup_spacy_model():
-    """Download the spaCy model if it is not installed"""
-    try:
-        spacy.load("es_core_news_lg")
-        logger.info("spaCy model 'es_core_news_lg' loaded successfully")
-    except OSError:
-        logger.info("Downloading spaCy model 'es_core_news_lg'...")
-        try:
-            subprocess.run(
-                [sys.executable, "-m", "spacy", "download", "es_core_news_lg"],
-                check=True,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE
-            )
-            logger.info("Model downloaded successfully")
-        except subprocess.CalledProcessError as e:
-            logger.error(f"Error downloading the model: {e.stderr.decode()}")
-            raise RuntimeError("Could not download the spaCy model") from e
 
-# Set up the model before starting
-setup_spacy_model()
 class SEOSpaceAnalyzer:
     def __init__(self):
         self.session = self._configure_session()
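This hunk drops the module-level `setup_spacy_model()` helper, which shelled out to `python -m spacy download` at import time. For reference, spaCy also exposes the same download step programmatically; a minimal sketch of an equivalent load-or-download helper (not part of this commit, shown only for comparison):

```python
import spacy
from spacy.cli import download  # spaCy's programmatic model downloader

MODEL = "es_core_news_lg"

def load_or_download(model_name: str = MODEL):
    """Return the requested pipeline, fetching it on first use."""
    try:
        return spacy.load(model_name)
    except OSError:
        # Same effect as `python -m spacy download es_core_news_lg`,
        # but without spawning a subprocess.
        download(model_name)
        return spacy.load(model_name)
```

The commit instead fails fast at startup (see the final hunk), a reasonable trade-off for a Space where the model is expected to be preinstalled.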
@@ -60,20 +40,21 @@ class SEOSpaceAnalyzer:
         self.base_dir = Path("content_storage")
         self.base_dir.mkdir(parents=True, exist_ok=True)
         self.current_analysis = {}
+
     def _load_models(self) -> Dict:
         """Load models optimized for Hugging Face"""
         try:
             device = 0 if torch.cuda.is_available() else -1
             return {
                 'summarizer': pipeline("summarization", model="facebook/bart-large-cnn", device=device),
-                'ner': pipeline("ner", model="dslim/bert-base-NER", device=device),
+                'ner': pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple", device=device),
                 'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
-                'spacy': spacy.load("es_core_news_lg")  # Now guaranteed to exist
+                'spacy': spacy.load("es_core_news_lg")
             }
         except Exception as e:
             logger.error(f"Error loading models: {e}")
             raise
-
+
     def _configure_session(self) -> requests.Session:
         """Configure an HTTP session with retries"""
         session = requests.Session()
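The functional change in `_load_models` is the added `aggregation_strategy="simple"`, which makes the NER pipeline merge word-piece tokens into whole entity spans. A standalone illustration (the example text is ours, not the app's):

```python
from transformers import pipeline

# Without aggregation, the pipeline emits one prediction per word-piece
# token, e.g. {'entity': 'B-ORG', 'word': 'Hu'}, {'entity': 'I-ORG', 'word': '##gging'}.
raw_ner = pipeline("ner", model="dslim/bert-base-NER")

# With aggregation_strategy="simple", adjacent pieces are merged into spans,
# e.g. {'entity_group': 'ORG', 'word': 'Hugging Face', 'start': ..., 'end': ...}.
grouped_ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

text = "The SEO analyzer runs on Hugging Face."
print(raw_ner(text))
print(grouped_ner(text))
```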
@@ -92,39 +73,15 @@ class SEOSpaceAnalyzer:
         })
         return session
 
-    def _load_models(self) -> Dict:
-        """Load models optimized for Hugging Face"""
-        try:
-            device = 0 if torch.cuda.is_available() else -1
-            return {
-                'summarizer': pipeline(
-                    "summarization",
-                    model="facebook/bart-large-cnn",
-                    device=device
-                ),
-                'ner': pipeline(
-                    "ner",
-                    model="dslim/bert-base-NER",
-                    aggregation_strategy="simple",
-                    device=device
-                ),
-                'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
-                'spacy': spacy.load("es_core_news_lg")
-            }
-        except Exception as e:
-            logger.error(f"Error loading models: {e}")
-            raise
-
-    def analyze_sitemap(self, sitemap_url: str) -> Dict:
-        """Analyze a complete sitemap"""
+    def analyze_sitemap(self, sitemap_url: str) -> Tuple[Dict, List[str], Dict, Dict]:
+        """Analyze a complete sitemap and return its components separately"""
         try:
             urls = self._parse_sitemap(sitemap_url)
             if not urls:
-                return {"error": "Could not extract URLs from the sitemap"}
+                return {"error": "Could not extract URLs from the sitemap"}, [], {}, {}
 
             results = []
             with ThreadPoolExecutor(max_workers=4) as executor:
-                # Fixed: changed ] to } in the dict comprehension
                 futures = {executor.submit(self._process_url, url): url for url in urls[:20]}  # limit for the demo
                 for future in as_completed(futures):
                     try:
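The removed comment refers to an earlier typo (`]` instead of `}`) in the futures mapping. The `{future: url}` dict-comprehension pattern matters because `as_completed` yields bare futures, and the mapping is what lets a failure be attributed back to its URL. A minimal sketch of the pattern:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def process(url: str) -> str:
    return f"processed {url}"  # stand-in for the real per-URL work

urls = [f"https://example.com/page{i}" for i in range(5)]

with ThreadPoolExecutor(max_workers=4) as executor:
    # Braces, not brackets: a dict comprehension mapping each Future
    # back to the URL it was submitted for.
    futures = {executor.submit(process, url): url for url in urls}
    for future in as_completed(futures):
        url = futures[future]
        try:
            print(url, "->", future.result())
        except Exception as exc:
            print(url, "failed:", exc)
```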
@@ -142,11 +99,16 @@ class SEOSpaceAnalyzer:
                 'timestamp': datetime.now().isoformat()
             }
 
-            return self.current_analysis
+            return (
+                self.current_analysis['stats'],
+                self.current_analysis['recommendations'],
+                self.current_analysis['content_analysis'],
+                self.current_analysis['links']
+            )
 
         except Exception as e:
             logger.error(f"Error during analysis: {str(e)}")
-            return {"error": str(e)}
+            return {"error": str(e)}, [], {}, {}
 
     def _process_url(self, url: str) -> Dict:
         """Process a single URL"""
@@ -332,8 +294,9 @@ class SEOSpaceAnalyzer:
 
         # Main-topic analysis
         try:
+            stop_words = list(self.models['spacy'].Defaults.stop_words)
             vectorizer = TfidfVectorizer(
-                stop_words=list(spacy.lang.es.stop_words.STOP_WORDS),
+                stop_words=stop_words,
                 max_features=50,
                 ngram_range=(1, 2)
             )
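Both spellings target the same Spanish stop-word list, but the attribute chain `spacy.lang.es.stop_words.STOP_WORDS` only resolves if that submodule has actually been imported, while `Defaults.stop_words` is always reachable through a loaded pipeline. A short sketch, assuming `es_core_news_lg` is installed:

```python
import spacy
from spacy.lang.es.stop_words import STOP_WORDS  # the explicit import always works
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load("es_core_news_lg")

# The loaded pipeline exposes the same set through its defaults:
assert nlp.Defaults.stop_words == STOP_WORDS

vectorizer = TfidfVectorizer(
    stop_words=list(nlp.Defaults.stop_words),  # scikit-learn expects a list, not a set
    max_features=50,
    ngram_range=(1, 2),
)
```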
@@ -404,9 +367,9 @@ class SEOSpaceAnalyzer:
         all_links = [link for r in results for link in r.get('links', [])]
         if all_links:
             df_links = pd.DataFrame(all_links)
-            broken_links = sum(1 for link in all_links if link['type'] == 'internal')
-            if broken_links > 5:  # arbitrary threshold
-                recs.append(f"🔗 Review {broken_links} internal links (possibly broken)")
+            internal_links = df_links[df_links['type'] == 'internal']
+            if len(internal_links) > 100:  # arbitrary threshold
+                recs.append(f"🔗 Optimize the internal link structure ({len(internal_links)} links)")
 
         return recs if recs else ["✅ No critical SEO issues detected"]
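The removed heuristic counted every internal link as "possibly broken". Actually detecting broken links requires checking each target; a hedged sketch of what that could look like (not part of this commit):

```python
import requests

def count_broken(urls: list[str], timeout: float = 5.0) -> int:
    """Count links whose HEAD request fails or returns a 4xx/5xx status."""
    broken = 0
    for url in urls:
        try:
            resp = requests.head(url, allow_redirects=True, timeout=timeout)
            if resp.status_code >= 400:
                broken += 1
        except requests.RequestException:
            broken += 1
    return broken
```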
@@ -463,7 +426,7 @@ def create_interface():
     ### Documents Found
     Downloaded documents are saved in the `content_storage/` folder
     """)
-    file_explorer = gr.FileExplorer(glob="content_storage/**/*")
+    # FileExplorer replaced with an informative Markdown note
 
     # Event handlers
     analyze_btn.click(
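The commit replaces `gr.FileExplorer` with a plain Markdown note. If a live file listing were still wanted without that component, one option is rendering the directory into Markdown (an illustrative sketch, not what the commit does):

```python
from pathlib import Path
import gradio as gr

def list_storage() -> str:
    """Render content_storage/ as a Markdown bullet list."""
    files = [p for p in sorted(Path("content_storage").glob("**/*")) if p.is_file()]
    if not files:
        return "*(content_storage/ is empty)*"
    return "\n".join(f"- `{p}`" for p in files)

with gr.Blocks() as demo:
    listing = gr.Markdown(list_storage())
    gr.Button("Refresh").click(fn=list_storage, outputs=listing)
```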
@@ -478,20 +441,27 @@ def create_interface():
         outputs=[stats_output, recommendations_output, content_output, links_output]
     )
 
+    # The report has to be generated before it can be downloaded
+    def generate_report():
+        if analyzer.current_analysis:
+            report_path = "content_storage/seo_report.json"
+            with open(report_path, 'w') as f:
+                json.dump(analyzer.current_analysis, f, indent=2)
+            return report_path
+        return None
+
     download_btn.click(
-        fn=lambda: gr.File(value="content_storage/seo_report.json"),
-        outputs=gr.File()
+        fn=generate_report,
+        outputs=gr.File(label="Download Report")
     )
 
     return interface
 
 if __name__ == "__main__":
-    # Check models before starting
     try:
         spacy.load("es_core_news_lg")
     except OSError:
-        logger.error("spaCy model 'es_core_news_lg' not found. Run:")
-        logger.error("python -m spacy download es_core_news_lg")
+        logger.error("spaCy model 'es_core_news_lg' not found. Run: python -m spacy download es_core_news_lg")
         exit(1)
 
     app = create_interface()
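The new `generate_report` writes the current analysis to disk and returns the file path; in Gradio, a returned path populates a `gr.File` output for download. One caveat: instantiating the component inside `click(..., outputs=gr.File(...))`, as this hunk does, is unusual; the documented pattern declares the component in the layout first. A minimal sketch of that arrangement (illustrative payload):

```python
import json
import gradio as gr

def generate_report():
    """Write a JSON report and return its path for Gradio to serve."""
    report_path = "seo_report.json"
    with open(report_path, "w") as f:
        json.dump({"status": "ok", "pages_analyzed": 20}, f, indent=2)
    return report_path  # a filepath return fills a gr.File output

with gr.Blocks() as demo:
    download_btn = gr.Button("Generate report")
    report_file = gr.File(label="Download Report")  # declared in the layout
    download_btn.click(fn=generate_report, outputs=report_file)

if __name__ == "__main__":
    demo.launch()
```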
 