Update app.py
app.py
CHANGED
@@ -32,27 +32,7 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
-def setup_spacy_model():
-    """Downloads the spaCy model if it is not installed"""
-    try:
-        spacy.load("es_core_news_lg")
-        logger.info("spaCy model 'es_core_news_lg' loaded successfully")
-    except OSError:
-        logger.info("Downloading spaCy model 'es_core_news_lg'...")
-        try:
-            subprocess.run(
-                [sys.executable, "-m", "spacy", "download", "es_core_news_lg"],
-                check=True,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE
-            )
-            logger.info("Model downloaded successfully")
-        except subprocess.CalledProcessError as e:
-            logger.error(f"Error downloading model: {e.stderr.decode()}")
-            raise RuntimeError("Could not download the spaCy model") from e
 
-# Set up the model before starting
-setup_spacy_model()
 class SEOSpaceAnalyzer:
     def __init__(self):
         self.session = self._configure_session()
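The runtime download helper (`setup_spacy_model`) is removed; on Hugging Face Spaces the model is expected to be installed at build time instead. For local development, a leaner fallback than the deleted `subprocess` version is possible through spaCy's own CLI helper. A minimal sketch, not part of this commit:

```python
import spacy

def load_spanish_model():
    """Sketch: load es_core_news_lg, downloading it on first use.
    spacy.cli.download wraps `python -m spacy download ...`."""
    try:
        return spacy.load("es_core_news_lg")
    except OSError:
        from spacy.cli import download
        download("es_core_news_lg")  # installs the model package via pip
        return spacy.load("es_core_news_lg")
```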
@@ -60,20 +40,21 @@ class SEOSpaceAnalyzer:
         self.base_dir = Path("content_storage")
         self.base_dir.mkdir(parents=True, exist_ok=True)
         self.current_analysis = {}
+
     def _load_models(self) -> Dict:
         """Loads models optimized for Hugging Face"""
         try:
             device = 0 if torch.cuda.is_available() else -1
             return {
                 'summarizer': pipeline("summarization", model="facebook/bart-large-cnn", device=device),
-                'ner': pipeline("ner", model="dslim/bert-base-NER", device=device),
+                'ner': pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple", device=device),
                 'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
-                'spacy': spacy.load("es_core_news_lg")
+                'spacy': spacy.load("es_core_news_lg")
             }
         except Exception as e:
             logger.error(f"Error loading models: {e}")
             raise
-
+
     def _configure_session(self) -> requests.Session:
         """Configures an HTTP session with retries"""
         session = requests.Session()
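The one functional change in `_load_models` is `aggregation_strategy="simple"` on the NER pipeline: without it, BERT's WordPiece sub-tokens come back as separate hits; with it, they are merged into whole entities. An illustrative snippet (the printed shape is representative, not captured output):

```python
from transformers import pipeline

ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
# With aggregation, sub-token fragments are merged into one record per
# entity, keyed by 'entity_group', e.g. roughly:
# [{'entity_group': 'ORG', 'word': 'Hugging Face', 'start': 0, 'end': 12, ...}]
print(ner("Hugging Face is based in New York City"))
```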
@@ -92,39 +73,15 @@ class SEOSpaceAnalyzer:
         })
         return session
 
-    def _load_models(self) -> Dict:
-        """
-        try:
-            device = 0 if torch.cuda.is_available() else -1
-            return {
-                'summarizer': pipeline(
-                    "summarization",
-                    model="facebook/bart-large-cnn",
-                    device=device
-                ),
-                'ner': pipeline(
-                    "ner",
-                    model="dslim/bert-base-NER",
-                    aggregation_strategy="simple",
-                    device=device
-                ),
-                'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
-                'spacy': spacy.load("es_core_news_lg")
-            }
-        except Exception as e:
-            logger.error(f"Error loading models: {e}")
-            raise
-
-    def analyze_sitemap(self, sitemap_url: str) -> Dict:
-        """Analyzes a complete sitemap"""
+    def analyze_sitemap(self, sitemap_url: str) -> Tuple[Dict, List[str], Dict, Dict]:
+        """Analyzes a complete sitemap and returns its components separately"""
         try:
             urls = self._parse_sitemap(sitemap_url)
             if not urls:
-                return {"error": "Could not extract URLs from the sitemap"}
+                return {"error": "Could not extract URLs from the sitemap"}, [], {}, {}
 
             results = []
             with ThreadPoolExecutor(max_workers=4) as executor:
-                # Fixed: changed ] to } in the dict comprehension
                 futures = {executor.submit(self._process_url, url): url for url in urls[:20]}  # Limit for the demo
                 for future in as_completed(futures):
                     try:
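The `futures` dict comprehension maps each `Future` back to the URL that spawned it, which is what lets the `as_completed` loop attribute a failure to a specific URL. The same pattern in isolation (`fetch` is a hypothetical stand-in for `_process_url`):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch(url):
    """Hypothetical stand-in for _process_url."""
    return {"url": url, "status": "ok"}

urls = ["https://example.com/a", "https://example.com/b"]
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(fetch, url): url for url in urls}
    for future in as_completed(futures):
        url = futures[future]  # recover the originating URL
        try:
            print(url, future.result())
        except Exception as exc:
            print(f"{url} failed: {exc}")
```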
@@ -142,11 +99,16 @@ class SEOSpaceAnalyzer:
             'timestamp': datetime.now().isoformat()
         }
 
-            return
+            return (
+                self.current_analysis['stats'],
+                self.current_analysis['recommendations'],
+                self.current_analysis['content_analysis'],
+                self.current_analysis['links']
+            )
 
         except Exception as e:
             logger.error(f"Error in analysis: {str(e)}")
-            return {"error": str(e)}
+            return {"error": str(e)}, [], {}, {}
 
     def _process_url(self, url: str) -> Dict:
         """Processes a single URL"""
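Returning a 4-tuple (with a matching 4-tuple on the error path) is what lets Gradio unpack the result positionally onto the four output components wired up later in the diff. A self-contained sketch of that wiring; the component types and the `sitemap_input` name are assumptions, only the output handler names appear in this commit:

```python
import gradio as gr

def analyze(url):
    # Stand-in returning the same 4-tuple shape as analyze_sitemap.
    return {"urls_analyzed": 1}, ["✅ ok"], {"topics": []}, {"internal": 0}

with gr.Blocks() as demo:
    sitemap_input = gr.Textbox(label="Sitemap URL")  # assumed component
    analyze_btn = gr.Button("Analyze")
    stats_output = gr.JSON()           # assumed types for the four outputs
    recommendations_output = gr.JSON()
    content_output = gr.JSON()
    links_output = gr.JSON()
    # Tuple elements map positionally onto this outputs list.
    analyze_btn.click(fn=analyze, inputs=sitemap_input,
                      outputs=[stats_output, recommendations_output,
                               content_output, links_output])
```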
@@ -332,8 +294,9 @@ class SEOSpaceAnalyzer:
 
         # Main topic analysis
         try:
+            stop_words = list(self.models['spacy'].Defaults.stop_words)
             vectorizer = TfidfVectorizer(
-                stop_words=
+                stop_words=stop_words,
                 max_features=50,
                 ngram_range=(1, 2)
             )
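`nlp.Defaults.stop_words` is a `set`, while `TfidfVectorizer` documents its `stop_words` parameter as a string preset or a list, so the explicit `list(...)` conversion is the point of the new line. The same construction in isolation, assuming the model is installed:

```python
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load("es_core_news_lg")
stop_words = list(nlp.Defaults.stop_words)  # set -> list, per sklearn's docs

vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50,
                             ngram_range=(1, 2))
X = vectorizer.fit_transform([
    "el análisis SEO de la página principal",
    "la página principal enlaza a los documentos descargados",
])
print(vectorizer.get_feature_names_out())
```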
@@ -404,9 +367,9 @@ class SEOSpaceAnalyzer:
         all_links = [link for r in results for link in r.get('links', [])]
         if all_links:
             df_links = pd.DataFrame(all_links)
-
-            if
-            recs.append(f"🔗
+            internal_links = df_links[df_links['type'] == 'internal']
+            if len(internal_links) > 100:  # Arbitrary threshold
+                recs.append(f"🔗 Optimize internal link structure ({len(internal_links)} links)")
 
         return recs if recs else ["✅ No critical SEO issues detected"]
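The rebuilt recommendation step filters the flattened link records by their `type` field before applying the threshold (flagged by the author's own `# Arbitrary threshold` comment). The same filter on toy data:

```python
import pandas as pd

all_links = [
    {"url": "/about", "type": "internal"},
    {"url": "https://example.org", "type": "external"},
    {"url": "/contact", "type": "internal"},
]
df_links = pd.DataFrame(all_links)
internal_links = df_links[df_links["type"] == "internal"]
print(f"{len(internal_links)} internal links of {len(df_links)} total")
```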
@@ -463,7 +426,7 @@ def create_interface():
         ### Documents Found
         Downloaded documents are saved in the `content_storage/` folder
         """)
-
+        # Replaced FileExplorer with an informative Markdown note
 
         # Event handlers
         analyze_btn.click(
@@ -478,20 +441,27 @@ def create_interface():
             outputs=[stats_output, recommendations_output, content_output, links_output]
         )
 
+        # The report must be generated before it can be downloaded
+        def generate_report():
+            if analyzer.current_analysis:
+                report_path = "content_storage/seo_report.json"
+                with open(report_path, 'w') as f:
+                    json.dump(analyzer.current_analysis, f, indent=2)
+                return report_path
+            return None
+
         download_btn.click(
-            fn=
-            outputs=gr.File()
+            fn=generate_report,
+            outputs=gr.File(label="Download Report")
         )
 
     return interface
 
 if __name__ == "__main__":
-    # Check models before starting
     try:
         spacy.load("es_core_news_lg")
     except OSError:
-        logger.error("spaCy model 'es_core_news_lg' not found. Run:")
-        logger.error("python -m spacy download es_core_news_lg")
+        logger.error("spaCy model 'es_core_news_lg' not found. Run: python -m spacy download es_core_news_lg")
         exit(1)
 
     app = create_interface()
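`generate_report` dumps `current_analysis` verbatim, which works as long as every value is JSON-serializable (the code stores timestamps via `isoformat()`, so that holds today). If raw `datetime` or `Path` objects ever land in the dict, `json.dump` will raise; a slightly hardened sketch, not a proposed replacement:

```python
import json
from pathlib import Path

def generate_report(analysis: dict, path: str = "content_storage/seo_report.json"):
    """Sketch: write the analysis as UTF-8 JSON, stringifying anything
    the encoder can't handle (default=str) instead of raising."""
    if not analysis:
        return None
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(analysis, f, indent=2, ensure_ascii=False, default=str)
    return path
```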