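"""SEO Analyzer Pro: a Hugging Face Space that crawls the URLs listed in a sitemap,
extracts text from HTML and PDF pages, and reports basic SEO statistics, keywords,
link structure and recommendations through a Gradio interface."""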
import os
import json
import logging
import re
import requests
import hashlib
import PyPDF2
import numpy as np
import pandas as pd
from io import BytesIO
from typing import List, Dict, Optional
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
from sentence_transformers import SentenceTransformer
import spacy
import torch

import gradio as gr
import matplotlib.pyplot as plt


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class SEOSpaceAnalyzer:
    """Crawls the URLs listed in a sitemap and derives SEO statistics,
    content analysis, link structure and recommendations."""

    def __init__(self):
        self.session = self._configure_session()
        self.models = self._load_models()
        self.base_dir = Path("content_storage")
        self.base_dir.mkdir(exist_ok=True)
        self.current_analysis = {}

    def _configure_session(self):
        """Configure an HTTP session with automatic retries."""
        session = requests.Session()
        retry = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504]
        )
        adapter = HTTPAdapter(max_retries=retry)
        # Mount the retry adapter for both schemes so plain-HTTP sitemaps are retried too.
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; SEOBot/1.0)',
            'Accept-Language': 'es-ES,es;q=0.9'
        })
        return session

    def _load_models(self):
        """Load the Hugging Face and spaCy models (GPU if available)."""
        device = 0 if torch.cuda.is_available() else -1
        return {
            'summarizer': pipeline("summarization",
                                   model="facebook/bart-large-cnn",
                                   device=device),
            'ner': pipeline("ner",
                            model="dslim/bert-base-NER",
                            device=device),
            'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
            'spacy': spacy.load("es_core_news_lg")
        }

    def analyze_sitemap(self, sitemap_url: str):
        """Analyze a complete sitemap and return stats, content, links and recommendations."""
        try:
            urls = self._parse_sitemap(sitemap_url)
            if not urls:
                return {"error": "No se pudieron extraer URLs del sitemap"}

            # Fetch at most the first 50 URLs, using a small thread pool.
            results = []
            with ThreadPoolExecutor(max_workers=4) as executor:
                futures = [executor.submit(self._process_url, url) for url in urls[:50]]
                for future in as_completed(futures):
                    results.append(future.result())

            self.current_analysis = {
                'stats': self._calculate_stats(results),
                'content_analysis': self._analyze_content(results),
                'links': self._analyze_links(results),
                'recommendations': self._generate_seo_recommendations(results)
            }

            return self.current_analysis

        except Exception as e:
            logger.error(f"Analysis error: {str(e)}")
            return {"error": str(e)}

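    # Per-URL processing: each URL is fetched once and dispatched by Content-Type
    # to the HTML or PDF extractor below; failures are recorded per URL instead of
    # aborting the whole crawl.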
    def _process_url(self, url: str):
        """Fetch and process a single URL."""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            content_type = response.headers.get('Content-Type', '')
            result = {'url': url, 'status': 'success'}

            if 'application/pdf' in content_type:
                result.update(self._process_pdf(response.content))
            elif 'text/html' in content_type:
                result.update(self._process_html(response.text, url))

            return result
        except Exception as e:
            logger.warning(f"Error processing {url}: {str(e)}")
            return {'url': url, 'status': 'error', 'error': str(e)}

    def _process_html(self, html: str, base_url: str):
        """Process HTML content: clean text, links and SEO metadata."""
        soup = BeautifulSoup(html, 'lxml')
        clean_text = self._clean_text(soup.get_text())

        return {
            'type': 'html',
            'content': clean_text,
            'word_count': len(clean_text.split()),
            'links': self._extract_links(soup, base_url),
            'metadata': self._extract_metadata(soup)
        }

    def _process_pdf(self, content: bytes):
        """Process PDF documents: extract text and page count."""
        text = ""
        with BytesIO(content) as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            page_count = len(reader.pages)
            for page in reader.pages:
                # extract_text() can return None for image-only pages.
                text += page.extract_text() or ""

        clean_text = self._clean_text(text)
        return {
            'type': 'pdf',
            'content': clean_text,
            'word_count': len(clean_text.split()),
            'page_count': page_count
        }

    def _clean_text(self, text: str):
        """Normalize whitespace and strip non-alphanumeric characters (keeping Spanish accents)."""
        text = re.sub(r'\s+', ' ', text)
        return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()

    def _extract_links(self, soup: BeautifulSoup, base_url: str):
        """Extract and classify links as internal or external."""
        links = []
        for tag in soup.find_all('a', href=True):
            try:
                full_url = urljoin(base_url, tag['href'])
                parsed = urlparse(full_url)

                links.append({
                    'url': full_url,
                    'type': 'internal' if parsed.netloc == urlparse(base_url).netloc else 'external',
                    'anchor': self._clean_text(tag.text)[:100],
                    'file_type': self._get_file_type(parsed.path)
                })
            except Exception:
                continue
        return links

    def _get_file_type(self, path: str):
        """Determine the file type from the URL path extension."""
        ext = Path(path).suffix.lower()
        return ext[1:] if ext else 'html'

    def _extract_metadata(self, soup: BeautifulSoup):
        """Extract SEO metadata (title, description, keywords)."""
        metadata = {'title': '', 'description': '', 'keywords': []}

        # <title> may be missing or empty, so guard before stripping.
        if soup.title and soup.title.string:
            metadata['title'] = soup.title.string.strip()

        for meta in soup.find_all('meta'):
            if meta.get('name') == 'description':
                metadata['description'] = meta.get('content', '')[:500]
            elif meta.get('name') == 'keywords':
                metadata['keywords'] = [kw.strip() for kw in meta.get('content', '').split(',')]

        return metadata

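    # Sitemap parsing: entries ending in .xml are treated as nested sitemap
    # indexes and parsed recursively; the final URL list is deduplicated.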
    def _parse_sitemap(self, sitemap_url: str):
        """Parse a basic XML sitemap, following nested sitemap indexes."""
        try:
            response = self.session.get(sitemap_url, timeout=10)
            response.raise_for_status()

            urls = []
            soup = BeautifulSoup(response.text, 'lxml')

            for loc in soup.find_all('loc'):
                url = loc.text.strip()
                if url.endswith('.xml') and url != sitemap_url:
                    urls.extend(self._parse_sitemap(url))
                else:
                    urls.append(url)

            return list(set(urls))
        except Exception as e:
            logger.error(f"Error parsing sitemap: {str(e)}")
            return []

    def _calculate_stats(self, results: List[Dict]):
        """Compute basic crawl statistics."""
        successful = [r for r in results if r.get('status') == 'success']

        return {
            'total_urls': len(results),
            'successful': len(successful),
            'failed': len(results) - len(successful),
            'content_types': pd.Series([r.get('type', 'unknown') for r in successful]).value_counts().to_dict(),
            'avg_word_count': float(np.mean([r.get('word_count', 0) for r in successful])) if successful else 0.0
        }

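    # Keyword extraction: terms are ranked by their summed TF-IDF weight across
    # all fetched pages, using spaCy's Spanish stop-word list.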
    def _analyze_content(self, results: List[Dict]):
        """Analyze page content with NLP (TF-IDF keyword extraction)."""
        successful = [r for r in results if r.get('status') == 'success']
        texts = [r.get('content', '') for r in successful]

        vectorizer = TfidfVectorizer(stop_words=list(spacy.lang.es.stop_words.STOP_WORDS))
        try:
            tfidf = vectorizer.fit_transform(texts)
            # Top 10 terms by total TF-IDF weight, highest first.
            top_keywords = vectorizer.get_feature_names_out()[np.argsort(tfidf.sum(axis=0).A1)[-10:][::-1]]
        except Exception:
            top_keywords = []

        return {
            'top_keywords': list(top_keywords),
            'content_samples': [t[:500] + '...' for t in texts[:3]]
        }

    def _analyze_links(self, results: List[Dict]):
        """Analyze the link structure across all processed pages."""
        all_links = []
        for result in results:
            if result.get('links'):
                all_links.extend(result['links'])

        if not all_links:
            return {}

        df = pd.DataFrame(all_links)
        return {
            'internal_links': df[df['type'] == 'internal']['url'].value_counts().to_dict(),
            'external_domains': df[df['type'] == 'external']['url'].apply(lambda x: urlparse(x).netloc).value_counts().to_dict(),
            'common_anchors': df['anchor'].value_counts().head(10).to_dict()
        }

    def _generate_seo_recommendations(self, results: List[Dict]):
        """Generate simple SEO recommendations from the crawl results."""
        successful = [r for r in results if r.get('status') == 'success']

        recs = []

        # Pages without a <title> tag.
        missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
        if missing_titles:
            recs.append(f"Añadir títulos a {missing_titles} páginas")

        # Thin content: pages under 300 words.
        short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
        if short_content:
            recs.append(f"Ampliar contenido en {short_content} páginas (menos de 300 palabras)")

        return recs if recs else ["No se detectaron problemas críticos de SEO"]

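
# Gradio interface: takes a sitemap URL and presents the analysis across the
# "Resumen", "Contenido" and "Enlaces" tabs.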
def create_interface():
    analyzer = SEOSpaceAnalyzer()

    with gr.Blocks(title="SEO Analyzer Pro", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🕵️ SEO Analyzer Pro
        *Analizador SEO avanzado con modelos de lenguaje*
        """)

        with gr.Row():
            with gr.Column():
                sitemap_url = gr.Textbox(
                    label="URL del Sitemap",
                    placeholder="https://ejemplo.com/sitemap.xml",
                    interactive=True
                )
                analyze_btn = gr.Button("Analizar", variant="primary")

            with gr.Column():
                status = gr.Textbox(label="Estado", interactive=False)

        with gr.Tabs():
            with gr.Tab("Resumen"):
                stats = gr.JSON(label="Estadísticas")
                recommendations = gr.JSON(label="Recomendaciones SEO")

            with gr.Tab("Contenido"):
                content_analysis = gr.JSON(label="Análisis de Contenido")
                content_samples = gr.JSON(label="Muestras de Contenido")

            with gr.Tab("Enlaces"):
                links_analysis = gr.JSON(label="Análisis de Enlaces")
                links_plot = gr.Plot()

        def run_analysis(url):
            # analyze_sitemap returns a single dict, so unpack it across the
            # four output components wired to the button below.
            analysis = analyzer.analyze_sitemap(url)
            if 'error' in analysis:
                return {"error": analysis['error']}, [], {}, {}
            return (
                analysis.get('stats', {}),
                analysis.get('recommendations', []),
                analysis.get('content_analysis', {}),
                analysis.get('links', {})
            )

        analyze_btn.click(
            fn=run_analysis,
            inputs=sitemap_url,
            outputs=[stats, recommendations, content_analysis, links_analysis],
            api_name="analyze"
        )

    return interface


if __name__ == "__main__":
    app = create_interface()
    app.launch(server_name="0.0.0.0", server_port=7860)
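
# For quick local checks without the Gradio UI, the analyzer can also be driven
# directly from Python (hypothetical example):
#
#   analyzer = SEOSpaceAnalyzer()
#   report = analyzer.analyze_sitemap("https://example.com/sitemap.xml")
#   print(json.dumps(report.get("stats", {}), indent=2, ensure_ascii=False))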