import os
import json
import logging
import re
import requests
import hashlib
import PyPDF2
import numpy as np
import pandas as pd
from io import BytesIO
from typing import List, Dict, Optional
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
from pathlib import Path
from datetime import datetime
from collections import defaultdict

import gradio as gr
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
from sentence_transformers import SentenceTransformer
import spacy
import torch


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class SEOSpaceAnalyzer:
    def __init__(self):
        self.session = self._configure_session()
        self.models = self._load_models()
        self.base_dir = Path("content_storage")
        self.base_dir.mkdir(parents=True, exist_ok=True)  # ensure the storage dir exists before reports are written
        self.link_analysis = defaultdict(list)
        self.documents = []
        self.current_analysis = {}

    def _configure_session(self):
        """Advanced HTTP session configuration with retries."""
        session = requests.Session()
        retry = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504]
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('https://', adapter)
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; SEOBot/1.0)',
            'Accept-Language': 'es-ES,es;q=0.9'
        })
        return session

    def _load_models(self):
        """Loads optimized Hugging Face models."""
        device = 0 if torch.cuda.is_available() else -1
        return {
            'summarizer': pipeline("summarization",
                                   model="facebook/bart-large-cnn",
                                   device=device),
            'ner': pipeline("ner",
                            model="dslim/bert-base-NER",
                            aggregation_strategy="simple",
                            device=device),
            'qa': pipeline("question-answering",
                           model="deepset/roberta-base-squad2",
                           device=device),
            'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
            'spacy': spacy.load("es_core_news_lg")
        }

    def _process_url(self, url):
        """Fetches a URL and extracts its content."""
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            content_type = response.headers.get('Content-Type', '')
            result = {'url': url, 'links': []}

            if 'application/pdf' in content_type:
                result.update(self._process_pdf(response.content))
            elif 'text/html' in content_type:
                result.update(self._process_html(response.text, url))

            self._save_content(url, response.content)
            return result

        except Exception as e:
            logger.error(f"Error processing {url}: {str(e)}")
            return {'url': url, 'error': str(e)}

    def _process_html(self, html, base_url):
        """Processes HTML content."""
        soup = BeautifulSoup(html, 'lxml')
        return {
            'content': self._clean_text(soup.get_text()),
            'type': 'html',
            'metadata': self._extract_metadata(soup),
            'links': self._extract_links(soup, base_url)
        }

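    # NOTE: _extract_metadata is called above but was not defined anywhere in the
    # original file. The method below is a minimal, assumed sketch that pulls the
    # usual on-page SEO fields (title, meta description, meta keywords); the field
    # names and structure are an editorial choice, not the original author's.
    def _extract_metadata(self, soup):
        """Extracts basic SEO metadata from a parsed page (assumed implementation)."""
        metadata = {'title': '', 'description': '', 'keywords': []}
        if soup.title and soup.title.string:
            metadata['title'] = soup.title.string.strip()
        for meta in soup.find_all('meta'):
            name = meta.get('name', '').lower()
            if name == 'description':
                metadata['description'] = meta.get('content', '')
            elif name == 'keywords':
                metadata['keywords'] = [k.strip() for k in meta.get('content', '').split(',') if k.strip()]
        return metadata
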
    def _process_pdf(self, content):
        """Processes PDF documents."""
        text = ""
        with BytesIO(content) as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            num_pages = len(reader.pages)  # captured while the stream is still open
            for page in reader.pages:
                text += page.extract_text() or ""

        return {
            'content': self._clean_text(text),
            'type': 'pdf',
            'metadata': {'pages': num_pages}
        }

    def _extract_links(self, soup, base_url):
        """Extracts and classifies links."""
        links = []
        for tag in soup.find_all('a', href=True):
            href = tag['href']
            full_url = urljoin(base_url, href)
            link_type = 'internal' if urlparse(full_url).netloc == urlparse(base_url).netloc else 'external'

            links.append({
                'url': full_url,
                'type': link_type,
                'anchor': self._clean_text(tag.text),
                'file_type': self._get_file_type(href)
            })
        return links

    def _get_file_type(self, url):
        """Determines the file type from the URL extension."""
        ext = Path(urlparse(url).path).suffix.lower()
        return ext[1:] if ext else 'html'

    def _clean_text(self, text):
        """Advanced text cleanup."""
        text = re.sub(r'\s+', ' ', text)
        return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()

    def _save_content(self, url, content):
        """Stores the downloaded content on disk."""
        parsed = urlparse(url)
        path = parsed.path.lstrip('/') or 'index'
        save_path = self.base_dir / parsed.netloc / path
        save_path.parent.mkdir(parents=True, exist_ok=True)

        # with_suffix() requires the leading dot, otherwise it raises ValueError
        with open(save_path.with_suffix('.' + self._get_file_type(url)), 'wb') as f:
            f.write(content)

    def analyze_sitemap(self, sitemap_url):
        """Analyzes the whole sitemap and builds the report data."""
        urls = self._parse_sitemap(sitemap_url)
        results = []

        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(self._process_url, url) for url in urls]
            for future in as_completed(futures):
                results.append(future.result())
                logger.info(f"Processed {len(results)}/{len(urls)} URLs")

        self.current_analysis = {
            'basic_stats': self._calculate_stats(results),
            'content_analysis': self._analyze_content(results),
            'link_analysis': self._analyze_links(results),
            'seo_recommendations': self._generate_recommendations(results)
        }

        return self.current_analysis

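    # NOTE: _analyze_content is referenced above but never defined in the original
    # file. The sketch below is an assumed implementation that uses the already
    # imported TfidfVectorizer to surface the most salient terms across the crawl;
    # the returned keys ('top_terms', 'documents_analyzed') are an editorial choice.
    def _analyze_content(self, results):
        """Aggregates a simple TF-IDF view of the crawled content (assumed implementation)."""
        texts = [r.get('content', '') for r in results if r.get('content')]
        if not texts:
            return {}
        vectorizer = TfidfVectorizer(max_features=50)
        tfidf = vectorizer.fit_transform(texts)
        scores = np.asarray(tfidf.sum(axis=0)).ravel()
        ranked = sorted(zip(vectorizer.get_feature_names_out(), scores),
                        key=lambda item: item[1], reverse=True)
        return {
            'top_terms': [term for term, _ in ranked[:10]],
            'documents_analyzed': len(texts)
        }
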
    def _parse_sitemap(self, sitemap_url):
        """Parses XML sitemaps, including sitemap index files (basic implementation; the original was an empty stub)."""
        try:
            response = self.session.get(sitemap_url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml-xml')
            if soup.find('sitemapindex'):
                # A sitemap index lists child sitemaps; recurse into each of them
                return [url for loc in soup.find_all('loc') for url in self._parse_sitemap(loc.text.strip())]
            return [loc.text.strip() for loc in soup.find_all('loc')]
        except Exception as e:
            logger.error(f"Error parsing sitemap {sitemap_url}: {e}")
            return []

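    # NOTE: _analyze_links is referenced in analyze_sitemap but never defined in the
    # original file. This assumed sketch keeps, per page, the links collected by
    # _extract_links, in the shape that create_report flattens via result['links'].
    def _analyze_links(self, results):
        """Collects the links found on each successfully processed page (assumed implementation)."""
        return [{'url': r['url'], 'links': r.get('links', [])}
                for r in results if 'error' not in r]
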
    def _calculate_stats(self, results):
        """Computes basic statistics for the analysis."""
        lengths = [len(r.get('content', '')) for r in results]
        return {
            'total_urls': len(results),
            'content_types': pd.Series([r.get('type', 'unknown') for r in results]).value_counts().to_dict(),
            'avg_content_length': float(np.mean(lengths)) if lengths else 0.0
        }

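    # NOTE: _generate_recommendations is referenced in analyze_sitemap but never
    # defined in the original file. The checks below (thin content, unreachable
    # URLs) are illustrative heuristics chosen for this sketch, not the original logic.
    def _generate_recommendations(self, results):
        """Produces simple heuristic SEO recommendations (assumed implementation)."""
        recommendations = []
        thin_pages = [r['url'] for r in results if 'error' not in r and len(r.get('content', '')) < 300]
        if thin_pages:
            recommendations.append({'issue': 'thin_content', 'urls': thin_pages})
        failed = [r['url'] for r in results if 'error' in r]
        if failed:
            recommendations.append({'issue': 'unreachable_urls', 'urls': failed})
        return recommendations
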
    def create_report(self):
        """Creates a downloadable report in multiple formats."""
        if not self.current_analysis:
            return []

        report = {
            'timestamp': datetime.now().isoformat(),
            'analysis': self.current_analysis
        }

        json_path = self.base_dir / 'seo_report.json'
        with open(json_path, 'w') as f:
            # default=str keeps numpy/pandas scalar types from breaking serialization
            json.dump(report, f, default=str)

        df = pd.DataFrame([link for result in self.current_analysis['link_analysis'] for link in result['links']])
        csv_path = self.base_dir / 'links_analysis.csv'
        df.to_csv(csv_path, index=False)

        return [str(json_path), str(csv_path)]

    def create_visualization(self):
        """Generates the analysis visualizations."""
        fig, ax = plt.subplots()
        pd.Series(self.current_analysis['basic_stats']['content_types']).plot.pie(
            ax=ax,
            title='Distribución de Tipos de Contenido',
            ylabel=''
        )
        return fig


def create_interface():
    analyzer = SEOSpaceAnalyzer()

    # The original wired analyzer.analyze_sitemap directly to four outputs even
    # though it returns a single dict; this wrapper builds the remaining outputs.
    def run_analysis(sitemap_url):
        analysis = analyzer.analyze_sitemap(sitemap_url)
        links = [link for result in analysis['link_analysis'] for link in result['links']]
        links_df = pd.DataFrame(links) if links else pd.DataFrame(columns=['url', 'type', 'anchor', 'file_type'])
        internal_df = links_df[links_df['type'] == 'internal']
        external_df = links_df[links_df['type'] == 'external']
        return analysis, analyzer.create_visualization(), internal_df, external_df

    with gr.Blocks(title="SEO Analyzer Pro", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# 🕵️ SEO Analyzer Pro")

        with gr.Row():
            sitemap_url = gr.Textbox(label="URL del Sitemap", placeholder="https://www.ing.es/ennaranja/sitemap.xml")
            analyze_btn = gr.Button("Analizar", variant="primary")

        with gr.Tab("Resultados"):
            json_output = gr.JSON(label="Análisis Completo")
            plot_output = gr.Plot(label="Visualización")

        with gr.Tab("Enlaces"):
            internal_links = gr.Dataframe(label="Enlaces Internos")
            external_links = gr.Dataframe(label="Enlaces Externos")

        with gr.Tab("Descargas"):
            report_download = gr.Files(label="Descargar Reporte")
            download_btn = gr.Button("Generar Reporte", variant="secondary")

        analyze_btn.click(
            fn=run_analysis,
            inputs=sitemap_url,
            outputs=[json_output, plot_output, internal_links, external_links]
        )

        download_btn.click(
            fn=analyzer.create_report,
            outputs=report_download
        )

    return interface


if __name__ == "__main__":
    interface = create_interface()
    interface.launch(server_name="0.0.0.0", server_port=7860)