Spaces:

Merlintxu
/

SEO

Sleeping

App Files Files Community

SEO / app.py

Merlintxu

Update app.py

7d39cf2 verified 24 days ago

raw

history blame

9.95 kB

	import os
	import json
	import logging
	import re
	import requests
	import hashlib
	import PyPDF2
	import numpy as np
	import pandas as pd
	from io import BytesIO
	from typing import List, Dict, Optional
	from urllib.parse import urlparse, urljoin
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from bs4 import BeautifulSoup
	from pathlib import Path
	from datetime import datetime
	from collections import defaultdict

	import gradio as gr
	import matplotlib.pyplot as plt
	import json
	import logging
	import re
	import requests
	import hashlib
	import PyPDF2
	import numpy as np
	import pandas as pd
	from io import BytesIO
	from typing import List, Dict, Optional
	from urllib.parse import urlparse, urljoin
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from bs4 import BeautifulSoup
	from pathlib import Path
	from datetime import datetime
	from collections import defaultdict

	import gradio as gr
	import matplotlib.pyplot as plt
	from sklearn.feature_extraction.text import TfidfVectorizer
	from requests.adapters import HTTPAdapter
	from requests.packages.urllib3.util.retry import Retry
	from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
	from sentence_transformers import SentenceTransformer
	import spacy
	import torch

	# Initial setup: enable INFO-level logging and create this module's logger
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	class SEOSpaceAnalyzer:
	    """Crawl the URLs of a sitemap and derive SEO statistics and reports.

	    The analyzer downloads every URL listed in a sitemap, extracts text,
	    metadata and links from HTML and PDF responses, stores the raw payloads
	    under ``base_dir`` and keeps the latest aggregated analysis in
	    ``current_analysis``.
	    """

	    # Column layout of the link records produced by ``_extract_links``.
	    LINK_COLUMNS = ['url', 'type', 'anchor', 'file_type']

	    def __init__(self):
	        self.session = self._configure_session()
	        self.models = self._load_models()
	        self.base_dir = Path("content_storage")
	        self.link_analysis = defaultdict(list)
	        self.documents = []
	        self.current_analysis = {}

	    def _configure_session(self):
	        """Build an HTTP session that retries transient server errors.

	        Returns:
	            A ``requests.Session`` with a retrying adapter mounted for both
	            ``http://`` and ``https://`` and bot-style default headers.
	        """
	        session = requests.Session()
	        retry = Retry(
	            total=3,
	            backoff_factor=1,
	            status_forcelist=[500, 502, 503, 504],
	        )
	        adapter = HTTPAdapter(max_retries=retry)
	        # Mount on both schemes so plain-HTTP URLs get the same retry policy.
	        session.mount('https://', adapter)
	        session.mount('http://', adapter)
	        session.headers.update({
	            'User-Agent': 'Mozilla/5.0 (compatible; SEOBot/1.0)',
	            'Accept-Language': 'es-ES,es;q=0.9',
	        })
	        return session

	    def _load_models(self):
	        """Load the Hugging Face, sentence-transformers and spaCy models.

	        Returns:
	            Dict keyed by ``'summarizer'``, ``'ner'``, ``'qa'``,
	            ``'semantic'`` and ``'spacy'``.
	        """
	        # transformers pipelines take a CUDA device index, or -1 for CPU.
	        device = 0 if torch.cuda.is_available() else -1
	        return {
	            'summarizer': pipeline("summarization",
	                                   model="facebook/bart-large-cnn",
	                                   device=device),
	            'ner': pipeline("ner",
	                            model="dslim/bert-base-NER",
	                            aggregation_strategy="simple",
	                            device=device),
	            'qa': pipeline("question-answering",
	                           model="deepset/roberta-base-squad2",
	                           device=device),
	            'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
	            'spacy': spacy.load("es_core_news_lg"),
	        }

	    def _process_url(self, url):
	        """Download one URL and extract its content.

	        Args:
	            url: Absolute URL to fetch.

	        Returns:
	            Dict with ``'url'`` and ``'links'`` plus the keys produced by
	            ``_process_pdf`` / ``_process_html`` when the content type
	            matches, or ``{'url', 'error'}`` if anything fails.
	        """
	        try:
	            response = self.session.get(url, timeout=15)
	            response.raise_for_status()

	            content_type = response.headers.get('Content-Type', '').lower()
	            result = {'url': url, 'links': []}

	            if 'application/pdf' in content_type:
	                result.update(self._process_pdf(response.content))
	            elif 'text/html' in content_type:
	                result.update(self._process_html(response.text, url))

	            # Archiving is best-effort: a disk problem must not discard a
	            # page that was fetched and parsed successfully.
	            try:
	                self._save_content(url, response.content)
	            except OSError as e:
	                logger.warning(f"No se pudo guardar {url}: {str(e)}")
	            return result

	        except Exception as e:
	            # Worker boundary: report the failure as data so one bad URL
	            # does not abort the whole crawl.
	            logger.error(f"Error procesando {url}: {str(e)}")
	            return {'url': url, 'error': str(e)}

	    def _process_html(self, html, base_url):
	        """Parse an HTML document.

	        Args:
	            html: Markup of the page.
	            base_url: URL of the page, used to resolve relative links.

	        Returns:
	            Dict with ``'content'``, ``'type'`` (``'html'``), ``'metadata'``
	            and ``'links'``.
	        """
	        soup = BeautifulSoup(html, 'lxml')
	        return {
	            'content': self._clean_text(soup.get_text()),
	            'type': 'html',
	            'metadata': self._extract_metadata(soup),
	            'links': self._extract_links(soup, base_url),
	        }

	    def _extract_metadata(self, soup):
	        """Return the page title and meta description (empty when absent)."""
	        title_tag = soup.find('title')
	        description_tag = soup.find('meta', attrs={'name': 'description'})
	        title = title_tag.get_text(strip=True) if title_tag is not None else ''
	        description = ''
	        if description_tag is not None:
	            description = (description_tag.get('content') or '').strip()
	        return {'title': title, 'description': description}

	    def _process_pdf(self, content):
	        """Extract the text of a PDF document.

	        Args:
	            content: Raw PDF bytes.

	        Returns:
	            Dict with ``'content'``, ``'type'`` (``'pdf'``) and
	            ``'metadata'`` holding the page count.
	        """
	        with BytesIO(content) as pdf_file:
	            reader = PyPDF2.PdfReader(pdf_file)
	            pages = reader.pages
	            # extract_text() may yield None for pages without a text layer.
	            text = "".join(page.extract_text() or "" for page in pages)
	            # Count pages while the underlying stream is still open.
	            page_count = len(pages)

	        return {
	            'content': self._clean_text(text),
	            'type': 'pdf',
	            'metadata': {'pages': page_count},
	        }

	    def _extract_links(self, soup, base_url):
	        """Collect every ``<a href>`` and classify it as internal/external.

	        Returns:
	            List of dicts with ``'url'``, ``'type'``, ``'anchor'`` and
	            ``'file_type'``.
	        """
	        base_netloc = urlparse(base_url).netloc
	        links = []
	        for tag in soup.find_all('a', href=True):
	            href = tag['href']
	            full_url = urljoin(base_url, href)
	            same_host = urlparse(full_url).netloc == base_netloc
	            links.append({
	                'url': full_url,
	                'type': 'internal' if same_host else 'external',
	                'anchor': self._clean_text(tag.text),
	                'file_type': self._get_file_type(href),
	            })
	        return links

	    def _get_file_type(self, url):
	        """Return the lower-cased extension of the URL path, or ``'html'``."""
	        ext = Path(urlparse(url).path).suffix.lower()
	        return ext[1:] if ext else 'html'

	    def _clean_text(self, text):
	        """Replace punctuation with spaces and collapse whitespace.

	        Punctuation is removed first so the spaces it leaves behind are
	        collapsed together with the original whitespace.
	        """
	        if not text:
	            return ''
	        text = re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text)
	        return re.sub(r'\s+', ' ', text).strip()

	    def _save_content(self, url, content):
	        """Write the downloaded payload below ``base_dir/<host>/<path>``.

	        Args:
	            url: Source URL; its host and path determine the file location.
	            content: Raw bytes to store.

	        Raises:
	            OSError: If the directory or file cannot be written.
	        """
	        parsed = urlparse(url)
	        # Drop empty, '.' and '..' segments so a crafted path cannot climb
	        # out of the storage directory.
	        parts = [p for p in parsed.path.split('/') if p not in ('', '.', '..')]
	        if not parts or parsed.path.endswith('/'):
	            # Directory-style URLs have no file name of their own.
	            parts.append('index')

	        host = parsed.netloc.replace(':', '_') or 'unknown_host'
	        # Path.with_suffix() requires the leading dot.
	        suffix = Path(parts[-1]).suffix.lower() or '.html'
	        save_path = self.base_dir.joinpath(host, *parts).with_suffix(suffix)
	        save_path.parent.mkdir(parents=True, exist_ok=True)

	        with open(save_path, 'wb') as f:
	            f.write(content)

	    def analyze_sitemap(self, sitemap_url, progress=None):
	        """Crawl every URL of a sitemap and aggregate the analysis.

	        Args:
	            sitemap_url: URL of the sitemap (or sitemap index).
	            progress: Optional callable receiving the completed fraction
	                (0..1) after each processed URL.

	        Returns:
	            Dict with ``'basic_stats'``, ``'content_analysis'``,
	            ``'link_analysis'`` and ``'seo_recommendations'``; also stored
	            in ``self.current_analysis``.
	        """
	        urls = self._parse_sitemap(sitemap_url)
	        results = []

	        if urls:
	            with ThreadPoolExecutor(max_workers=4) as executor:
	                futures = [executor.submit(self._process_url, url) for url in urls]
	                for future in as_completed(futures):
	                    results.append(future.result())
	                    if callable(progress):
	                        progress(len(results) / len(urls))

	        self.current_analysis = {
	            'basic_stats': self._calculate_stats(results),
	            'content_analysis': self._analyze_content(results),
	            'link_analysis': self._analyze_links(results),
	            'seo_recommendations': self._generate_recommendations(results),
	        }

	        return self.current_analysis

	    def _parse_sitemap(self, sitemap_url, _seen=None):
	        """Return the page URLs of an XML sitemap, following sitemap indexes.

	        Args:
	            sitemap_url: URL of a ``<urlset>`` or ``<sitemapindex>`` document.
	            _seen: Internal set of sitemap URLs already visited; guards
	                against indexes that reference each other.

	        Returns:
	            De-duplicated list of URLs in document order; empty when the
	            sitemap cannot be downloaded.
	        """
	        seen = set() if _seen is None else _seen
	        if not sitemap_url or sitemap_url in seen:
	            return []
	        seen.add(sitemap_url)

	        try:
	            response = self.session.get(sitemap_url, timeout=15)
	            response.raise_for_status()
	        except requests.RequestException as e:
	            logger.error(f"Error descargando sitemap {sitemap_url}: {str(e)}")
	            return []

	        soup = BeautifulSoup(response.content, 'xml')
	        urls = []

	        # <sitemapindex> entries point at further sitemaps.
	        for entry in soup.find_all('sitemap'):
	            loc = entry.find('loc')
	            if loc is not None and loc.get_text(strip=True):
	                urls.extend(self._parse_sitemap(loc.get_text(strip=True), seen))

	        # <urlset> entries are the pages themselves.
	        for entry in soup.find_all('url'):
	            loc = entry.find('loc')
	            if loc is not None and loc.get_text(strip=True):
	                urls.append(loc.get_text(strip=True))

	        # dict.fromkeys removes duplicates while keeping order.
	        return list(dict.fromkeys(urls))

	    def _calculate_stats(self, results):
	        """Compute basic statistics over the crawl results.

	        Returns:
	            Dict with ``'total_urls'``, ``'content_types'`` (type -> count;
	            results without a type count as ``'unknown'``) and
	            ``'avg_content_length'`` (0.0 for an empty crawl). All values
	            are plain Python types so the dict is JSON-serialisable.
	        """
	        if not results:
	            return {'total_urls': 0, 'content_types': {}, 'avg_content_length': 0.0}

	        type_counts = pd.Series([r.get('type', 'unknown') for r in results]).value_counts()
	        lengths = [len(r.get('content', '')) for r in results]
	        return {
	            'total_urls': len(results),
	            'content_types': {str(k): int(v) for k, v in type_counts.items()},
	            'avg_content_length': float(np.mean(lengths)),
	        }

	    def _analyze_content(self, results, top_n=10):
	        """Rank the most relevant terms of the crawled text with TF-IDF.

	        Args:
	            results: Crawl results as returned by ``_process_url``.
	            top_n: Number of terms to keep.

	        Returns:
	            Dict with ``'documents'`` (number of results that have text)
	            and ``'top_keywords'`` (list of ``{'term', 'score'}``).
	        """
	        corpus = [r['content'] for r in results if r.get('content')]
	        if not corpus:
	            return {'documents': 0, 'top_keywords': []}

	        try:
	            vectorizer = TfidfVectorizer(max_features=1000)
	            matrix = vectorizer.fit_transform(corpus)
	        except ValueError:
	            # Raised when the corpus yields an empty vocabulary.
	            return {'documents': len(corpus), 'top_keywords': []}

	        scores = np.asarray(matrix.sum(axis=0)).ravel()
	        terms = vectorizer.get_feature_names_out()
	        ranking = scores.argsort()[::-1][:top_n]
	        return {
	            'documents': len(corpus),
	            'top_keywords': [
	                {'term': str(terms[i]), 'score': float(scores[i])}
	                for i in ranking
	            ],
	        }

	    def _analyze_links(self, results):
	        """Return one ``{'url', 'links'}`` record per crawled result.

	        Failed results (which carry no ``'links'`` key) get an empty list so
	        consumers such as ``create_report`` can iterate uniformly.
	        """
	        return [
	            {'url': r.get('url'), 'links': r.get('links', [])}
	            for r in results
	        ]

	    def _generate_recommendations(self, results):
	        """Derive simple SEO recommendations from the crawl results.

	        Returns:
	            List of human-readable recommendation strings (possibly empty).
	        """
	        recommendations = []

	        failed = sum(1 for r in results if 'error' in r)
	        if failed:
	            recommendations.append(
	                f"{failed} URL(s) no se pudieron procesar; revisa su disponibilidad"
	            )

	        html_pages = [r for r in results if r.get('type') == 'html']
	        without_title = sum(
	            1 for r in html_pages if not r.get('metadata', {}).get('title')
	        )
	        if without_title:
	            recommendations.append(
	                f"{without_title} página(s) HTML no tienen etiqueta <title>"
	            )

	        without_description = sum(
	            1 for r in html_pages if not r.get('metadata', {}).get('description')
	        )
	        if without_description:
	            recommendations.append(
	                f"{without_description} página(s) HTML no tienen meta description"
	            )

	        return recommendations

	    def create_report(self):
	        """Write the current analysis to disk as JSON plus a CSV of links.

	        Returns:
	            ``[json_path, csv_path]`` as strings. Works even when no
	            analysis has been run yet (the report is then empty).
	        """
	        report = {
	            'timestamp': datetime.now().isoformat(),
	            'analysis': self.current_analysis,
	        }

	        # The storage directory may not exist yet if nothing was crawled.
	        self.base_dir.mkdir(parents=True, exist_ok=True)

	        # JSON report
	        json_path = self.base_dir / 'seo_report.json'
	        with open(json_path, 'w', encoding='utf-8') as f:
	            json.dump(report, f, ensure_ascii=False)

	        # CSV with every link found
	        links = [
	            link
	            for result in self.current_analysis.get('link_analysis', [])
	            for link in result.get('links', [])
	        ]
	        df = pd.DataFrame(links) if links else pd.DataFrame(columns=self.LINK_COLUMNS)
	        csv_path = self.base_dir / 'links_analysis.csv'
	        df.to_csv(csv_path, index=False)

	        return [str(json_path), str(csv_path)]

	    def create_visualization(self):
	        """Plot the distribution of content types as a pie chart.

	        Returns:
	            A matplotlib figure; it shows a placeholder message when there
	            is no analysis data to plot.
	        """
	        content_types = (
	            self.current_analysis.get('basic_stats', {}).get('content_types', {})
	        )
	        fig, ax = plt.subplots()
	        if content_types:
	            pd.Series(content_types).plot.pie(
	                ax=ax,
	                title='Distribución de Tipos de Contenido',
	                ylabel='',
	            )
	        else:
	            # An empty Series cannot be drawn as a pie chart.
	            ax.set_title('Distribución de Tipos de Contenido')
	            ax.text(0.5, 0.5, 'Sin datos', ha='center', va='center')
	            ax.set_axis_off()
	        return fig

	# Interface Gradio
	def create_interface():
	    """Build the Gradio UI for the SEO analyzer.

	    Returns:
	        The ``gr.Blocks`` application, ready to be launched.
	    """
	    analyzer = SEOSpaceAnalyzer()
	    link_columns = ['url', 'type', 'anchor', 'file_type']

	    def run_analysis(url):
	        """Run the analysis and return one value per output component.

	        ``analyze_sitemap`` returns a single dict, while the button feeds
	        four components, so the figure and the two link tables are derived
	        here.
	        """
	        analysis = analyzer.analyze_sitemap(url)
	        figure = analyzer.create_visualization()

	        # 'link_analysis' is read the same way create_report reads it:
	        # a list of per-page records, each holding a 'links' list.
	        pages = analysis.get('link_analysis') or []
	        links = [
	            link
	            for page in pages
	            if isinstance(page, dict)
	            for link in page.get('links', [])
	        ]
	        table = pd.DataFrame(links, columns=link_columns)
	        internal = table[table['type'] == 'internal']
	        external = table[table['type'] == 'external']
	        return analysis, figure, internal, external

	    with gr.Blocks(title="SEO Analyzer Pro", theme=gr.themes.Soft()) as interface:
	        gr.Markdown("# 🕵️ SEO Analyzer Pro")

	        with gr.Row():
	            sitemap_url = gr.Textbox(label="URL del Sitemap", placeholder="https://www.ing.es/ennaranja/sitemap.xml")
	            analyze_btn = gr.Button("Analizar", variant="primary")

	        with gr.Tab("Resultados"):
	            json_output = gr.JSON(label="Análisis Completo")
	            plot_output = gr.Plot(label="Visualización")

	        with gr.Tab("Enlaces"):
	            internal_links = gr.Dataframe(label="Enlaces Internos")
	            external_links = gr.Dataframe(label="Enlaces Externos")

	        with gr.Tab("Descargas"):
	            report_download = gr.Files(label="Descargar Reporte")
	            download_btn = gr.Button("Generar Reporte", variant="secondary")

	        analyze_btn.click(
	            fn=run_analysis,
	            inputs=sitemap_url,
	            outputs=[json_output, plot_output, internal_links, external_links]
	        )

	        download_btn.click(
	            fn=analyzer.create_report,
	            outputs=report_download
	        )

	    return interface

	if __name__ == "__main__":
	    # Script entry point: build the UI and serve it on all network
	    # interfaces, port 7860.
	    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
	    interface = create_interface()
	    interface.launch(**launch_options)