Update app.py
app.py
CHANGED
@@ -4,248 +4,272 @@ import logging

Previous version (removed lines are prefixed with -; several removed lines were cut off and are shown only as far as they survive):

 import re
 import requests
 import hashlib
 from urllib.parse import urlparse, urljoin
-from typing import List, Dict, Optional, Tuple
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
 import PyPDF2
-from io import BytesIO
-from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sentence_transformers import SentenceTransformer
 import spacy

-# Configuración
-logging.basicConfig(level=logging.INFO
-                    format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)

-class
-    def __init__(self
-        self.sitemap_url = sitemap_url
         self.session = self._configure_session()
         self.models = self._load_models()
-        self.content_store = {}
         self.documents = []

         session = requests.Session()
         retry = Retry(
             backoff_factor=1,
             status_forcelist=[500, 502, 503, 504]
         )
         adapter = HTTPAdapter(max_retries=retry)
         session.mount('https://', adapter)
         session.headers.update({
-            'User-Agent': 'Mozilla/5.0 (compatible; SEOBot/1.0
         })
         return session

-    def _load_models(self)
         return {
             'qa': pipeline("question-answering",
                            model="deepset/roberta-base-squad2",
-                           aggregation_strategy="simple"),
-            'semantic': SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2'),
-            'spacy': spacy.load("en_core_web_lg")
         }

-            return None
         try:
             response = self.session.get(url, timeout=15)
             response.raise_for_status()
             content_type = response.headers.get('Content-Type', '')
             if 'application/pdf' in content_type:
             elif 'text/html' in content_type:
         except Exception as e:
-            logger.error(f"Error

         text = ""
         with BytesIO(content) as pdf_file:
             reader = PyPDF2.PdfReader(pdf_file)
             for page in reader.pages:
                 text += page.extract_text()
-        doc_hash = hashlib.sha256(content).hexdigest()
-        self._save_document(url, content, 'pdf')
         return {
             'type': 'pdf',
-            'hash': doc_hash,
-            'links': []
         }

-        main_content = self._extract_main_content(soup)
-        links = self._extract_links(url, soup)
-        self._save_document(url, html.encode('utf-8'), 'html')
-        return {
-            'url': url,
-            'type': 'html',
-            'content': main_content,
-            'hash': hashlib.sha256(html.encode()).hexdigest(),
-            'links': links,
-            'metadata': self._extract_metadata(soup)
-        }

-    def _extract_links(self, base_url: str, soup: BeautifulSoup) -> List[Dict]:
         links = []
-        for tag in soup.find_all(['a', 'link'], href=True):
             href = tag['href']
             full_url = urljoin(base_url, href)
-            link_type = 'internal' if parsed.netloc == base_domain else 'external'
-            file_type = 'other'
-            if parsed.path.lower().endswith(('.pdf', '.doc', '.docx')):
-                file_type = 'document'
-            elif parsed.path.lower().endswith(('.jpg', '.png', '.gif')):
-                file_type = 'image'
             links.append({
                 'url': full_url,
                 'type': link_type,
             })
         return links

         }
-                                   min_length=30)[0]['summary_text'],
-            'entities': self.models['ner'](content['content']),
-            'semantic_embedding': self.models['semantic'].encode(content['content']),
-            'seo_analysis': self._perform_seo_analysis(content)
         }

-        doc = self.models['spacy'](text)

-        sitemap_urls = self._parse_sitemap()
-        results = []

-            'content_analysis': self._aggregate_content_stats(results)
-        }

-    def _save_document(self, url: str, content: bytes, file_type: str) -> None:
-        parsed = urlparse(url)
-        path = parsed.path.lstrip('/')
-        filename = f"documents/{parsed.netloc}/{path}" if path else f"documents/{parsed.netloc}/index"

-    def launch_interface(self):
-        interface = gr.Interface(
-            fn=self.run_analysis,
-            inputs=gr.Textbox(label="Sitemap URL"),
-            outputs=[
-                gr.JSON(label="Analysis Results"),
-                gr.File(label="Download Data")
-            ],
-            title="Advanced SEO Analyzer",
-            description="Analyze websites with AI-powered SEO insights"
         )

 if __name__ == "__main__":

Updated version (added lines are prefixed with +):

 import re
 import requests
 import hashlib
 import PyPDF2
 import numpy as np
+import pandas as pd
+from io import BytesIO
+from typing import List, Dict, Optional
 from urllib.parse import urlparse, urljoin
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
+from pathlib import Path
+from datetime import datetime
+from collections import defaultdict
+
+import gradio as gr
+import matplotlib.pyplot as plt
 from sklearn.feature_extraction.text import TfidfVectorizer
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
 from sentence_transformers import SentenceTransformer
 import spacy
+import torch

+# Initial configuration
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+class SEOSpaceAnalyzer:
+    def __init__(self):
         self.session = self._configure_session()
         self.models = self._load_models()
+        self.base_dir = Path("content_storage")
+        self.link_analysis = defaultdict(list)
         self.documents = []
+        self.current_analysis = {}
+
+    def _configure_session(self):
+        """Advanced HTTP session configuration with retries."""
         session = requests.Session()
         retry = Retry(
+            total=3,
             backoff_factor=1,
             status_forcelist=[500, 502, 503, 504]
         )
         adapter = HTTPAdapter(max_retries=retry)
         session.mount('https://', adapter)
         session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (compatible; SEOBot/1.0)',
+            'Accept-Language': 'es-ES,es;q=0.9'
         })
         return session
+
+    def _load_models(self):
+        """Loads optimized Hugging Face models."""
+        device = 0 if torch.cuda.is_available() else -1
         return {
+            'summarizer': pipeline("summarization",
+                                   model="facebook/bart-large-cnn",
+                                   device=device),
+            'ner': pipeline("ner",
+                            model="dslim/bert-base-NER",
+                            aggregation_strategy="simple",
+                            device=device),
             'qa': pipeline("question-answering",
                            model="deepset/roberta-base-squad2",
+                           device=device),
+            'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
+            'spacy': spacy.load("es_core_news_lg")
         }
+
+    def _process_url(self, url):
+        """Processes a single URL and extracts its content."""
         try:
             response = self.session.get(url, timeout=15)
             response.raise_for_status()
+
             content_type = response.headers.get('Content-Type', '')
+            result = {'url': url, 'links': []}
+
             if 'application/pdf' in content_type:
+                result.update(self._process_pdf(response.content))
             elif 'text/html' in content_type:
+                result.update(self._process_html(response.text, url))
+
+            self._save_content(url, response.content)
+            return result
+
         except Exception as e:
+            logger.error(f"Error processing {url}: {str(e)}")
+            return {'url': url, 'error': str(e)}
+
+    def _process_html(self, html, base_url):
+        """Processes HTML content."""
+        soup = BeautifulSoup(html, 'lxml')
+        return {
+            'content': self._clean_text(soup.get_text()),
+            'type': 'html',
+            'metadata': self._extract_metadata(soup),
+            'links': self._extract_links(soup, base_url)
+        }
+
+    def _process_pdf(self, content):
+        """Processes PDF documents."""
         text = ""
         with BytesIO(content) as pdf_file:
             reader = PyPDF2.PdfReader(pdf_file)
             for page in reader.pages:
                 text += page.extract_text()
+
         return {
+            'content': self._clean_text(text),
             'type': 'pdf',
+            'metadata': {'pages': len(reader.pages)}
         }
+
+    def _extract_links(self, soup, base_url):
+        """Extracts and classifies links."""
         links = []
+        for tag in soup.find_all('a', href=True):
             href = tag['href']
             full_url = urljoin(base_url, href)
+            link_type = 'internal' if urlparse(full_url).netloc == urlparse(base_url).netloc else 'external'

             links.append({
                 'url': full_url,
                 'type': link_type,
+                'anchor': self._clean_text(tag.text),
+                'file_type': self._get_file_type(href)
             })
         return links
+
+    def _get_file_type(self, url):
+        """Determines the file type from its extension."""
+        ext = Path(urlparse(url).path).suffix.lower()
+        return ext[1:] if ext else 'html'
+
+    def _clean_text(self, text):
+        """Advanced text cleaning."""
+        text = re.sub(r'\s+', ' ', text)
+        return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
+
+    def _save_content(self, url, content):
+        """Stores the downloaded content."""
+        path = urlparse(url).path.lstrip('/')
+        save_path = self.base_dir / urlparse(url).netloc / path
+        save_path.parent.mkdir(parents=True, exist_ok=True)
+
+        with open(save_path.with_suffix('.' + self._get_file_type(url)), 'wb') as f:
+            f.write(content)
+
+
def analyze_sitemap(self, sitemap_url):
|
176 |
+
"""Analiza todo el sitemap y genera reportes"""
|
177 |
+
urls = self._parse_sitemap(sitemap_url)
|
178 |
+
results = []
|
179 |
+
|
180 |
+
with ThreadPoolExecutor(max_workers=4) as executor:
|
181 |
+
futures = [executor.submit(self._process_url, url) for url in urls]
|
182 |
+
for future in as_completed(futures):
|
183 |
+
results.append(future.result())
|
184 |
+
progress(len(results)/len(urls))
|
185 |
+
|
186 |
+
self.current_analysis = {
|
187 |
+
'basic_stats': self._calculate_stats(results),
|
188 |
+
'content_analysis': self._analyze_content(results),
|
189 |
+
'link_analysis': self._analyze_links(results),
|
190 |
+
'seo_recommendations': self._generate_recommendations(results)
|
191 |
}
|
192 |
|
193 |
+
return self.current_analysis
|
194 |
+
|
195 |
+
def _parse_sitemap(self, sitemap_url):
|
196 |
+
"""Parsea sitemaps XML incluyendo sitemaps indexados"""
|
197 |
+
# Implementación de parsing de sitemap (similar a versiones anteriores)
|
198 |
+
return []
|
+
+    def _calculate_stats(self, results):
+        """Calculates basic statistics for the analysis."""
+        return {
+            'total_urls': len(results),
+            'content_types': pd.Series([r.get('type', 'unknown') for r in results]).value_counts().to_dict(),
+            'avg_content_length': float(np.mean([len(r.get('content', '')) for r in results]))
+        }
+
+    def create_report(self):
+        """Creates a downloadable report in multiple formats."""
+        report = {
+            'timestamp': datetime.now().isoformat(),
+            'analysis': self.current_analysis
+        }

+        # Save as JSON
+        json_path = self.base_dir / 'seo_report.json'
+        with open(json_path, 'w') as f:
+            json.dump(report, f)

+        # Create a CSV with the extracted links
+        df = pd.DataFrame([link for result in self.current_analysis['link_analysis'] for link in result['links']])
+        csv_path = self.base_dir / 'links_analysis.csv'
+        df.to_csv(csv_path, index=False)

+        return [str(json_path), str(csv_path)]
+
+    def create_visualization(self):
+        """Generates interactive visualizations."""
+        fig, ax = plt.subplots()
+        pd.Series(self.current_analysis['basic_stats']['content_types']).plot.pie(
+            ax=ax,
+            title='Distribución de Tipos de Contenido',
+            ylabel=''
+        )
+        return fig

+# Gradio interface
+def create_interface():
+    analyzer = SEOSpaceAnalyzer()
+
+    with gr.Blocks(title="SEO Analyzer Pro", theme=gr.themes.Soft()) as interface:
+        gr.Markdown("# 🕵️ SEO Analyzer Pro")

+        with gr.Row():
+            sitemap_url = gr.Textbox(label="URL del Sitemap", placeholder="https://www.ing.es/ennaranja/sitemap.xml")
+            analyze_btn = gr.Button("Analizar", variant="primary")

+        with gr.Tab("Resultados"):
+            json_output = gr.JSON(label="Análisis Completo")
+            plot_output = gr.Plot(label="Visualización")

+        with gr.Tab("Enlaces"):
+            internal_links = gr.Dataframe(label="Enlaces Internos")
+            external_links = gr.Dataframe(label="Enlaces Externos")
+
+        with gr.Tab("Descargas"):
+            report_download = gr.Files(label="Descargar Reporte")
+            download_btn = gr.Button("Generar Reporte", variant="secondary")

+        analyze_btn.click(
+            fn=analyzer.analyze_sitemap,
+            inputs=sitemap_url,
+            outputs=[json_output, plot_output, internal_links, external_links]
+        )

+        download_btn.click(
+            fn=analyzer.create_report,
+            outputs=report_download
+        )
+
+    return interface

 if __name__ == "__main__":
+    interface = create_interface()
+    interface.launch(server_name="0.0.0.0", server_port=7860)
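
In the committed file _parse_sitemap is a stub that returns an empty list, so analyze_sitemap has nothing to crawl. A minimal sketch of what the elided parser could look like, assuming requests plus BeautifulSoup's XML parser (already available through the lxml dependency used in _process_html); the standalone name, signature, and one-level handling of sitemap indexes are illustrative assumptions, not code from this commit:

# Hypothetical sitemap parser; parse_sitemap and its signature are assumptions.
import requests
from bs4 import BeautifulSoup

def parse_sitemap(session: requests.Session, sitemap_url: str) -> list:
    """Return the <loc> URLs of a sitemap, following a <sitemapindex> one level deep."""
    response = session.get(sitemap_url, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'xml')

    if soup.find('sitemapindex'):
        # Sitemap index: each <loc> points at a child sitemap; collect its page URLs.
        urls = []
        for loc in soup.find_all('loc'):
            child = session.get(loc.text.strip(), timeout=15)
            child.raise_for_status()
            urls.extend(l.text.strip() for l in BeautifulSoup(child.text, 'xml').find_all('loc'))
        return urls

    # Plain <urlset>: every <loc> is a page URL.
    return [loc.text.strip() for loc in soup.find_all('loc')]

Adapted as a method, this would take the place of the return [] stub, since analyze_sitemap already expects a list of page URLs back.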
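
_process_html and analyze_sitemap also call _extract_metadata, _analyze_content, _analyze_links, and _generate_recommendations, none of which are defined in the new file, so a full run would stop with an AttributeError. The sketch below shows hypothetical minimal stand-ins written as plain functions; every name and return shape is an assumption rather than the author's implementation (a _generate_recommendations placeholder returning an empty list would complete the set):

# Hypothetical minimal helpers; none of these exist in the commit itself.
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_metadata(soup: BeautifulSoup) -> dict:
    """Title, meta description, and canonical URL of a parsed page."""
    description = soup.find('meta', attrs={'name': 'description'})
    canonical = soup.find('link', rel='canonical')
    return {
        'title': soup.title.string.strip() if soup.title and soup.title.string else '',
        'description': description.get('content', '') if description else '',
        'canonical': canonical.get('href', '') if canonical else '',
    }

def analyze_links(results: list) -> list:
    """Group links per page, keeping the shape that create_report iterates (result['links'])."""
    return [{'url': r['url'], 'links': r.get('links', [])} for r in results if 'error' not in r]

def analyze_content(results: list, top_n: int = 10) -> dict:
    """Top TF-IDF terms across the extracted page texts (the TfidfVectorizer import is otherwise unused)."""
    texts = [r.get('content', '') for r in results if r.get('content')]
    if not texts:
        return {'top_terms': []}
    vectorizer = TfidfVectorizer(max_features=1000)
    matrix = vectorizer.fit_transform(texts)
    weights = matrix.sum(axis=0).A1  # summed TF-IDF weight per term
    terms = vectorizer.get_feature_names_out()
    ranked = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)
    return {'top_terms': [term for term, _ in ranked[:top_n]]}

Note also that analyze_btn.click declares four output components while analyze_sitemap returns a single dict; a thin wrapper that splits the result into the JSON payload, the figure from create_visualization(), and two link DataFrames would be needed for that wiring to behave as declared.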