Merlintxu commited on
Commit
7d39cf2
·
verified ·
1 Parent(s): cde8d76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +202 -178
app.py CHANGED
@@ -4,248 +4,272 @@ import logging
4
  import re
5
  import requests
6
  import hashlib
 
 
 
 
 
7
  from urllib.parse import urlparse, urljoin
8
- from typing import List, Dict, Optional, Tuple
9
  from concurrent.futures import ThreadPoolExecutor, as_completed
10
  from bs4 import BeautifulSoup
 
 
 
 
 
 
 
 
 
 
 
11
  import PyPDF2
12
- from io import BytesIO
13
- from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
14
  import numpy as np
 
 
 
 
 
 
 
 
 
 
 
 
15
  from sklearn.feature_extraction.text import TfidfVectorizer
 
 
 
16
  from sentence_transformers import SentenceTransformer
17
  import spacy
18
- import gradio as gr
19
 
20
- # Configuración avanzada
21
- logging.basicConfig(level=logging.INFO,
22
- format='%(asctime)s - %(levelname)s - %(message)s')
23
  logger = logging.getLogger(__name__)
24
 
25
- class AdvancedSEOAanalyzer:
26
- def __init__(self, sitemap_url: str):
27
- self.sitemap_url = sitemap_url
28
  self.session = self._configure_session()
29
  self.models = self._load_models()
30
- self.processed_urls = set()
31
- self.link_graph = defaultdict(list)
32
- self.content_store = {}
33
  self.documents = []
34
-
35
- def _configure_session(self) -> requests.Session:
 
 
36
  session = requests.Session()
37
  retry = Retry(
38
- total=5,
39
  backoff_factor=1,
40
  status_forcelist=[500, 502, 503, 504]
41
  )
42
  adapter = HTTPAdapter(max_retries=retry)
43
  session.mount('https://', adapter)
44
  session.headers.update({
45
- 'User-Agent': 'Mozilla/5.0 (compatible; SEOBot/1.0; +https://seo.example.com/bot)'
 
46
  })
47
  return session
48
-
49
- def _load_models(self) -> Dict:
 
 
50
  return {
51
- 'summarization': pipeline("summarization",
52
- model="facebook/bart-large-cnn",
53
- device=0 if torch.cuda.is_available() else -1),
 
 
 
 
54
  'qa': pipeline("question-answering",
55
  model="deepset/roberta-base-squad2",
56
- tokenizer="deepset/roberta-base-squad2"),
57
- 'ner': pipeline("ner",
58
- model="dslim/bert-base-NER",
59
- aggregation_strategy="simple"),
60
- 'semantic': SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2'),
61
- 'spacy': spacy.load("en_core_web_lg")
62
  }
63
-
64
- async def download_content(self, url: str) -> Optional[Dict]:
65
- if url in self.processed_urls:
66
- return None
67
-
68
  try:
69
  response = self.session.get(url, timeout=15)
70
  response.raise_for_status()
 
71
  content_type = response.headers.get('Content-Type', '')
72
-
 
73
  if 'application/pdf' in content_type:
74
- return self._process_pdf(url, response.content)
75
  elif 'text/html' in content_type:
76
- return await self._process_html(url, response.text)
77
- else:
78
- logger.warning(f"Unsupported content type: {content_type}")
79
- return None
80
-
81
  except Exception as e:
82
- logger.error(f"Error downloading {url}: {str(e)}")
83
- return None
84
-
85
- def _process_pdf(self, url: str, content: bytes) -> Dict:
 
 
 
 
 
 
 
 
 
 
 
86
  text = ""
87
  with BytesIO(content) as pdf_file:
88
  reader = PyPDF2.PdfReader(pdf_file)
89
  for page in reader.pages:
90
  text += page.extract_text()
91
-
92
- doc_hash = hashlib.sha256(content).hexdigest()
93
- self._save_document(url, content, 'pdf')
94
-
95
  return {
96
- 'url': url,
97
  'type': 'pdf',
98
- 'content': text,
99
- 'hash': doc_hash,
100
- 'links': []
101
  }
102
-
103
- async def _process_html(self, url: str, html: str) -> Dict:
104
- soup = BeautifulSoup(html, 'lxml')
105
- main_content = self._extract_main_content(soup)
106
- links = self._extract_links(url, soup)
107
-
108
- self._save_document(url, html.encode('utf-8'), 'html')
109
-
110
- return {
111
- 'url': url,
112
- 'type': 'html',
113
- 'content': main_content,
114
- 'hash': hashlib.sha256(html.encode()).hexdigest(),
115
- 'links': links,
116
- 'metadata': self._extract_metadata(soup)
117
- }
118
-
119
- def _extract_links(self, base_url: str, soup: BeautifulSoup) -> List[Dict]:
120
  links = []
121
- base_domain = urlparse(base_url).netloc
122
-
123
- for tag in soup.find_all(['a', 'link'], href=True):
124
  href = tag['href']
125
  full_url = urljoin(base_url, href)
126
- parsed = urlparse(full_url)
127
 
128
- link_type = 'internal' if parsed.netloc == base_domain else 'external'
129
- file_type = 'other'
130
-
131
- if parsed.path.lower().endswith(('.pdf', '.doc', '.docx')):
132
- file_type = 'document'
133
- elif parsed.path.lower().endswith(('.jpg', '.png', '.gif')):
134
- file_type = 'image'
135
-
136
  links.append({
137
  'url': full_url,
138
  'type': link_type,
139
- 'file_type': file_type,
140
- 'anchor': tag.text.strip()
141
  })
142
-
143
  return links
144
-
145
- def _extract_metadata(self, soup: BeautifulSoup) -> Dict:
146
- metadata = {
147
- 'title': soup.title.string if soup.title else '',
148
- 'description': '',
149
- 'keywords': [],
150
- 'open_graph': {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  }
152
 
153
- meta_tags = soup.find_all('meta')
154
- for tag in meta_tags:
155
- name = tag.get('name', '').lower()
156
- property = tag.get('property', '').lower()
157
- content = tag.get('content', '')
158
-
159
- if name == 'description':
160
- metadata['description'] = content
161
- elif name == 'keywords':
162
- metadata['keywords'] = [kw.strip() for kw in content.split(',')]
163
- elif property.startswith('og:'):
164
- key = property[3:]
165
- metadata['open_graph'][key] = content
166
-
167
- return metadata
168
-
169
- def analyze_content(self, content: Dict) -> Dict:
170
- analysis = {
171
- 'summary': self.models['summarization'](content['content'],
172
- max_length=150,
173
- min_length=30)[0]['summary_text'],
174
- 'entities': self.models['ner'](content['content']),
175
- 'semantic_embedding': self.models['semantic'].encode(content['content']),
176
- 'seo_analysis': self._perform_seo_analysis(content)
177
  }
178
 
179
- if content['type'] == 'pdf':
180
- analysis['document_analysis'] = self._analyze_document_structure(content)
 
 
181
 
182
- return analysis
183
-
184
- def _perform_seo_analysis(self, content: Dict) -> Dict:
185
- text = content['content']
186
- doc = self.models['spacy'](text)
187
 
188
- return {
189
- 'readability_score': self._calculate_readability(text),
190
- 'keyword_density': self._calculate_keyword_density(text),
191
- 'heading_structure': self._analyze_headings(doc),
192
- 'content_length': len(text.split()),
193
- 'semantic_topics': self._extract_semantic_topics(text)
194
- }
 
 
 
 
195
 
196
- def _extract_semantic_topics(self, text: str) -> List[str]:
197
- vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
198
- tfidf = vectorizer.fit_transform([text])
199
- feature_array = np.array(vectorizer.get_feature_names_out())
200
- tfidf_sorting = np.argsort(tfidf.toarray()).flatten()[::-1]
 
201
 
202
- return feature_array[tfidf_sorting][:5].tolist()
203
-
204
- def run_analysis(self, max_workers: int = 4) -> Dict:
205
- sitemap_urls = self._parse_sitemap()
206
- results = []
207
 
208
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
209
- futures = [executor.submit(self.download_content, url)
210
- for url in sitemap_urls]
211
 
212
- for future in as_completed(futures):
213
- result = future.result()
214
- if result:
215
- analyzed = self.analyze_content(result)
216
- results.append({**result, **analyzed})
217
- self._update_link_graph(result)
 
218
 
219
- self._save_full_analysis(results)
220
- return {
221
- 'total_pages': len(results),
222
- 'document_types': self._count_document_types(results),
223
- 'link_analysis': self._analyze_link_graph(),
224
- 'content_analysis': self._aggregate_content_stats(results)
225
- }
226
-
227
- def _save_document(self, url: str, content: bytes, file_type: str) -> None:
228
- parsed = urlparse(url)
229
- path = parsed.path.lstrip('/')
230
- filename = f"documents/{parsed.netloc}/{path}" if path else f"documents/{parsed.netloc}/index"
231
 
232
- os.makedirs(os.path.dirname(filename), exist_ok=True)
233
- with open(filename + f'.{file_type}', 'wb') as f:
234
- f.write(content)
235
-
236
- def launch_interface(self):
237
- interface = gr.Interface(
238
- fn=self.run_analysis,
239
- inputs=gr.Textbox(label="Sitemap URL"),
240
- outputs=[
241
- gr.JSON(label="Analysis Results"),
242
- gr.File(label="Download Data")
243
- ],
244
- title="Advanced SEO Analyzer",
245
- description="Analyze websites with AI-powered SEO insights"
246
  )
247
- interface.launch()
 
248
 
249
  if __name__ == "__main__":
250
- analyzer = AdvancedSEOAanalyzer("https://www.ing.es/ennaranja/sitemap.xml")
251
- analyzer.launch_interface()
 
4
  import re
5
  import requests
6
  import hashlib
7
+ import PyPDF2
8
+ import numpy as np
9
+ import pandas as pd
10
+ from io import BytesIO
11
+ from typing import List, Dict, Optional
12
  from urllib.parse import urlparse, urljoin
 
13
  from concurrent.futures import ThreadPoolExecutor, as_completed
14
  from bs4 import BeautifulSoup
15
+ from pathlib import Path
16
+ from datetime import datetime
17
+ from collections import defaultdict
18
+
19
+ import gradio as gr
20
+ import matplotlib.pyplot as plt
21
+ from sklearn.feature_extractioimport json
22
+ import logging
23
+ import re
24
+ import requests
25
+ import hashlib
26
  import PyPDF2
 
 
27
  import numpy as np
28
+ import pandas as pd
29
+ from io import BytesIO
30
+ from typing import List, Dict, Optional
31
+ from urllib.parse import urlparse, urljoin
32
+ from concurrent.futures import ThreadPoolExecutor, as_completed
33
+ from bs4 import BeautifulSoup
34
+ from pathlib import Path
35
+ from datetime import datetime
36
+ from collections import defaultdict
37
+
38
+ import gradio as gr
39
+ import matplotlib.pyplot as plt
40
  from sklearn.feature_extraction.text import TfidfVectorizer
41
+ from requests.adapters import HTTPAdapter
42
+ from requests.packages.urllib3.util.retry import Retry
43
+ from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
44
  from sentence_transformers import SentenceTransformer
45
  import spacy
46
+ import torch
47
 
48
+ # Configuración inicial
49
+ logging.basicConfig(level=logging.INFO)
 
50
  logger = logging.getLogger(__name__)
51
 
52
+ class SEOSpaceAnalyzer:
53
+ def __init__(self):
 
54
  self.session = self._configure_session()
55
  self.models = self._load_models()
56
+ self.base_dir = Path("content_storage")
57
+ self.link_analysis = defaultdict(list)
 
58
  self.documents = []
59
+ self.current_analysis = {}
60
+
61
+ def _configure_session(self):
62
+ """Configuración avanzada de sesión HTTP con reintentos"""
63
  session = requests.Session()
64
  retry = Retry(
65
+ total=3,
66
  backoff_factor=1,
67
  status_forcelist=[500, 502, 503, 504]
68
  )
69
  adapter = HTTPAdapter(max_retries=retry)
70
  session.mount('https://', adapter)
71
  session.headers.update({
72
+ 'User-Agent': 'Mozilla/5.0 (compatible; SEOBot/1.0)',
73
+ 'Accept-Language': 'es-ES,es;q=0.9'
74
  })
75
  return session
76
+
77
+ def _load_models(self):
78
+ """Carga modelos de Hugging Face optimizados"""
79
+ device = 0 if torch.cuda.is_available() else -1
80
  return {
81
+ 'summarizer': pipeline("summarization",
82
+ model="facebook/bart-large-cnn",
83
+ device=device),
84
+ 'ner': pipeline("ner",
85
+ model="dslim/bert-base-NER",
86
+ aggregation_strategy="simple",
87
+ device=device),
88
  'qa': pipeline("question-answering",
89
  model="deepset/roberta-base-squad2",
90
+ device=device),
91
+ 'semantic': SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2'),
92
+ 'spacy': spacy.load("es_core_news_lg")
 
 
 
93
  }
94
+
95
+ def _process_url(self, url):
96
+ """Procesa una URL y extrae su contenido"""
 
 
97
  try:
98
  response = self.session.get(url, timeout=15)
99
  response.raise_for_status()
100
+
101
  content_type = response.headers.get('Content-Type', '')
102
+ result = {'url': url, 'links': []}
103
+
104
  if 'application/pdf' in content_type:
105
+ result.update(self._process_pdf(response.content))
106
  elif 'text/html' in content_type:
107
+ result.update(self._process_html(response.text, url))
108
+
109
+ self._save_content(url, response.content)
110
+ return result
111
+
112
  except Exception as e:
113
+ logger.error(f"Error procesando {url}: {str(e)}")
114
+ return {'url': url, 'error': str(e)}
115
+
116
+ def _process_html(self, html, base_url):
117
+ """Procesa contenido HTML"""
118
+ soup = BeautifulSoup(html, 'lxml')
119
+ return {
120
+ 'content': self._clean_text(soup.get_text()),
121
+ 'type': 'html',
122
+ 'metadata': self._extract_metadata(soup),
123
+ 'links': self._extract_links(soup, base_url)
124
+ }
125
+
126
+ def _process_pdf(self, content):
127
+ """Procesa documentos PDF"""
128
  text = ""
129
  with BytesIO(content) as pdf_file:
130
  reader = PyPDF2.PdfReader(pdf_file)
131
  for page in reader.pages:
132
  text += page.extract_text()
133
+
 
 
 
134
  return {
135
+ 'content': self._clean_text(text),
136
  'type': 'pdf',
137
+ 'metadata': {'pages': len(reader.pages)}
 
 
138
  }
139
+
140
+ def _extract_links(self, soup, base_url):
141
+ """Extrae y clasifica enlaces"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  links = []
143
+ for tag in soup.find_all('a', href=True):
 
 
144
  href = tag['href']
145
  full_url = urljoin(base_url, href)
146
+ link_type = 'internal' if urlparse(full_url).netloc == urlparse(base_url).netloc else 'external'
147
 
 
 
 
 
 
 
 
 
148
  links.append({
149
  'url': full_url,
150
  'type': link_type,
151
+ 'anchor': self._clean_text(tag.text),
152
+ 'file_type': self._get_file_type(href)
153
  })
 
154
  return links
155
+
156
+ def _get_file_type(self, url):
157
+ """Determina el tipo de archivo por extensión"""
158
+ ext = Path(urlparse(url).path).suffix.lower()
159
+ return ext[1:] if ext else 'html'
160
+
161
+ def _clean_text(self, text):
162
+ """Limpieza avanzada de texto"""
163
+ text = re.sub(r'\s+', ' ', text)
164
+ return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
165
+
166
+ def _save_content(self, url, content):
167
+ """Almacena el contenido descargado"""
168
+ path = urlparse(url).path.lstrip('/')
169
+ save_path = self.base_dir / urlparse(url).netloc / path
170
+ save_path.parent.mkdir(parents=True, exist_ok=True)
171
+
172
+ with open(save_path.with_suffix(self._get_file_type(url)), 'wb') as f:
173
+ f.write(content)
174
+
175
+ def analyze_sitemap(self, sitemap_url):
176
+ """Analiza todo el sitemap y genera reportes"""
177
+ urls = self._parse_sitemap(sitemap_url)
178
+ results = []
179
+
180
+ with ThreadPoolExecutor(max_workers=4) as executor:
181
+ futures = [executor.submit(self._process_url, url) for url in urls]
182
+ for future in as_completed(futures):
183
+ results.append(future.result())
184
+ progress(len(results)/len(urls))
185
+
186
+ self.current_analysis = {
187
+ 'basic_stats': self._calculate_stats(results),
188
+ 'content_analysis': self._analyze_content(results),
189
+ 'link_analysis': self._analyze_links(results),
190
+ 'seo_recommendations': self._generate_recommendations(results)
191
  }
192
 
193
+ return self.current_analysis
194
+
195
+ def _parse_sitemap(self, sitemap_url):
196
+ """Parsea sitemaps XML incluyendo sitemaps indexados"""
197
+ # Implementación de parsing de sitemap (similar a versiones anteriores)
198
+ return []
199
+
200
+ def _calculate_stats(self, results):
201
+ """Calcula estadísticas básicas del análisis"""
202
+ return {
203
+ 'total_urls': len(results),
204
+ 'content_types': pd.Series([r.get('type', 'unknown') for r in results]).value_counts().to_dict(),
205
+ 'avg_content_length': np.mean([len(r.get('content', '')) for r in results])
206
+ }
207
+
208
+ def create_report(self):
209
+ """Crea un reporte descargable en múltiples formatos"""
210
+ report = {
211
+ 'timestamp': datetime.now().isoformat(),
212
+ 'analysis': self.current_analysis
 
 
 
 
213
  }
214
 
215
+ # Guardar en JSON
216
+ json_path = self.base_dir / 'seo_report.json'
217
+ with open(json_path, 'w') as f:
218
+ json.dump(report, f)
219
 
220
+ # Crear CSV con enlaces
221
+ df = pd.DataFrame([link for result in self.current_analysis['link_analysis'] for link in result['links']])
222
+ csv_path = self.base_dir / 'links_analysis.csv'
223
+ df.to_csv(csv_path, index=False)
 
224
 
225
+ return [str(json_path), str(csv_path)]
226
+
227
+ def create_visualization(self):
228
+ """Genera visualizaciones interactivas"""
229
+ fig, ax = plt.subplots()
230
+ pd.Series(self.current_analysis['basic_stats']['content_types']).plot.pie(
231
+ ax=ax,
232
+ title='Distribución de Tipos de Contenido',
233
+ ylabel=''
234
+ )
235
+ return fig
236
 
237
+ # Interface Gradio
238
+ def create_interface():
239
+ analyzer = SEOSpaceAnalyzer()
240
+
241
+ with gr.Blocks(title="SEO Analyzer Pro", theme=gr.themes.Soft()) as interface:
242
+ gr.Markdown("# 🕵️ SEO Analyzer Pro")
243
 
244
+ with gr.Row():
245
+ sitemap_url = gr.Textbox(label="URL del Sitemap", placeholder="https://www.ing.es/ennaranja/sitemap.xml")
246
+ analyze_btn = gr.Button("Analizar", variant="primary")
 
 
247
 
248
+ with gr.Tab("Resultados"):
249
+ json_output = gr.JSON(label="Análisis Completo")
250
+ plot_output = gr.Plot(label="Visualización")
251
 
252
+ with gr.Tab("Enlaces"):
253
+ internal_links = gr.Dataframe(label="Enlaces Internos")
254
+ external_links = gr.Dataframe(label="Enlaces Externos")
255
+
256
+ with gr.Tab("Descargas"):
257
+ report_download = gr.Files(label="Descargar Reporte")
258
+ download_btn = gr.Button("Generar Reporte", variant="secondary")
259
 
260
+ analyze_btn.click(
261
+ fn=analyzer.analyze_sitemap,
262
+ inputs=sitemap_url,
263
+ outputs=[json_output, plot_output, internal_links, external_links]
264
+ )
 
 
 
 
 
 
 
265
 
266
+ download_btn.click(
267
+ fn=analyzer.create_report,
268
+ outputs=report_download
 
 
 
 
 
 
 
 
 
 
 
269
  )
270
+
271
+ return interface
272
 
273
  if __name__ == "__main__":
274
+ interface = create_interface()
275
+ interface.launch(server_name="0.0.0.0", server_port=7860)