not-lain committed on
Commit d9c1e67
1 Parent(s): 87d6c49

add url parser

Files changed (1)
  1. app.py +212 -1
app.py CHANGED
@@ -9,7 +9,208 @@ from typing import Optional, List
 import string
 import random
 import re
+import requests
+from bs4 import BeautifulSoup
+import logging
+import time
+from urllib.parse import urlparse
+
+class URLTextExtractor:
+    """
+    A comprehensive utility for extracting text content from web pages with advanced features.
+
+    Features:
+    - Rotating User-Agents to mimic different browsers
+    - Robust error handling and retry mechanism
+    - Section preservation for maintaining document structure
+    - Configurable extraction options
+    - Logging support
+
+    Attributes:
+        USER_AGENTS (list): A comprehensive list of user agent strings to rotate through.
+        logger (logging.Logger): Logger for tracking extraction attempts and errors.
+
+    Example:
+        >>> extractor = URLTextExtractor()
+        >>> text = extractor.extract_text_from_url('https://example.com')
+        >>> print(text)
+    """
+
+    # Expanded list of user agents including mobile and less common browsers
+    USER_AGENTS = [
+        # Desktop Browsers
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0',
+
+        # Mobile Browsers
+        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1',
+        'Mozilla/5.0 (Linux; Android 10; SM-G970F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Mobile Safari/537.36',
+    ]
+
+    def __init__(self, logger=None):
+        """
+        Initialize the URLTextExtractor.
+
+        Args:
+            logger (logging.Logger, optional): Custom logger.
+                If not provided, creates a default logger.
+        """
+        self.logger = logger or self._create_default_logger()
+
+    def _create_default_logger(self):
+        """
+        Create a default logger for tracking the extraction process.
+
+        Returns:
+            logging.Logger: Configured logger instance
+        """
+        logger = logging.getLogger(__name__)
+        logger.setLevel(logging.INFO)
+        handler = logging.StreamHandler()
+        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+        return logger
+
+    def _process_element_text(self, element):
+        """
+        Process text within an element, handling anchor tags specially.
+
+        Args:
+            element (bs4.element.Tag): BeautifulSoup element to process
+
+        Returns:
+            str: Processed text with proper spacing
+        """
+        # Replace anchor tags with spaced text
+        for a_tag in element.find_all('a'):
+            # Add spaces around the anchor text
+            a_tag.replace_with(f' {a_tag.get_text(strip=True)} ')
+
+        # Get text with separator
+        return element.get_text(separator=' ', strip=True)
+
+    def extract_text_from_url(self, url, max_retries=3, preserve_sections=True,
+                              min_section_length=30, allowed_tags=None):
+        """
+        Extract text content from a given URL with advanced configuration.
+
+        Args:
+            url (str): The URL of the webpage to extract text from.
+            max_retries (int, optional): Maximum number of retry attempts. Defaults to 3.
+            preserve_sections (bool, optional): Whether to preserve section separations. Defaults to True.
+            min_section_length (int, optional): Minimum length of text sections to include. Defaults to 30.
+            allowed_tags (list, optional): Specific HTML tags to extract text from.
+                If None, uses a default set of content-rich tags.
+
+        Returns:
+            str: Extracted text content from the webpage
+
+        Raises:
+            ValueError: If URL cannot be fetched after maximum retries
+            requests.RequestException: For network-related errors
+
+        Examples:
+            >>> extractor = URLTextExtractor()
+            >>> text = extractor.extract_text_from_url('https://example.com')
+            >>> text = extractor.extract_text_from_url('https://example.com', preserve_sections=False)
+        """
+        # Default allowed tags if not specified
+        if allowed_tags is None:
+            allowed_tags = ['p', 'div', 'article', 'section', 'main',
+                            'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+
+        # Validate URL
+        try:
+            parsed_url = urlparse(url)
+            if not all([parsed_url.scheme, parsed_url.netloc]):
+                # raise ValueError("Invalid URL format")
+                return None
+        except Exception as e:
+            self.logger.error(f"URL parsing error: {e}")
+            raise
+
+        for attempt in range(max_retries):
+            try:
+                # Randomly select a user agent
+                headers = {
+                    'User-Agent': random.choice(self.USER_AGENTS),
+                    'Accept-Language': 'en-US,en;q=0.9',
+                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
+                }
+
+                # Send a GET request to the URL
+                response = requests.get(
+                    url,
+                    headers=headers,
+                    timeout=10,
+                    allow_redirects=True
+                )
+
+                # Raise an exception for bad status codes
+                response.raise_for_status()
+
+                # Log successful fetch
+                self.logger.info(f"Successfully fetched URL: {url}")
+
+                # Parse the HTML content
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Remove unwanted elements
+                for script in soup(["script", "style", "head", "header", "footer", "nav"]):
+                    script.decompose()
+
+                # Extract text with section preservation
+                if preserve_sections:
+                    # Extract text from specified tags
+                    sections = []
+                    for tag in allowed_tags:
+                        for element in soup.find_all(tag):
+                            # Process element text, handling anchor tags
+                            section_text = self._process_element_text(element)
+
+                            # Only add sections meeting minimum length
+                            if len(section_text) >= min_section_length:
+                                sections.append(section_text)
+
+                    # Join sections with newline
+                    text = '\n'.join(sections)
+                else:
+                    # If not preserving sections, use modified text extraction
+                    text = ' '.join(self._process_element_text(element)
+                                    for tag in allowed_tags
+                                    for element in soup.find_all(tag))
+
+                # Remove excessive whitespace and empty lines
+                text = '\n'.join(line.strip() for line in text.split('\n') if line.strip())
+
+                return text
+
+            except (requests.RequestException, ValueError) as e:
+                # Log error details
+                self.logger.warning(f"Attempt {attempt + 1} failed: {e}")
+
+                # If it's the last retry, raise the error
+                if attempt == max_retries - 1:
+                    self.logger.error(f"Failed to fetch URL after {max_retries} attempts")
+                    raise ValueError(f"Error fetching URL after {max_retries} attempts: {e}")
+
+                # Exponential backoff
+                wait_time = 2 ** attempt
+                self.logger.info(f"Waiting {wait_time} seconds before retry")
+                time.sleep(wait_time)
+
+        # Fallback (though this should never be reached due to the raise in the loop)
+        return None

 def extract_text_from_pptx(file_path):
     prs = Presentation(file_path)
 
@@ -195,6 +396,9 @@ def sanitize_list_of_lists(text: str) -> Optional[List[List]]:
         print(f"Error parsing the list of lists: {e}")
         return None

+extractor = URLTextExtractor()
+def parse_url(url):
+    return extractor.extract_text_from_url(url)

 pdf_to_img = gr.Interface(
     convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
 
@@ -239,13 +443,20 @@ str_to_json = gr.Interface(
     ],
 )

+url_parser = gr.Interface(
+    parse_url,
+    inputs=["text"],
+    outputs=["text"],
+    api_name="url_to_text",
+)
 demo = gr.TabbedInterface(
-    [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text, str_to_json],
+    [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text, url_parser, str_to_json],
     [
         "PDF to Image",
         "Extract PDF Text",
         "Extract DOC/DOCX Text",
         "Extract PPTX/PPT Text",
+        "Extract text from URL",
         "Extract Json",
     ],
 )
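
For reference, the new class can also be driven directly, outside the Gradio interfaces. The sketch below is illustrative only: it assumes requests and beautifulsoup4 are installed and that the app.py above is importable as `app`; the module name and target URL are examples taken from the docstring, not part of the commit.

# Illustrative usage of URLTextExtractor (hypothetical import path; adjust to your setup).
from app import URLTextExtractor

extractor = URLTextExtractor()

# Returns None for a malformed URL; raises ValueError after max_retries failed fetches.
text = extractor.extract_text_from_url("https://example.com", preserve_sections=True)
if text is None:
    print("URL could not be parsed")
else:
    print(text[:500])

Through the Gradio API, the same behaviour is exposed to callers via the url_to_text api_name registered on the new url_parser interface.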