add url parser
app.py CHANGED
@@ -9,7 +9,201 @@ from typing import Optional, List
 import string
 import random
 import re
+import requests
+from bs4 import BeautifulSoup
+import logging
+import time
+from urllib.parse import urlparse
 
+
+class URLTextExtractor:
+    """
+    A comprehensive utility for extracting text content from web pages with advanced features.
+
+    Features:
+    - Rotating User-Agents to mimic different browsers
+    - Robust error handling and retry mechanism
+    - Section preservation for maintaining document structure
+    - Configurable extraction options
+    - Logging support
+
+    Attributes:
+        USER_AGENTS (list): A comprehensive list of user agent strings to rotate through.
+        logger (logging.Logger): Logger for tracking extraction attempts and errors.
+
+    Example:
+        >>> extractor = URLTextExtractor()
+        >>> text = extractor.extract_text_from_url('https://example.com')
+        >>> print(text)
+    """
+
+    # Expanded list of user agents including mobile and less common browsers
+    USER_AGENTS = [
+        # Desktop Browsers
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0',
+
+        # Mobile Browsers
+        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1',
+        'Mozilla/5.0 (Linux; Android 10; SM-G970F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Mobile Safari/537.36',
+    ]
+
+    def __init__(self, logger=None):
+        """
+        Initialize the URLTextExtractor.
+
+        Args:
+            logger (logging.Logger, optional): Custom logger.
+                If not provided, creates a default logger.
+        """
+        self.logger = logger or self._create_default_logger()
+
+    def _create_default_logger(self):
+        """
+        Create a default logger for tracking the extraction process.
+
+        Returns:
+            logging.Logger: Configured logger instance
+        """
+        logger = logging.getLogger(__name__)
+        logger.setLevel(logging.INFO)
+        handler = logging.StreamHandler()
+        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+        return logger
+
+    def _process_element_text(self, element):
+        """
+        Process text within an element, handling anchor tags specially.
+
+        Args:
+            element (bs4.element.Tag): BeautifulSoup element to process
+
+        Returns:
+            str: Processed text with proper spacing
+        """
+        # Replace anchor tags with spaced text
+        for a_tag in element.find_all('a'):
+            # Add spaces around the anchor text
+            a_tag.replace_with(f' {a_tag.get_text(strip=True)} ')
+
+        # Get text with separator
+        return element.get_text(separator=' ', strip=True)
+
+    def extract_text_from_url(self, url, max_retries=3, preserve_sections=True,
+                              min_section_length=30, allowed_tags=None):
+        """
+        Extract text content from a given URL with advanced configuration.
+
+        Args:
+            url (str): The URL of the webpage to extract text from.
+            max_retries (int, optional): Maximum number of retry attempts. Defaults to 3.
+            preserve_sections (bool, optional): Whether to preserve section separations. Defaults to True.
+            min_section_length (int, optional): Minimum length of text sections to include. Defaults to 30.
+            allowed_tags (list, optional): Specific HTML tags to extract text from.
+                If None, uses a default set of content-rich tags.
+
+        Returns:
+            str: Extracted text content from the webpage
+
+        Raises:
+            ValueError: If URL cannot be fetched after maximum retries
+            requests.RequestException: For network-related errors
+
+        Examples:
+            >>> extractor = URLTextExtractor()
+            >>> text = extractor.extract_text_from_url('https://example.com')
+            >>> text = extractor.extract_text_from_url('https://example.com', preserve_sections=False)
+        """
+        # Default allowed tags if not specified
+        if allowed_tags is None:
+            allowed_tags = ['p', 'div', 'article', 'section', 'main',
+                            'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+
+        # Validate URL
+        try:
+            parsed_url = urlparse(url)
+            if not all([parsed_url.scheme, parsed_url.netloc]):
+                # raise ValueError("Invalid URL format")
+                return None
+        except Exception as e:
+            self.logger.error(f"URL parsing error: {e}")
+            raise
+
+        for attempt in range(max_retries):
+            try:
+                # Randomly select a user agent
+                headers = {
+                    'User-Agent': random.choice(self.USER_AGENTS),
+                    'Accept-Language': 'en-US,en;q=0.9',
+                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
+                }
+
+                # Send a GET request to the URL
+                response = requests.get(
+                    url,
+                    headers=headers,
+                    timeout=10,
+                    allow_redirects=True
+                )
+
+                # Raise an exception for bad status codes
+                response.raise_for_status()
+
+                # Log successful fetch
+                self.logger.info(f"Successfully fetched URL: {url}")
+
+                # Parse the HTML content
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Remove unwanted elements
+                for script in soup(["script", "style", "head", "header", "footer", "nav"]):
+                    script.decompose()
+
+                # Extract text with section preservation
+                if preserve_sections:
+                    # Extract text from specified tags
+                    sections = []
+                    for tag in allowed_tags:
+                        for element in soup.find_all(tag):
+                            # Process element text, handling anchor tags
+                            section_text = self._process_element_text(element)
+
+                            # Only add sections meeting minimum length
+                            if len(section_text) >= min_section_length:
+                                sections.append(section_text)
+
+                    # Join sections with newline
+                    text = '\n'.join(sections)
+                else:
+                    # If not preserving sections, use modified text extraction
+                    text = ' '.join(self._process_element_text(element)
+                                    for tag in allowed_tags
+                                    for element in soup.find_all(tag))
+
+                # Remove excessive whitespace and empty lines
+                text = '\n'.join(line.strip() for line in text.split('\n') if line.strip())
+
+                return text
+
+            except (requests.RequestException, ValueError) as e:
+                # Log error details
+                self.logger.warning(f"Attempt {attempt + 1} failed: {e}")
+
+                # If it's the last retry, raise the error
+                if attempt == max_retries - 1:
+                    self.logger.error(f"Failed to fetch URL after {max_retries} attempts")
+                    raise ValueError(f"Error fetching URL after {max_retries} attempts: {e}")
+
+                # Exponential backoff
+                wait_time = 2 ** attempt
+                self.logger.info(f"Waiting {wait_time} seconds before retry")
+                time.sleep(wait_time)
+
+        # Fallback (though this should never be reached due to the raise in the loop)
+        return None
 
 def extract_text_from_pptx(file_path):
     prs = Presentation(file_path)
@@ -195,6 +389,9 @@ def sanitize_list_of_lists(text: str) -> Optional[List[List]]:
         print(f"Error parsing the list of lists: {e}")
         return None
 
+extractor = URLTextExtractor()
+def parse_url(url):
+    return extractor.extract_text_from_url(url)
 
 pdf_to_img = gr.Interface(
     convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
@@ -239,13 +436,20 @@ str_to_json = gr.Interface(
     ],
 )
 
+url_parser = gr.Interface(
+    parse_url,
+    inputs=["text"],
+    outputs=["text"],
+    api_name="url_to_text",
+)
 demo = gr.TabbedInterface(
-    [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text, str_to_json],
+    [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text, url_parser, str_to_json],
     [
         "PDF to Image",
         "Extract PDF Text",
         "Extract DOC/DOCX Text",
         "Extract PPTX/PPT Text",
+        "Extract text from URL",
         "Extract Json",
     ],
 )
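
For reference, a minimal sketch of how the new url_to_text endpoint could be exercised once this change is deployed, assuming the standard gradio_client API; the Space id below is a placeholder, not the actual one:

    # Hypothetical client-side call to the new endpoint; the Space id is a placeholder.
    from gradio_client import Client

    client = Client("owner/space-name")  # placeholder: substitute the real Space id
    text = client.predict(
        "https://example.com",  # URL forwarded to parse_url -> URLTextExtractor.extract_text_from_url
        api_name="/url_to_text",
    )
    print(text)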
|