Spaces:
Running
Running
from bs4 import BeautifulSoup | |
from urllib.parse import urljoin | |
from ..utils import get_relevant_images, extract_title | |
class BeautifulSoupScraper:
    """Fetch one web page and pull out its readable text, image URLs and title."""

    # Seconds to wait for the HTTP response before giving up.
    REQUEST_TIMEOUT = 4

    # Tags whose text is collected by ``get_content_from_url``.
    CONTENT_TAGS = ("h1", "h2", "h3", "h4", "h5", "p", "li", "div", "span")

    # An element is skipped when its direct parent carries one of these classes.
    BOILERPLATE_CLASSES = frozenset({"nav", "menu", "sidebar", "footer"})

    # Elements with fewer words than this are skipped (likely buttons or links).
    MIN_WORDS = 3

    def __init__(self, link, session=None):
        """Store the target URL and the HTTP session used to fetch it.

        Args:
            link: URL of the page to scrape.
            session: Object exposing ``get(url, timeout=...)`` whose result has
                ``content`` and ``encoding`` attributes (presumably a
                ``requests.Session`` -- confirm against callers). ``scrape``
                cannot fetch anything while this is ``None``.
        """
        self.link = link
        self.session = session

    def scrape(self):
        """Download ``self.link`` and extract its text, images and title.

        The page is fetched through ``self.session``, parsed with the "lxml"
        parser, stripped of ``<script>`` and ``<style>`` tags, and then reduced
        to cleaned text by ``get_content_from_url``.

        Returns:
            A ``(content, image_urls, title)`` tuple: ``content`` is the
            cleaned page text, ``image_urls`` is whatever
            ``get_relevant_images`` returns and ``title`` is whatever
            ``extract_title`` returns. If anything fails, an error message is
            printed and ``("", [], "")`` is returned instead of raising.
        """
        if self.session is None:
            # Without this guard the failure surfaces as an opaque
            # "'NoneType' object has no attribute 'get'" message.
            print("Error! : no HTTP session was provided to BeautifulSoupScraper")
            return "", [], ""

        # Scraping is best-effort: any network or parsing failure is reported
        # and turned into an empty result rather than propagated.
        try:
            response = self.session.get(self.link, timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(
                response.content, "lxml", from_encoding=response.encoding
            )

            # Drop non-visible content before any text is collected.
            for script_or_style in soup(["script", "style"]):
                script_or_style.extract()

            content = self._clean_text(self.get_content_from_url(soup))
            image_urls = get_relevant_images(soup, self.link)
            title = extract_title(soup)
            return content, image_urls, title
        except Exception as e:
            print("Error! : " + str(e))
            return "", [], ""

    @staticmethod
    def _clean_text(raw_content):
        """Strip every line, split multi-phrase lines and drop blank pieces."""
        lines = (line.strip() for line in raw_content.splitlines())
        # Split on a *double* space: it separates phrases that were laid out
        # side by side, while a single-space split would put every word on a
        # line of its own.
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        return "\n".join(chunk for chunk in chunks if chunk)

    def get_content_from_url(self, soup: "BeautifulSoup") -> str:
        """Get the relevant text from the soup with improved filtering.

        Every tag listed in ``CONTENT_TAGS`` is visited. An element is skipped
        when it has fewer than ``MIN_WORDS`` words (this also covers empty
        elements) or when its direct parent has a class listed in
        ``BOILERPLATE_CLASSES``.

        NOTE(review): container tags (``div``, ``span``) are collected
        alongside the tags nested inside them, so the same text can appear
        more than once in the result.

        Args:
            soup: Parsed document to extract text from.

        Returns:
            The kept elements' text, whitespace-normalised, joined by blank
            lines. An empty string when nothing qualifies.
        """
        text_elements = []

        for element in soup.find_all(list(self.CONTENT_TAGS)):
            # Compute the element's words once; they drive both the length
            # filter and the whitespace-normalised output.
            words = element.text.split()
            if len(words) < self.MIN_WORDS:
                continue

            # Check if the element is likely to be navigation or a menu.
            parent = element.parent
            parent_classes = parent.get("class", []) if parent is not None else []
            if isinstance(parent_classes, str):
                # Some parser configurations expose ``class`` as one string.
                parent_classes = parent_classes.split()
            if any(cls in self.BOILERPLATE_CLASSES for cls in parent_classes or []):
                continue

            text_elements.append(" ".join(words))

        # Join all text elements with a blank line between them.
        return "\n\n".join(text_elements)