Spaces:

OEvortex
/

Webscout-API

Running

App Files Files Community

Abhaykoul committed on Jul 26

Commit

e344f2d

•

1 Parent(s): 1ee12e5

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -13

app.py CHANGED Viewed

@@ -177,10 +177,13 @@ async def chat(
 def extract_text_from_webpage(html_content):
  """Extracts visible text from HTML content using BeautifulSoup."""
- soup = BeautifulSoup(html_content)
- for tag in soup(["script", "style", "header", "footer"]):
  tag.extract()
- return soup.get_text(strip=True)
 async def fetch_and_extract(url, max_chars, proxy: Optional[str] = None):
  """Fetches a URL and extracts text asynchronously."""
@@ -245,15 +248,19 @@ async def web_search_and_extract(
 def extract_text_from_webpage2(html_content):
  """Extracts visible text from HTML content using BeautifulSoup."""
- soup = BeautifulSoup(html_content)
- for tag in soup(["script", "style", "header", "footer"]):
  tag.extract()
- return soup.get_text(strip=True)
-def fetch_and_extract2(url, max_chars):
  """Fetches a URL and extracts text using threading."""
  try:
- response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
  response.raise_for_status()
  html_content = response.text
  visible_text = extract_text_from_webpage2(html_content)
@@ -267,19 +274,20 @@ def fetch_and_extract2(url, max_chars):
 @app.get("/api/websearch-and-extract-threading")
 def web_search_and_extract_threading(
  q: str,
- max_results: int = 10,
  timelimit: Optional[str] = None,
  safesearch: str = "moderate",
  region: str = "wt-wt",
  backend: str = "html",
- max_chars: int = 10000,
- extract_only: bool = True
 ):
  """
  Searches using WEBS, extracts text from the top results using threading, and returns both.
  """
  try:
- with WEBS() as webs:
  # Perform WEBS search
  search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
  timelimit=timelimit, backend=backend, max_results=max_results)
@@ -289,7 +297,7 @@ def web_search_and_extract_threading(
  threads = []
  for result in search_results:
  if 'href' in result:
- thread = threading.Thread(target=lambda: extracted_results.append(fetch_and_extract2(result['href'], max_chars)))
  threads.append(thread)
  thread.start()

 def extract_text_from_webpage(html_content):
  """Extracts visible text from HTML content using BeautifulSoup."""
+ soup = BeautifulSoup(html_content, "html.parser")
+ # Remove unwanted tags
+ for tag in soup(["script", "style", "header", "footer", "nav"]):
  tag.extract()
+ # Get the remaining visible text
+ visible_text = soup.get_text(strip=True)
+ return visible_text
 async def fetch_and_extract(url, max_chars, proxy: Optional[str] = None):
  """Fetches a URL and extracts text asynchronously."""
 def extract_text_from_webpage2(html_content):
  """Extracts visible text from HTML content using BeautifulSoup."""
+ soup = BeautifulSoup(html_content, "html.parser")
+ # Remove unwanted tags
+ for tag in soup(["script", "style", "header", "footer", "nav"]):
  tag.extract()
+ # Get the remaining visible text
+ visible_text = soup.get_text(strip=True)
+ return visible_text
+def fetch_and_extract2(url, max_chars, proxy: Optional[str] = None):
  """Fetches a URL and extracts text using threading."""
+ proxies = {'http': proxy, 'https': proxy} if proxy else None
  try:
+ response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, proxies=proxies)
  response.raise_for_status()
  html_content = response.text
  visible_text = extract_text_from_webpage2(html_content)
 @app.get("/api/websearch-and-extract-threading")
 def web_search_and_extract_threading(
  q: str,
+ max_results: int = 3,
  timelimit: Optional[str] = None,
  safesearch: str = "moderate",
  region: str = "wt-wt",
  backend: str = "html",
+ max_chars: int = 6000,
+ extract_only: bool = True,
+ proxy: Optional[str] = None
 ):
  """
  Searches using WEBS, extracts text from the top results using threading, and returns both.
  """
  try:
+ with WEBS(proxy=proxy) as webs:
  # Perform WEBS search
  search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
  timelimit=timelimit, backend=backend, max_results=max_results)
  threads = []
  for result in search_results:
  if 'href' in result:
+ thread = threading.Thread(target=lambda: extracted_results.append(fetch_and_extract2(result['href'], max_chars, proxy)))
  threads.append(thread)
  thread.start()