Abhaykoul committed on
Commit
e344f2d
1 Parent(s): 1ee12e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -13
app.py CHANGED
@@ -177,10 +177,13 @@ async def chat(
177
 
178
  def extract_text_from_webpage(html_content):
179
  """Extracts visible text from HTML content using BeautifulSoup."""
180
- soup = BeautifulSoup(html_content)
181
- for tag in soup(["script", "style", "header", "footer"]):
 
182
  tag.extract()
183
- return soup.get_text(strip=True)
 
 
184
 
185
  async def fetch_and_extract(url, max_chars, proxy: Optional[str] = None):
186
  """Fetches a URL and extracts text asynchronously."""
@@ -245,15 +248,19 @@ async def web_search_and_extract(
245
 
246
  def extract_text_from_webpage2(html_content):
247
  """Extracts visible text from HTML content using BeautifulSoup."""
248
- soup = BeautifulSoup(html_content)
249
- for tag in soup(["script", "style", "header", "footer"]):
 
250
  tag.extract()
251
- return soup.get_text(strip=True)
 
 
252
 
253
- def fetch_and_extract2(url, max_chars):
254
  """Fetches a URL and extracts text using threading."""
 
255
  try:
256
- response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
257
  response.raise_for_status()
258
  html_content = response.text
259
  visible_text = extract_text_from_webpage2(html_content)
@@ -267,19 +274,20 @@ def fetch_and_extract2(url, max_chars):
267
  @app.get("/api/websearch-and-extract-threading")
268
  def web_search_and_extract_threading(
269
  q: str,
270
- max_results: int = 10,
271
  timelimit: Optional[str] = None,
272
  safesearch: str = "moderate",
273
  region: str = "wt-wt",
274
  backend: str = "html",
275
- max_chars: int = 10000,
276
- extract_only: bool = True
 
277
  ):
278
  """
279
  Searches using WEBS, extracts text from the top results using threading, and returns both.
280
  """
281
  try:
282
- with WEBS() as webs:
283
  # Perform WEBS search
284
  search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
285
  timelimit=timelimit, backend=backend, max_results=max_results)
@@ -289,7 +297,7 @@ def web_search_and_extract_threading(
289
  threads = []
290
  for result in search_results:
291
  if 'href' in result:
292
- thread = threading.Thread(target=lambda: extracted_results.append(fetch_and_extract2(result['href'], max_chars)))
293
  threads.append(thread)
294
  thread.start()
295
 
 
177
 
178
  def extract_text_from_webpage(html_content):
179
  """Extracts visible text from HTML content using BeautifulSoup."""
180
+ soup = BeautifulSoup(html_content, "html.parser")
181
+ # Remove unwanted tags
182
+ for tag in soup(["script", "style", "header", "footer", "nav"]):
183
  tag.extract()
184
+ # Get the remaining visible text
185
+ visible_text = soup.get_text(strip=True)
186
+ return visible_text
187
 
188
  async def fetch_and_extract(url, max_chars, proxy: Optional[str] = None):
189
  """Fetches a URL and extracts text asynchronously."""
 
248
 
249
  def extract_text_from_webpage2(html_content):
250
  """Extracts visible text from HTML content using BeautifulSoup."""
251
+ soup = BeautifulSoup(html_content, "html.parser")
252
+ # Remove unwanted tags
253
+ for tag in soup(["script", "style", "header", "footer", "nav"]):
254
  tag.extract()
255
+ # Get the remaining visible text
256
+ visible_text = soup.get_text(strip=True)
257
+ return visible_text
258
 
259
+ def fetch_and_extract2(url, max_chars, proxy: Optional[str] = None):
260
  """Fetches a URL and extracts text using threading."""
261
+ proxies = {'http': proxy, 'https': proxy} if proxy else None
262
  try:
263
+ response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, proxies=proxies)
264
  response.raise_for_status()
265
  html_content = response.text
266
  visible_text = extract_text_from_webpage2(html_content)
 
274
  @app.get("/api/websearch-and-extract-threading")
275
  def web_search_and_extract_threading(
276
  q: str,
277
+ max_results: int = 3,
278
  timelimit: Optional[str] = None,
279
  safesearch: str = "moderate",
280
  region: str = "wt-wt",
281
  backend: str = "html",
282
+ max_chars: int = 6000,
283
+ extract_only: bool = True,
284
+ proxy: Optional[str] = None
285
  ):
286
  """
287
  Searches using WEBS, extracts text from the top results using threading, and returns both.
288
  """
289
  try:
290
+ with WEBS(proxy=proxy) as webs:
291
  # Perform WEBS search
292
  search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
293
  timelimit=timelimit, backend=backend, max_results=max_results)
 
297
  threads = []
298
  for result in search_results:
299
  if 'href' in result:
300
+ thread = threading.Thread(target=lambda: extracted_results.append(fetch_and_extract2(result['href'], max_chars, proxy)))
301
  threads.append(thread)
302
  thread.start()
303