KingNish committed on
Commit
b6dd7da
·
verified ·
1 Parent(s): 5b3a290

Update chatbot.py

Browse files
Files changed (1) hide show
  1. chatbot.py +64 -26
chatbot.py CHANGED
@@ -224,42 +224,80 @@ def extract_images_from_msg_list(msg_list):
224
  all_images.append(c_)
225
  return all_images
226
 
227
- # Perform a Google search and return the results
228
- @lru_cache(maxsize=128)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  def extract_text_from_webpage(html_content):
230
  """Extracts visible text from HTML content using BeautifulSoup."""
231
  soup = BeautifulSoup(html_content, "html.parser")
232
- # Remove unwanted tags
233
- for tag in soup(["script", "style", "header", "footer", "nav"]):
 
 
234
  tag.extract()
235
- # Get the remaining visible text
236
- visible_text = soup.get_text(strip=True)
237
- return visible_text
238
 
239
- from duckduckgo_search import DDGS
 
 
 
 
 
 
 
 
 
 
 
240
 
241
- # Perform a Google search and return the results
242
- def search(term):
243
  all_results = []
244
- # Limit the number of characters from each webpage to stay under the token limit
245
- max_chars_per_page = 8000 # Adjust this value based on your token limit and average webpage length
246
- result_block = DDGS().text(term, max_results=2)
 
 
247
  for result in result_block:
248
  if 'href' in result:
249
  link = result["href"]
250
- try:
251
- webpage = requests.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
252
- webpage.raise_for_status()
253
- visible_text = extract_text_from_webpage(webpage.text)
254
- # Truncate text if it's too long
255
- if len(visible_text) > max_chars_per_page:
256
- visible_text = visible_text[:max_chars_per_page] + "..."
257
- all_results.append({"link": link, "text": visible_text})
258
- except requests.exceptions.RequestException as e:
259
- print(f"Error fetching or processing {link}: {e}")
260
- all_results.append({"link": link, "text": None})
261
- else:
262
- all_results.append({"link": None, "text": None})
263
  return all_results
264
 
265
  # Format the prompt for the language model
 
224
  all_images.append(c_)
225
  return all_images
226
 
227
+ from duckduckgo_search import DDGS
228
+ from threading import Thread
229
+ from queue import Queue
230
+ import random
231
+
232
+ def get_useragent():
233
+ return random.choice(_useragent_list)
234
+
235
+ _useragent_list = [
236
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
237
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
238
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
239
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
240
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
241
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
242
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
243
+ 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/605.1.15',
244
+ 'Mozilla/5.0 (iPad; CPU OS 16_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/605.1.15',
245
+ 'Mozilla/5.0 (Android 13; Mobile; rv:109.0) Gecko/109.0 Firefox/109.0',
246
+ 'Mozilla/5.0 (Linux; Android 13; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
247
+ 'Mozilla/5.0 (Linux; U; Android 11; en-us; SM-G991U) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/89.0.4387.119 Mobile Safari/537.36',
248
+ 'Mozilla/5.0 (Linux; Android 12; SM-G998U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
249
+ 'Mozilla/5.0 (Linux; Android 13; Pixel 7 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
250
+ 'Mozilla/5.0 (Linux; Android 12; LM-G900V) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
251
+ 'Mozilla/5.0 (Linux; Android 11; SM-G975U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
252
+ 'Mozilla/5.0 (Linux; Android 11; SM-N975U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
253
+ 'Mozilla/5.0 (Linux; Android 13; SM-S918U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
254
+ 'Mozilla/5.0 (Linux; Android 13; SM-F936U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36'
255
+ ]
256
+
257
@lru_cache(maxsize=512)
def extract_text_from_webpage(html_content):
    """Extract the visible text from an HTML document using BeautifulSoup.

    Non-content elements (scripts, styles, navigation, media, forms, head
    metadata, ...) are removed from the parse tree before the text is
    collected.

    Args:
        html_content: The raw HTML of the page as a string. It is also the
            ``lru_cache`` key, so repeated calls with identical HTML reuse
            the previously extracted text.

    Returns:
        The remaining text of the page, with each text fragment stripped and
        the fragments joined by single spaces.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    for tag in soup(["script", "style", "header", "footer", "nav", "aside",
                     "figure", "figcaption", "template", "form", "input",
                     "svg", "canvas", "video", "audio", "head", "meta",
                     "link", "img", "iframe", "noscript"]):
        tag.extract()
    # Without an explicit separator get_text() concatenates adjacent text
    # nodes directly, fusing words from neighbouring elements
    # ("<h1>Title</h1><p>Body</p>" -> "TitleBody").
    return soup.get_text(separator=" ", strip=True)
 
 
267
 
268
def fetch_and_extract(link, max_chars_per_page, queue, timeout=10):
    """Fetch a webpage and put its extracted text on ``queue``.

    Intended to run as a worker in its own thread. Exactly one result dict is
    always put on ``queue``, whether or not the fetch succeeds, so the caller
    can rely on getting one entry per link.

    Args:
        link: URL of the page to download.
        max_chars_per_page: Maximum number of characters of page text to
            keep; longer text is cut to this length and suffixed with "...".
        queue: Queue that receives ``{"link": link, "text": <str or None>}``.
            ``text`` is ``None`` when the page could not be fetched or
            processed.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block this worker, and any caller
            joining on it, indefinitely.
    """
    text = None
    try:
        webpage = requests.get(
            link,
            headers={"User-Agent": get_useragent()},
            timeout=timeout,
        )
        webpage.raise_for_status()
        visible_text = extract_text_from_webpage(webpage.text)
        if len(visible_text) > max_chars_per_page:
            visible_text = visible_text[:max_chars_per_page] + "..."
        text = visible_text
    except Exception as e:
        # Thread boundary: an uncaught error here would kill the worker
        # before it reports anything, silently dropping this link from the
        # caller's results. Report the failure and fall through instead.
        print(f"Error fetching or processing {link}: {e}")
    queue.put({"link": link, "text": text})
280
 
281
def search(term, max_results=2, max_chars_per_page=8000, max_threads=5):
    """Run a DuckDuckGo text search and fetch the text of each result page.

    Pages are downloaded concurrently by ``fetch_and_extract`` workers.

    Args:
        term: The search query.
        max_results: Maximum number of search results requested from
            DuckDuckGo.
        max_chars_per_page: Per-page character limit passed on to
            ``fetch_and_extract``.
        max_threads: Upper bound on the number of pages fetched at the same
            time. Values below 1 are treated as 1.

    Returns:
        A list of ``{"link": <url>, "text": <str or None>}`` dicts, one per
        search result that has an ``href``, in the order the search returned
        them. ``text`` is ``None`` for pages that could not be fetched.
    """
    # Local import keeps this block self-contained within the file.
    from concurrent.futures import ThreadPoolExecutor

    result_block = DDGS().text(term, max_results=max_results) or []
    links = [result["href"] for result in result_block if "href" in result]
    if not links:
        return []

    queue = Queue()
    worker_count = max(1, min(max_threads, len(links)))
    # Leaving the ``with`` block waits for every submitted fetch to finish.
    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        for link in links:
            pool.submit(fetch_and_extract, link, max_chars_per_page, queue)

    # Workers report in completion order; index the results by link so they
    # can be returned in search-ranking order instead.
    fetched = {}
    while not queue.empty():
        item = queue.get()
        fetched.setdefault(item["link"], item)

    # A worker that failed without reporting still yields an entry.
    all_results = [
        fetched.get(link, {"link": link, "text": None}) for link in links
    ]
    return all_results
302
 
303
  # Format the prompt for the language model