Abhaykoul committed on
Commit
25af675
1 Parent(s): 0e6e27c

Update webscout.py

Browse files
Files changed (1) hide show
  1. webscout.py +51 -31
webscout.py CHANGED
@@ -5,11 +5,14 @@ from datetime import datetime, timezone
5
  from decimal import Decimal
6
  from functools import cached_property
7
  from itertools import cycle, islice
 
8
  from threading import Event
9
  from types import TracebackType
10
  from typing import Dict, List, Optional, Tuple, Type, Union, cast
11
 
12
- import pyreqwest_impersonate as pri # type: ignore
 
 
13
 
14
  try:
15
  from lxml.etree import _Element
@@ -26,29 +29,33 @@ from html import unescape
26
  from math import atan2, cos, radians, sin, sqrt
27
  from typing import Any, Dict, List, Union
28
  from urllib.parse import unquote
29
- import orjson
30
- import requests
31
- import base64
32
- from typing import List, Dict, Union
33
- import json
34
- import requests
35
- import base64
36
- from typing import List, Dict, Union
37
 
 
 
 
 
 
 
 
 
38
 
39
  REGEX_STRIP_TAGS = re.compile("<.*?>")
40
 
41
 
42
  def json_dumps(obj: Any) -> str:
43
  try:
44
- return orjson.dumps(obj).decode("utf-8")
 
 
 
 
45
  except Exception as ex:
46
  raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
47
 
48
 
49
  def json_loads(obj: Union[str, bytes]) -> Any:
50
  try:
51
- return orjson.loads(obj)
52
  except Exception as ex:
53
  raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
54
 
@@ -101,8 +108,6 @@ def _calculate_distance(lat1: Decimal, lon1: Decimal, lat2: Decimal, lon2: Decim
101
  c = 2 * atan2(sqrt(a), sqrt(1 - a))
102
  return R * c
103
 
104
- logger = logging.getLogger("webscout.WEBS")
105
-
106
  class WebscoutE(Exception):
107
  """Base exception class for search."""
108
 
@@ -121,14 +126,26 @@ class AllProvidersFailure(Exception):
121
  """None of the providers generated response successfully"""
122
 
123
  pass
 
 
 
124
  class WEBS:
125
  """webscout class to get search results from duckduckgo.com."""
126
 
127
  _executor: ThreadPoolExecutor = ThreadPoolExecutor()
 
 
 
 
 
 
 
 
 
128
 
129
  def __init__(
130
  self,
131
- headers: Optional[Dict[str, str]] = {'0': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', '1': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', '2': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', '3': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', '4': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', '5': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62', '6': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0', '7': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/605.1.15', '8': 'Mozilla/5.0 (iPad; CPU OS 16_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/605.1.15', '9': 'Mozilla/5.0 (Android 13; Mobile; rv:109.0) Gecko/109.0 Firefox/109.0', '10': 'Mozilla/5.0 (Linux; Android 13; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36', '11': 'Mozilla/5.0 (Linux; U; Android 11; en-us; SM-G991U) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/89.0.4387.119 Mobile Safari/537.36', '12': 'Mozilla/5.0 (Linux; Android 12; SM-G998U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36', '13': 'Mozilla/5.0 (Linux; Android 13; Pixel 7 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36', '14': 'Mozilla/5.0 (Linux; Android 12; LM-G900V) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36', '15': 'Mozilla/5.0 (Linux; Android 11; SM-G975U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36', '16': 
'Mozilla/5.0 (Linux; Android 11; SM-N975U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36', '17': 'Mozilla/5.0 (Linux; Android 13; SM-S918U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36', '18': 'Mozilla/5.0 (Linux; Android 13; SM-F936U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36'},
132
  proxy: Optional[str] = None,
133
  proxies: Union[Dict[str, str], str, None] = None, # deprecated
134
  timeout: Optional[int] = 10,
@@ -152,9 +169,9 @@ class WEBS:
152
  headers=self.headers,
153
  proxy=self.proxy,
154
  timeout=timeout,
155
- cookie_store=False,
156
  referer=True,
157
- impersonate="chrome_124",
158
  follow_redirects=False,
159
  verify=False,
160
  )
@@ -208,13 +225,14 @@ class WEBS:
208
  resp_content = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords})
209
  return _extract_vqd(resp_content, keywords)
210
 
211
- def chat(self, keywords: str, model: str = "gpt-3.5") -> str:
212
  """Initiates a chat session with DuckDuckGo AI.
213
 
214
  Args:
215
  keywords (str): The initial message or question to send to the AI.
216
  model (str): The model to use: "gpt-3.5", "claude-3-haiku", "llama-3-70b", "mixtral-8x7b".
217
  Defaults to "gpt-3.5".
 
218
 
219
  Returns:
220
  str: The response from the AI.
@@ -237,18 +255,16 @@ class WEBS:
237
  "messages": self._chat_messages,
238
  }
239
  resp = self.client.post(
240
- "https://duckduckgo.com/duckchat/v1/chat", headers={"x-vqd-4": self._chat_vqd}, json=json_data
 
 
 
241
  )
242
  self._chat_vqd = resp.headers.get("x-vqd-4", "")
243
 
244
- messages = []
245
- for line in resp.text.replace("data: ", "").replace("[DONE]", "").split("\n\n"):
246
- x = line.strip()
247
- if x:
248
- j = json_loads(x)
249
- message = j.get("message", "")
250
- messages.append(message)
251
- result = "".join(messages)
252
  self._chat_messages.append({"role": "assistant", "content": result})
253
  return result
254
 
@@ -435,7 +451,7 @@ class WEBS:
435
  for e in elements:
436
  if isinstance(e, _Element):
437
  hrefxpath = e.xpath("./a/@href")
438
- href = str(hrefxpath[0]) if isinstance(hrefxpath, List) else None
439
  if (
440
  href
441
  and href not in cache
@@ -445,9 +461,9 @@ class WEBS:
445
  ):
446
  cache.add(href)
447
  titlexpath = e.xpath("./h2/a/text()")
448
- title = str(titlexpath[0]) if isinstance(titlexpath, List) else ""
449
  bodyxpath = e.xpath("./a//text()")
450
- body = "".join(str(x) for x in bodyxpath) if isinstance(bodyxpath, List) else ""
451
  result = {
452
  "title": _normalize(title),
453
  "href": _normalize_url(href),
@@ -537,10 +553,14 @@ class WEBS:
537
  else:
538
  cache.add(href)
539
  titlexpath = e.xpath(".//a//text()")
540
- title = str(titlexpath[0]) if isinstance(titlexpath, List) else ""
541
  elif i == 2:
542
  bodyxpath = e.xpath(".//td[@class='result-snippet']//text()")
543
- body = "".join(str(x) for x in bodyxpath) if isinstance(bodyxpath, List) else ""
 
 
 
 
544
  if href:
545
  result = {
546
  "title": _normalize(title),
 
5
  from decimal import Decimal
6
  from functools import cached_property
7
  from itertools import cycle, islice
8
+ from random import choice
9
  from threading import Event
10
  from types import TracebackType
11
  from typing import Dict, List, Optional, Tuple, Type, Union, cast
12
 
13
+ import pyreqwest_impersonate as pri
14
+
15
+
16
 
17
  try:
18
  from lxml.etree import _Element
 
29
  from math import atan2, cos, radians, sin, sqrt
30
  from typing import Any, Dict, List, Union
31
  from urllib.parse import unquote
 
 
 
 
 
 
 
 
32
 
33
+ from .exceptions import WebscoutE
34
+
35
+ try:
36
+ HAS_ORJSON = True
37
+ import orjson
38
+ except ImportError:
39
+ HAS_ORJSON = False
40
+ import json
41
 
42
  REGEX_STRIP_TAGS = re.compile("<.*?>")
43
 
44
 
45
  def json_dumps(obj: Any) -> str:
46
  try:
47
+ return (
48
+ orjson.dumps(obj, option=orjson.OPT_INDENT_2).decode()
49
+ if HAS_ORJSON
50
+ else json.dumps(obj, ensure_ascii=False, indent=2)
51
+ )
52
  except Exception as ex:
53
  raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
54
 
55
 
56
  def json_loads(obj: Union[str, bytes]) -> Any:
57
  try:
58
+ return orjson.loads(obj) if HAS_ORJSON else json.loads(obj)
59
  except Exception as ex:
60
  raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
61
 
 
108
  c = 2 * atan2(sqrt(a), sqrt(1 - a))
109
  return R * c
110
 
 
 
111
  class WebscoutE(Exception):
112
  """Base exception class for search."""
113
 
 
126
  """None of the providers generated response successfully"""
127
 
128
  pass
129
+ logger = logging.getLogger("webscout.WEBS")
130
+
131
+
132
  class WEBS:
133
  """webscout class to get search results from duckduckgo.com."""
134
 
135
  _executor: ThreadPoolExecutor = ThreadPoolExecutor()
136
+ _impersonates = (
137
+ "chrome_99", "chrome_100", "chrome_101", "chrome_104", "chrome_105", "chrome_106", "chrome_108",
138
+ "chrome_107", "chrome_109", "chrome_114", "chrome_116", "chrome_117", "chrome_118", "chrome_119",
139
+ "chrome_120", #"chrome_123", "chrome_124", "chrome_126",
140
+ "safari_ios_16.5", "safari_ios_17.2", "safari_ios_17.4.1", "safari_15.3", "safari_15.5",
141
+ "safari_15.6.1", "safari_16", "safari_16.5", "safari_17.2.1", "safari_17.4.1", "safari_17.5",
142
+ #"okhttp_3.9", "okhttp_3.11", "okhttp_3.13", "okhttp_3.14", "okhttp_4.9", "okhttp_4.10", "okhttp_5",
143
+ "edge_99", "edge_101", "edge_122",
144
+ ) # fmt: skip
145
 
146
  def __init__(
147
  self,
148
+ headers: Optional[Dict[str, str]] = None,
149
  proxy: Optional[str] = None,
150
  proxies: Union[Dict[str, str], str, None] = None, # deprecated
151
  timeout: Optional[int] = 10,
 
169
  headers=self.headers,
170
  proxy=self.proxy,
171
  timeout=timeout,
172
+ cookie_store=True,
173
  referer=True,
174
+ impersonate=choice(self._impersonates),
175
  follow_redirects=False,
176
  verify=False,
177
  )
 
225
  resp_content = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords})
226
  return _extract_vqd(resp_content, keywords)
227
 
228
+ def chat(self, keywords: str, model: str = "gpt-3.5", timeout: int = 20) -> str:
229
  """Initiates a chat session with DuckDuckGo AI.
230
 
231
  Args:
232
  keywords (str): The initial message or question to send to the AI.
233
  model (str): The model to use: "gpt-3.5", "claude-3-haiku", "llama-3-70b", "mixtral-8x7b".
234
  Defaults to "gpt-3.5".
235
+ timeout (int): Timeout value for the HTTP client. Defaults to 20.
236
 
237
  Returns:
238
  str: The response from the AI.
 
255
  "messages": self._chat_messages,
256
  }
257
  resp = self.client.post(
258
+ "https://duckduckgo.com/duckchat/v1/chat",
259
+ headers={"x-vqd-4": self._chat_vqd},
260
+ json=json_data,
261
+ timeout=timeout,
262
  )
263
  self._chat_vqd = resp.headers.get("x-vqd-4", "")
264
 
265
+ data = ",".join(x for line in resp.text.rstrip("[DONE]\n").split("data:") if (x := line.strip()))
266
+ result = "".join(x.get("message", "") for x in json_loads("[" + data + "]"))
267
+
 
 
 
 
 
268
  self._chat_messages.append({"role": "assistant", "content": result})
269
  return result
270
 
 
451
  for e in elements:
452
  if isinstance(e, _Element):
453
  hrefxpath = e.xpath("./a/@href")
454
+ href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, List) else None
455
  if (
456
  href
457
  and href not in cache
 
461
  ):
462
  cache.add(href)
463
  titlexpath = e.xpath("./h2/a/text()")
464
+ title = str(titlexpath[0]) if titlexpath and isinstance(titlexpath, List) else ""
465
  bodyxpath = e.xpath("./a//text()")
466
+ body = "".join(str(x) for x in bodyxpath) if bodyxpath and isinstance(bodyxpath, List) else ""
467
  result = {
468
  "title": _normalize(title),
469
  "href": _normalize_url(href),
 
553
  else:
554
  cache.add(href)
555
  titlexpath = e.xpath(".//a//text()")
556
+ title = str(titlexpath[0]) if titlexpath and isinstance(titlexpath, List) else ""
557
  elif i == 2:
558
  bodyxpath = e.xpath(".//td[@class='result-snippet']//text()")
559
+ body = (
560
+ "".join(str(x) for x in bodyxpath).strip()
561
+ if bodyxpath and isinstance(bodyxpath, List)
562
+ else ""
563
+ )
564
  if href:
565
  result = {
566
  "title": _normalize(title),