Update webscout.py
Browse files- webscout.py +51 -31
webscout.py
CHANGED
@@ -5,11 +5,14 @@ from datetime import datetime, timezone
|
|
5 |
from decimal import Decimal
|
6 |
from functools import cached_property
|
7 |
from itertools import cycle, islice
|
|
|
8 |
from threading import Event
|
9 |
from types import TracebackType
|
10 |
from typing import Dict, List, Optional, Tuple, Type, Union, cast
|
11 |
|
12 |
-
import pyreqwest_impersonate as pri
|
|
|
|
|
13 |
|
14 |
try:
|
15 |
from lxml.etree import _Element
|
@@ -26,29 +29,33 @@ from html import unescape
|
|
26 |
from math import atan2, cos, radians, sin, sqrt
|
27 |
from typing import Any, Dict, List, Union
|
28 |
from urllib.parse import unquote
|
29 |
-
import orjson
|
30 |
-
import requests
|
31 |
-
import base64
|
32 |
-
from typing import List, Dict, Union
|
33 |
-
import json
|
34 |
-
import requests
|
35 |
-
import base64
|
36 |
-
from typing import List, Dict, Union
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
REGEX_STRIP_TAGS = re.compile("<.*?>")
|
40 |
|
41 |
|
42 |
def json_dumps(obj: Any) -> str:
|
43 |
try:
|
44 |
-
return
|
|
|
|
|
|
|
|
|
45 |
except Exception as ex:
|
46 |
raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
|
47 |
|
48 |
|
49 |
def json_loads(obj: Union[str, bytes]) -> Any:
|
50 |
try:
|
51 |
-
return orjson.loads(obj)
|
52 |
except Exception as ex:
|
53 |
raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
|
54 |
|
@@ -101,8 +108,6 @@ def _calculate_distance(lat1: Decimal, lon1: Decimal, lat2: Decimal, lon2: Decim
|
|
101 |
c = 2 * atan2(sqrt(a), sqrt(1 - a))
|
102 |
return R * c
|
103 |
|
104 |
-
logger = logging.getLogger("webscout.WEBS")
|
105 |
-
|
106 |
class WebscoutE(Exception):
|
107 |
"""Base exception class for search."""
|
108 |
|
@@ -121,14 +126,26 @@ class AllProvidersFailure(Exception):
|
|
121 |
"""None of the providers generated response successfully"""
|
122 |
|
123 |
pass
|
|
|
|
|
|
|
124 |
class WEBS:
|
125 |
"""webscout class to get search results from duckduckgo.com."""
|
126 |
|
127 |
_executor: ThreadPoolExecutor = ThreadPoolExecutor()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
def __init__(
|
130 |
self,
|
131 |
-
headers: Optional[Dict[str, str]] =
|
132 |
proxy: Optional[str] = None,
|
133 |
proxies: Union[Dict[str, str], str, None] = None, # deprecated
|
134 |
timeout: Optional[int] = 10,
|
@@ -152,9 +169,9 @@ class WEBS:
|
|
152 |
headers=self.headers,
|
153 |
proxy=self.proxy,
|
154 |
timeout=timeout,
|
155 |
-
cookie_store=
|
156 |
referer=True,
|
157 |
-
impersonate=
|
158 |
follow_redirects=False,
|
159 |
verify=False,
|
160 |
)
|
@@ -208,13 +225,14 @@ class WEBS:
|
|
208 |
resp_content = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords})
|
209 |
return _extract_vqd(resp_content, keywords)
|
210 |
|
211 |
-
def chat(self, keywords: str, model: str = "gpt-3.5") -> str:
|
212 |
"""Initiates a chat session with DuckDuckGo AI.
|
213 |
|
214 |
Args:
|
215 |
keywords (str): The initial message or question to send to the AI.
|
216 |
model (str): The model to use: "gpt-3.5", "claude-3-haiku", "llama-3-70b", "mixtral-8x7b".
|
217 |
Defaults to "gpt-3.5".
|
|
|
218 |
|
219 |
Returns:
|
220 |
str: The response from the AI.
|
@@ -237,18 +255,16 @@ class WEBS:
|
|
237 |
"messages": self._chat_messages,
|
238 |
}
|
239 |
resp = self.client.post(
|
240 |
-
"https://duckduckgo.com/duckchat/v1/chat",
|
|
|
|
|
|
|
241 |
)
|
242 |
self._chat_vqd = resp.headers.get("x-vqd-4", "")
|
243 |
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
if x:
|
248 |
-
j = json_loads(x)
|
249 |
-
message = j.get("message", "")
|
250 |
-
messages.append(message)
|
251 |
-
result = "".join(messages)
|
252 |
self._chat_messages.append({"role": "assistant", "content": result})
|
253 |
return result
|
254 |
|
@@ -435,7 +451,7 @@ class WEBS:
|
|
435 |
for e in elements:
|
436 |
if isinstance(e, _Element):
|
437 |
hrefxpath = e.xpath("./a/@href")
|
438 |
-
href = str(hrefxpath[0]) if isinstance(hrefxpath, List) else None
|
439 |
if (
|
440 |
href
|
441 |
and href not in cache
|
@@ -445,9 +461,9 @@ class WEBS:
|
|
445 |
):
|
446 |
cache.add(href)
|
447 |
titlexpath = e.xpath("./h2/a/text()")
|
448 |
-
title = str(titlexpath[0]) if isinstance(titlexpath, List) else ""
|
449 |
bodyxpath = e.xpath("./a//text()")
|
450 |
-
body = "".join(str(x) for x in bodyxpath) if isinstance(bodyxpath, List) else ""
|
451 |
result = {
|
452 |
"title": _normalize(title),
|
453 |
"href": _normalize_url(href),
|
@@ -537,10 +553,14 @@ class WEBS:
|
|
537 |
else:
|
538 |
cache.add(href)
|
539 |
titlexpath = e.xpath(".//a//text()")
|
540 |
-
title = str(titlexpath[0]) if isinstance(titlexpath, List) else ""
|
541 |
elif i == 2:
|
542 |
bodyxpath = e.xpath(".//td[@class='result-snippet']//text()")
|
543 |
-
body =
|
|
|
|
|
|
|
|
|
544 |
if href:
|
545 |
result = {
|
546 |
"title": _normalize(title),
|
|
|
5 |
from decimal import Decimal
|
6 |
from functools import cached_property
|
7 |
from itertools import cycle, islice
|
8 |
+
from random import choice
|
9 |
from threading import Event
|
10 |
from types import TracebackType
|
11 |
from typing import Dict, List, Optional, Tuple, Type, Union, cast
|
12 |
|
13 |
+
import pyreqwest_impersonate as pri
|
14 |
+
|
15 |
+
|
16 |
|
17 |
try:
|
18 |
from lxml.etree import _Element
|
|
|
29 |
from math import atan2, cos, radians, sin, sqrt
|
30 |
from typing import Any, Dict, List, Union
|
31 |
from urllib.parse import unquote
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
+
from .exceptions import WebscoutE
|
34 |
+
|
35 |
+
try:
|
36 |
+
HAS_ORJSON = True
|
37 |
+
import orjson
|
38 |
+
except ImportError:
|
39 |
+
HAS_ORJSON = False
|
40 |
+
import json
|
41 |
|
42 |
REGEX_STRIP_TAGS = re.compile("<.*?>")
|
43 |
|
44 |
|
45 |
def json_dumps(obj: Any) -> str:
    """Serialize ``obj`` to a JSON string indented by two spaces.

    The ``orjson`` backend is used when the module-level ``HAS_ORJSON`` flag
    is true; otherwise the standard-library ``json`` module is used.

    Args:
        obj: The object to serialize.

    Returns:
        The JSON text.

    Raises:
        WebscoutE: If serialization raises any exception; the original
            exception is chained as the cause.
    """
    try:
        if HAS_ORJSON:
            # The orjson result is decoded so both branches return ``str``.
            return orjson.dumps(obj, option=orjson.OPT_INDENT_2).decode()
        return json.dumps(obj, ensure_ascii=False, indent=2)
    except Exception as ex:
        raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
|
54 |
|
55 |
|
56 |
def json_loads(obj: Union[str, bytes]) -> Any:
    """Parse a JSON document given as ``str`` or ``bytes``.

    The ``orjson`` backend is used when the module-level ``HAS_ORJSON`` flag
    is true; otherwise the standard-library ``json`` module is used.

    Args:
        obj: The JSON text to parse.

    Returns:
        The decoded Python object.

    Raises:
        WebscoutE: If parsing raises any exception; the original exception
            is chained as the cause.
    """
    try:
        if HAS_ORJSON:
            return orjson.loads(obj)
        return json.loads(obj)
    except Exception as ex:
        raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
|
61 |
|
|
|
108 |
c = 2 * atan2(sqrt(a), sqrt(1 - a))
|
109 |
return R * c
|
110 |
|
|
|
|
|
111 |
class WebscoutE(Exception):
|
112 |
"""Base exception class for search."""
|
113 |
|
|
|
126 |
"""None of the providers generated response successfully"""
|
127 |
|
128 |
pass
|
129 |
+
logger = logging.getLogger("webscout.WEBS")
|
130 |
+
|
131 |
+
|
132 |
class WEBS:
|
133 |
"""webscout class to get search results from duckduckgo.com."""
|
134 |
|
135 |
_executor: ThreadPoolExecutor = ThreadPoolExecutor()
|
136 |
+
_impersonates = (
|
137 |
+
"chrome_99", "chrome_100", "chrome_101", "chrome_104", "chrome_105", "chrome_106", "chrome_108",
|
138 |
+
"chrome_107", "chrome_109", "chrome_114", "chrome_116", "chrome_117", "chrome_118", "chrome_119",
|
139 |
+
"chrome_120", #"chrome_123", "chrome_124", "chrome_126",
|
140 |
+
"safari_ios_16.5", "safari_ios_17.2", "safari_ios_17.4.1", "safari_15.3", "safari_15.5",
|
141 |
+
"safari_15.6.1", "safari_16", "safari_16.5", "safari_17.2.1", "safari_17.4.1", "safari_17.5",
|
142 |
+
#"okhttp_3.9", "okhttp_3.11", "okhttp_3.13", "okhttp_3.14", "okhttp_4.9", "okhttp_4.10", "okhttp_5",
|
143 |
+
"edge_99", "edge_101", "edge_122",
|
144 |
+
) # fmt: skip
|
145 |
|
146 |
def __init__(
|
147 |
self,
|
148 |
+
headers: Optional[Dict[str, str]] = None,
|
149 |
proxy: Optional[str] = None,
|
150 |
proxies: Union[Dict[str, str], str, None] = None, # deprecated
|
151 |
timeout: Optional[int] = 10,
|
|
|
169 |
headers=self.headers,
|
170 |
proxy=self.proxy,
|
171 |
timeout=timeout,
|
172 |
+
cookie_store=True,
|
173 |
referer=True,
|
174 |
+
impersonate=choice(self._impersonates),
|
175 |
follow_redirects=False,
|
176 |
verify=False,
|
177 |
)
|
|
|
225 |
resp_content = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords})
|
226 |
return _extract_vqd(resp_content, keywords)
|
227 |
|
228 |
+
def chat(self, keywords: str, model: str = "gpt-3.5", timeout: int = 20) -> str:
|
229 |
"""Initiates a chat session with DuckDuckGo AI.
|
230 |
|
231 |
Args:
|
232 |
keywords (str): The initial message or question to send to the AI.
|
233 |
model (str): The model to use: "gpt-3.5", "claude-3-haiku", "llama-3-70b", "mixtral-8x7b".
|
234 |
Defaults to "gpt-3.5".
|
235 |
+
timeout (int): Timeout value for the HTTP client. Defaults to 20.
|
236 |
|
237 |
Returns:
|
238 |
str: The response from the AI.
|
|
|
255 |
"messages": self._chat_messages,
|
256 |
}
|
257 |
resp = self.client.post(
|
258 |
+
"https://duckduckgo.com/duckchat/v1/chat",
|
259 |
+
headers={"x-vqd-4": self._chat_vqd},
|
260 |
+
json=json_data,
|
261 |
+
timeout=timeout,
|
262 |
)
|
263 |
self._chat_vqd = resp.headers.get("x-vqd-4", "")
|
264 |
|
265 |
+
data = ",".join(x for line in resp.text.rstrip("[DONE]\n").split("data:") if (x := line.strip()))
|
266 |
+
result = "".join(x.get("message", "") for x in json_loads("[" + data + "]"))
|
267 |
+
|
|
|
|
|
|
|
|
|
|
|
268 |
self._chat_messages.append({"role": "assistant", "content": result})
|
269 |
return result
|
270 |
|
|
|
451 |
for e in elements:
|
452 |
if isinstance(e, _Element):
|
453 |
hrefxpath = e.xpath("./a/@href")
|
454 |
+
href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, List) else None
|
455 |
if (
|
456 |
href
|
457 |
and href not in cache
|
|
|
461 |
):
|
462 |
cache.add(href)
|
463 |
titlexpath = e.xpath("./h2/a/text()")
|
464 |
+
title = str(titlexpath[0]) if titlexpath and isinstance(titlexpath, List) else ""
|
465 |
bodyxpath = e.xpath("./a//text()")
|
466 |
+
body = "".join(str(x) for x in bodyxpath) if bodyxpath and isinstance(bodyxpath, List) else ""
|
467 |
result = {
|
468 |
"title": _normalize(title),
|
469 |
"href": _normalize_url(href),
|
|
|
553 |
else:
|
554 |
cache.add(href)
|
555 |
titlexpath = e.xpath(".//a//text()")
|
556 |
+
title = str(titlexpath[0]) if titlexpath and isinstance(titlexpath, List) else ""
|
557 |
elif i == 2:
|
558 |
bodyxpath = e.xpath(".//td[@class='result-snippet']//text()")
|
559 |
+
body = (
|
560 |
+
"".join(str(x) for x in bodyxpath).strip()
|
561 |
+
if bodyxpath and isinstance(bodyxpath, List)
|
562 |
+
else ""
|
563 |
+
)
|
564 |
if href:
|
565 |
result = {
|
566 |
"title": _normalize(title),
|