Abhaykoul committed on
Commit
1b2ef6f
·
verified ·
1 Parent(s): e218904

Upload utils.py

Browse files
Files changed (1) hide show
  1. webscout/utils.py +74 -0
webscout/utils.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from decimal import Decimal
3
+ from html import unescape
4
+ from math import atan2, cos, radians, sin, sqrt
5
+ from typing import Any, Dict, List, Union
6
+ from urllib.parse import unquote
7
+ import orjson
8
+
9
+ from .exceptions import WebscoutE
10
+
11
# Non-greedy pattern matching a "<", any run of non-newline characters, and
# the next ">". Used by _normalize() to remove tag-like spans from text.
REGEX_STRIP_TAGS = re.compile("<.*?>")
12
+
13
+
14
def json_dumps(obj: Any) -> str:
    """Serialize ``obj`` to a JSON string via orjson.

    The bytes produced by ``orjson.dumps`` are decoded as UTF-8.

    Raises:
        WebscoutE: On any failure while encoding or decoding; the original
            exception is chained as the cause.
    """
    try:
        encoded = orjson.dumps(obj)
        return encoded.decode("utf-8")
    except Exception as err:
        message = f"{type(err).__name__}: {err}"
        raise WebscoutE(message) from err
19
+
20
+
21
def json_loads(obj: Union[str, bytes]) -> Any:
    """Parse a JSON document (text or bytes) via orjson.

    Raises:
        WebscoutE: On any failure while parsing; the original exception is
            chained as the cause.
    """
    try:
        parsed = orjson.loads(obj)
    except Exception as err:
        message = f"{type(err).__name__}: {err}"
        raise WebscoutE(message) from err
    return parsed
26
+
27
+
28
+ def _extract_vqd(html_bytes: bytes, keywords: str) -> str:
29
+ """Extract vqd from html bytes."""
30
+ for c1, c1_len, c2 in (
31
+ (b'vqd="', 5, b'"'),
32
+ (b"vqd=", 4, b"&"),
33
+ (b"vqd='", 5, b"'"),
34
+ ):
35
+ try:
36
+ start = html_bytes.index(c1) + c1_len
37
+ end = html_bytes.index(c2, start)
38
+ return html_bytes[start:end].decode()
39
+ except ValueError:
40
+ pass
41
+ raise WebscoutE(f"_extract_vqd() {keywords=} Could not extract vqd.")
42
+
43
+
44
def _text_extract_json(html_bytes: bytes, keywords: str) -> List[Dict[str, str]]:
    """text(backend="api") -> extract json from html.

    Takes the bytes located between ``DDG.pageLayout.load('d',`` and the
    following ``);DDG.duckbar.load(`` and parses them with ``json_loads``.

    Args:
        html_bytes: Raw content to search.
        keywords: Only used to build the error message.

    Returns:
        The parsed JSON payload.

    Raises:
        WebscoutE: If either marker is missing or the payload cannot be
            parsed; the original exception is chained as the cause.
    """
    start_marker = b"DDG.pageLayout.load('d',"
    end_marker = b");DDG.duckbar.load("
    try:
        # Payload begins right after the start marker (previously a literal 24).
        start = html_bytes.index(start_marker) + len(start_marker)
        end = html_bytes.index(end_marker, start)
        result: List[Dict[str, str]] = json_loads(html_bytes[start:end])
        return result
    except Exception as ex:
        raise WebscoutE(f"_text_extract_json() {keywords=} {type(ex).__name__}: {ex}") from ex
55
+
56
+
57
def _normalize(raw_html: str) -> str:
    """Remove tag-like spans from raw_html and unescape HTML entities.

    Returns an empty string when raw_html is falsy.
    """
    if not raw_html:
        return ""
    without_tags = REGEX_STRIP_TAGS.sub("", raw_html)
    return unescape(without_tags)
60
+
61
+
62
+ def _normalize_url(url: str) -> str:
63
+ """Unquote URL and replace spaces with '+'."""
64
+ return unquote(url.replace(" ", "+")) if url else ""
65
+
66
+
67
+ def _calculate_distance(lat1: Decimal, lon1: Decimal, lat2: Decimal, lon2: Decimal) -> float:
68
+ """Calculate distance between two points in km. Haversine formula."""
69
+ R = 6371.0087714 # Earth's radius in km
70
+ rlat1, rlon1, rlat2, rlon2 = map(radians, [float(lat1), float(lon1), float(lat2), float(lon2)])
71
+ dlon, dlat = rlon2 - rlon1, rlat2 - rlat1
72
+ a = sin(dlat / 2) ** 2 + cos(rlat1) * cos(rlat2) * sin(dlon / 2) ** 2
73
+ c = 2 * atan2(sqrt(a), sqrt(1 - a))
74
+ return R * c