Abhaykoul committed on
Commit 0b5365b
Parent: 1c60654

Update webscout.py

Files changed (1)
  1. webscout.py +171 -0
webscout.py CHANGED
@@ -1811,3 +1811,174 @@ def fastai(user, model="llama3-70b", system="Answer as concisely as possible."):
     return output


+from bs4 import BeautifulSoup
+import requests
+from typing import Dict, List, Optional, Union
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from urllib.parse import quote
+from termcolor import colored
+import time
+import random
+
+class GoogleS:
+    """
+    Class to perform Google searches and retrieve results.
+    """
+
+    def __init__(
+        self,
+        headers: Optional[Dict[str, str]] = None,
+        proxy: Optional[str] = None,
+        timeout: Optional[int] = 10,
+        max_workers: int = 20  # Increased max workers for the thread pool
+    ):
+        """Initializes the GoogleS object."""
+        self.proxy = proxy
+        self.headers = headers if headers else {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62"
+        }
+        self.headers["Referer"] = "https://www.google.com/"
+        self.client = requests.Session()
+        self.client.headers.update(self.headers)
+        self.client.proxies.update({"http": self.proxy, "https": self.proxy})
+        self.timeout = timeout
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.client.close()
+
+    def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
+                 data: Optional[Union[Dict[str, str], bytes]] = None) -> bytes:
+        """
+        Makes an HTTP request and returns the response content.
+        """
+        try:
+            resp = self.client.request(method, url, params=params, data=data, timeout=self.timeout)
+        except Exception as ex:
+            raise Exception(f"{url} {type(ex).__name__}: {ex}") from ex
+        if resp.status_code == 200:
+            return resp.content
+        raise Exception(f"{resp.url} returned status code {resp.status_code}. {params=} {data=}")
+
+    def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
+        """
+        Extracts visible text from HTML content using the lxml parser.
+        """
+        soup = BeautifulSoup(html_content, 'lxml')  # Use lxml parser
+        for tag in soup(["script", "style", "header", "footer", "nav"]):
+            tag.extract()
+        visible_text = soup.get_text(strip=True)
+        if max_characters:
+            visible_text = visible_text[:max_characters]
+        return visible_text
+
+    def search(
+        self,
+        query: str,
+        region: str = "us-en",
+        language: str = "en",
+        safe: str = "off",
+        time_period: Optional[str] = None,
+        max_results: int = 10,
+        extract_text: bool = False,
+        max_text_length: Optional[int] = 100,
+    ) -> List[Dict[str, Union[str, int]]]:
+        """
+        Performs a Google search and returns the results.
+
+        Args:
+            query (str): The search query.
+            region (str, optional): The region to search in (e.g., "us-en"). Defaults to "us-en".
+            language (str, optional): The language of the search results (e.g., "en"). Defaults to "en".
+            safe (str, optional): Safe search setting ("off", "active"). Defaults to "off".
+            time_period (Optional[str], optional): Time period filter (e.g., "h" for past hour, "d" for past day).
+                Defaults to None.
+            max_results (int, optional): The maximum number of results to retrieve. Defaults to 10.
+            extract_text (bool, optional): Whether to extract text from the linked web pages. Defaults to False.
+            max_text_length (Optional[int], optional): The maximum length of the extracted text (in characters).
+                Defaults to 100.
+
+        Returns:
+            List[Dict[str, Union[str, int]]]: A list of dictionaries, each representing a search result, containing:
+                - 'title': The title of the result.
+                - 'href': The URL of the result.
+                - 'abstract': The description snippet of the result.
+                - 'index': The index of the result in the list.
+                - 'type': The type of result (currently always "web").
+                - 'visible_text': The extracted text from the web page (if `extract_text` is True).
+        """
+        assert query, "Query cannot be empty."
+
+        results = []
+        futures = []
+        start = 0
+
+        # Queue one search request per page of 10 results until enough pages are scheduled
+        while start < max_results:
+            params = {
+                "q": query,
+                "num": 10,
+                "hl": language,
+                "start": start,
+                "safe": safe,
+                "gl": region,
+            }
+            if time_period:
+                params["tbs"] = f"qdr:{time_period}"
+
+            futures.append(self._executor.submit(self._get_url, "GET", "https://www.google.com/search", params=params))
+            start += 10
+
+        for future in as_completed(futures):
+            try:
+                resp_content = future.result()
+                soup = BeautifulSoup(resp_content, 'lxml')  # Use lxml parser
+                result_blocks = soup.find_all("div", class_="g")
+
+                if not result_blocks:
+                    continue
+
+                # Extract links and titles first
+                for result_block in result_blocks:
+                    link = result_block.find("a", href=True)
+                    title = result_block.find("h3")
+                    description_box = result_block.find(
+                        "div", {"style": "-webkit-line-clamp:2"}
+                    )
+
+                    if link and title and description_box:
+                        url = link["href"]
+                        results.append({
+                            "title": title.text,
+                            "href": url,
+                            "abstract": description_box.text,
+                            "index": len(results),
+                            "type": "web",
+                            "visible_text": ""  # Initialize visible_text as empty string
+                        })
+
+                        if len(results) >= max_results:
+                            break  # Stop if we have enough results
+            except Exception as e:
+                print(f"Error: {e}")
+
+        # Parallelize text extraction if needed, mapping each future back to its result
+        if extract_text:
+            with ThreadPoolExecutor(max_workers=self._executor._max_workers) as text_extractor:
+                extraction_futures = {
+                    text_extractor.submit(self._extract_text_from_webpage,
+                                          self._get_url("GET", result['href']),
+                                          max_characters=max_text_length): result
+                    for result in results
+                    if 'href' in result
+                }
+                for future in as_completed(extraction_futures):
+                    result = extraction_futures[future]
+                    try:
+                        result['visible_text'] = future.result()
+                    except Exception as e:
+                        print(f"Error extracting text: {e}")
+
+        return results
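
A minimal usage sketch of the GoogleS class added in this commit (not part of the diff; the import path assumes webscout.py is importable as the webscout module, and the query string and printed fields are illustrative only):

from webscout import GoogleS  # assumed import path

# Open the searcher as a context manager so the underlying requests.Session is closed on exit.
with GoogleS(timeout=10) as searcher:
    hits = searcher.search(
        "python web scraping",  # illustrative query
        max_results=5,
        extract_text=True,      # also fetch and trim visible text from each result page
        max_text_length=200,
    )
    for hit in hits:
        print(hit["title"], hit["href"])
        print(hit["abstract"])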