Spaces:

lintasmediadanawa
/

web_scrape

Sleeping

App Files Files Community

jonathanjordan21 committed on Sep 6, 2024

Commit

5d14fc1

verified ·

1 Parent(s): d76a753

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -1

app.py CHANGED Viewed

@@ -246,7 +246,7 @@ async def fb_post_detail(username: Optional[str] = None, post_id: Optional[str]
 @app.get("/google_search")
-async def google_search(q: Optional[str] = None, delimiter: str = "\n---\n", sites: Annotated[list[str] | None, Query()] = None):
     print(sites)
     print(type(sites))
     url = f"https://www.google.com/search?q={q} "
@@ -269,6 +269,43 @@ async def google_search(q: Optional[str] = None, delimiter: str = "\n---\n", sit
     return {"results":texts}
 @app.get("/tiktok_video_details")
 async def tiktok_video_details(username: Optional[str] = None, video_id:Optional[str] = None, url: Optional[str] = None):
     if not url:

 @app.get("/google_search")
+async def google_search(q: str, delimiter: str = "\n---\n", sites: Annotated[list[str] | None, Query()] = None):
     print(sites)
     print(type(sites))
     url = f"https://www.google.com/search?q={q} "
     return {"results":texts}
+@app.get("/google_search_urls")
+async def google_search_url(q: str, sites: Annotated[list[str] | None, Query()] = None):
+    url = f"https://www.google.com/search?q={q} "
+    if sites:
+        url += " OR ".join(["site:"+site for site in sites])
+    res = requests.get(
+        url,
+        headers={
+            "user-agent": "Googlebot",
+            "accept-language": "en-US"
+        },
+        timeout=(10, 27),
+    )
+    soup = BeautifulSoup(res.content, "html.parser")
+    prefix = "/url?q=h"
+    len_prefix = len(prefix)
+    docs = []
+    for div in soup.find_all(True):
+        if len(div.find_parents()) == 2:  # Depth 4 means 3 parent divs (0-indexed)
+            a_tags = div.find_all("a")
+            for a in a_tags:
+                doc = a.get("href")
+                if (
+                    doc[:len_prefix] == prefix
+                    and "google.com" not in doc[len_prefix - 1 :]
+                ):
+                    docs.append(
+                        doc[len_prefix - 1 :]
+                        .split("&")[0]
+                        .replace("%3F", "?")
+                        .replace("%3D", "=")
+                    )
+    return {"results":docs}
 @app.get("/tiktok_video_details")
 async def tiktok_video_details(username: Optional[str] = None, video_id:Optional[str] = None, url: Optional[str] = None):
     if not url: