Spaces:

lintasmediadanawa
/

web_scrape

Sleeping

App Files Community

jonathanjordan21 committed on Sep 5, 2024

Commit

385e9be

verified ·

1 Parent(s): 1ecee5c

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -41

app.py CHANGED Viewed

@@ -55,7 +55,7 @@ async def linkedin_post_details(post_id: str):
         "content": content,
         "date": date,
         "is_edited": edited,
-        "insights": {"likeCount": likes, "commentCount": comments, "shareCount": None},
     }
@@ -84,51 +84,56 @@ async def google_search(q: str, delimiter: str = "\n---\n", sites: Annotated[lis
 @app.get("/tiktok_video_details")
-async def read_item(username: str, video_id:str):
-    user_agent = "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36"
-    # user_agent = "Googlebot/2.1"
-    # if "https:" in link_detail:
-    #     url = link_detail
-    # elif link_detail[0] == "/":
-    #     url = "https://tiktok.com" + link_detail
-    # else:
-    #     url = "https://tiktok.com/"+link_detail
-    url = f"https://www.tiktok.com/@{username}/video/{video_id}"
-    with httpx.Client() as client:
-        res = client.get(url, headers={"User-Agent":user_agent})
-    # res = requests.get(url, headers={"user-agent":user_agent})
-    text_maker = html2text.HTML2Text()
-    text_maker.ignore_links = True
-    text_maker.ignore_images = True
-    text_maker.bypass_tables = False
-    print("RESPONSE DETAIlL", res.content.decode("utf-8"))
-    docs = text_maker.handle(res.content.decode("utf-8"))
-    print("DOCS", docs)
-    content_detail = docs.split("###")[5]
-    likes, comments, bookmarks, shares = re.findall(r'\*\*([\w.]+)\*\*', content_detail)
-    profile = [x.strip() for x in content_detail.split("\n\nSpeed\n\n", 1)[1].split("\n", 6) if x.strip()]
-    username = profile[0]
-    date = profile[1].rsplit(" · ", 1)[-1]
-    desc = profile[-1].replace("**", "")
-    return {
-        "insights":{
-            "likeCount":likes,
-            "commentCount":comments,
-            "bookmarkCount":bookmarks,
-            "shareCount":shares
-        },
-        "username":username,
-        "date":date,
-        "description":desc
-    }

         "content": content,
         "date": date,
         "is_edited": edited,
+        "insights": {"likeCount": likes, "commentCount": comments, "shareCount": None, "viewCount":None},
     }
 @app.get("/tiktok_video_details")
+async def tiktok_video_details(username: str, video_id:str):
+    url = f"https://www.tiktok.com/{username}/video/{video_id}"
+    user_agent = "LinkedInBot/1.0 (compatible; Mozilla/5.0; Jakarta Commons-HttpClient/3.1 +http://www.linkedin.com)/1.0 (LinkedInBot; https://www.linkedin.com/; [email protected])"
+    res = requests.get(url, headers={"user-agent": user_agent})
+    soup = BeautifulSoup(res.content, "html.parser")
+    insights = soup.find("meta", {"property": "og:description"}).get("content")
+    likes = insights.split(" ", 1)[0]
+    desc = insights.rsplit(" comments. “", 1)[-1][:-1]
+    comments = insights.split(", ", 1)[-1].split(" ", 1)[0]
+    name = soup.find("meta", {"property": "og:title"}).get("content")[9:]
+    content = {
+        "insights": {"likeCount": likes, "commentCount": comments, "shareCount":None, "viewCount":None},
+        "description": desc,
+        "username": username,
+        "name": name,
+    }
+    # text_maker = html2text.HTML2Text()
+    # text_maker.ignore_links = True
+    # text_maker.ignore_images = True
+    # text_maker.bypass_tables = False
+    # print("RESPONSE DETAIlL", res.content.decode("utf-8"))
+    # docs = text_maker.handle(res.content.decode("utf-8"))
+    # print("DOCS", docs)
+    # content_detail = docs.split("###")[5]
+    # likes, comments, bookmarks, shares = re.findall(r'\*\*([\w.]+)\*\*', content_detail)
+    # profile = [x.strip() for x in content_detail.split("\n\nSpeed\n\n", 1)[1].split("\n", 6) if x.strip()]
+    # username = profile[0]
+    # date = profile[1].rsplit(" · ", 1)[-1]
+    # desc = profile[-1].replace("**", "")
+    # return {
+    #     "insights":{
+    #         "likeCount":likes,
+    #         "commentCount":comments,
+    #         "bookmarkCount":bookmarks,
+    #         "shareCount":shares
+    #     },
+    #     "username":username,
+    #     "date":date,
+    #     "description":desc
+    # }