Spaces:

lintasmediadanawa
/

web_scrape

Sleeping

App Files Community

jonathanjordan21 committed on Sep 6, 2024

Commit

2e4e6d4

verified ·

1 Parent(s): 1b686c4

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -28

app.py CHANGED Viewed

@@ -28,40 +28,82 @@ async def linkedin_post_details(post_id: str):
     url = "https://www.linkedin.com/posts/"+post_id
     res = requests.get(url, headers={"user-agent":"Googlebot", "accept-language": "en-US"})
-    text_maker = html2text.HTML2Text()
-    text_maker.ignore_links = True
-    text_maker.ignore_images = True
-    text_maker.bypass_tables = False
-    docs = text_maker.handle(res.content.decode("utf-8"))
-    chunks = docs.split("\n\n#")
-    linkedin_content = chunks[1]
-    user = linkedin_content.split("\n\n", 5)
-    full_name = user[1]
-    bio = user[2]
-    try:
-        date, edited = user[3].split("  ")
-        edited = True
-    except:
-        date = user[3].strip()
-        edited = False
-    content = "\n\n".join(user[5:])
-    insights = chunks[3].split("\n\n")[2]
-    likes = insights.split(" ", 1)[0].strip()
-    comments = insights.rsplit(" ", 2)[1].strip()
-    username = url.rsplit("/",1)[-1].split("_")[0]
     return {
-        "userDetails": {"full_name": full_name, "username":username,"bio": bio},
-        "content": content,
         "date": date,
-        "is_edited": edited,
-        "insights": {"likeCount": likes, "commentCount": comments, "shareCount": None, "viewCount":None},
-        "username":username
     }
 @app.get("/facebook_post_detail")

     url = "https://www.linkedin.com/posts/"+post_id
     res = requests.get(url, headers={"user-agent":"Googlebot", "accept-language": "en-US"})
+    soup = BeautifulSoup(res.content, "html.parser")
+    script_tags = soup.find_all("script")
+    for script_tag in script_tags:
+        try:
+            script_tag = json.loads(script_tag.string)
+            if script_tag.get("articleBody"):
+                desc = script_tag.get("articleBody")
+                author = script_tag.get("author")
+                full_name = author.get("name")
+                username = author.get("url").rsplit("/", 1)[-1]
+                user_type = author.get("@type").lower()
+                date = script_tag.get("datePublished")
+        except Exception as e:
+            continue
+    spans = soup.find_all("span", {"data-test-id": "social-actions__reaction-count"})
+    reactions = spans[0].text.strip()
+    shares = spans[-1].text.strip()
+    comments = soup.find("a", {"data-test-id": "social-actions__comments"}).get(
+        "data-num-comments"
+    )
     return {
+        "insights": {
+            "likeCount": None,
+            "commentCount": comments,
+            "shareCount": shares,
+            "reactionCount": reactions,
+            "reactions": [],
+        },
+        "description": desc,
+        "username": username,
+        "name": full_name,
+        "userType": user_type,
         "date": date,
     }
+# async def linkedin_post_details(post_id: str):
+    # url = "https://www.linkedin.com/posts/"+post_id
+    # res = requests.get(url, headers={"user-agent":"Googlebot", "accept-language": "en-US"})
+    # text_maker = html2text.HTML2Text()
+    # text_maker.ignore_links = True
+    # text_maker.ignore_images = True
+    # text_maker.bypass_tables = False
+    # docs = text_maker.handle(res.content.decode("utf-8"))
+    # chunks = docs.split("\n\n#")
+    # linkedin_content = chunks[1]
+    # user = linkedin_content.split("\n\n", 5)
+    # full_name = user[1]
+    # bio = user[2]
+    # try:
+    #     date, edited = user[3].split("  ")
+    #     edited = True
+    # except:
+    #     date = user[3].strip()
+    #     edited = False
+    # content = "\n\n".join(user[5:])
+    # insights = chunks[3].split("\n\n")[2]
+    # likes = insights.split(" ", 1)[0].strip()
+    # comments = insights.rsplit(" ", 2)[1].strip()
+    # username = url.rsplit("/",1)[-1].split("_")[0]
+    # return {
+    #     "userDetails": {"full_name": full_name, "username":username,"bio": bio},
+    #     "content": content,
+    #     "date": date,
+    #     "is_edited": edited,
+    #     "insights": {"likeCount": likes, "commentCount": comments, "shareCount": None, "viewCount":None},
+    #     "username":username
+    # }
 @app.get("/facebook_post_detail")