Spaces:

lintasmediadanawa
/

web_scrape

Sleeping

App Files Files Community

web_scrape / app.py

jonathanjordan21

Update app.py

a270de9 verified 8 months ago

raw

history blame

2.69 kB

	import asyncio
	import re
	from typing import Annotated

	import html2text
	import requests
	from bs4 import BeautifulSoup
	from fastapi import FastAPI, Header
	from fastapi.middleware.cors import CORSMiddleware


	# ASGI application object; the route decorators in this module register on it.
	app = FastAPI()

	# CORS policy: every origin, method and header is accepted, with credentials
	# enabled.
	# NOTE(review): wildcard origins combined with allow_credentials=True is a
	# very permissive policy — confirm it is intended for this service.
	_cors_options = {
	    "allow_origins": ["*"],
	    "allow_credentials": True,
	    "allow_methods": ["*"],
	    "allow_headers": ["*"],
	}
	app.add_middleware(CORSMiddleware, **_cors_options)


	@app.get("/google_search")
	async def google_search(q: str, sites: list):
	    """Scrape a Google results page and return its entries as Markdown links.

	    Args:
	        q: Search terms.
	        sites: Domains to restrict the search to. When non-empty, they are
	            OR-ed together as ``site:`` operators and appended to ``q``.

	    Returns:
	        ``{"results": text}`` where ``text`` contains one line per extracted
	        entry, each followed by a ``---`` separator line. Entries that have a
	        link are rendered as ``[title](target)``.
	    """
	    # NOTE(review): per the FastAPI docs a bare ``list`` parameter is read
	    # from the request body unless it is declared with ``Query()``; the
	    # signature is left untouched here so existing callers keep working.

	    # The site: operators belong to the search expression itself, so they are
	    # added to the query text instead of being sent as a separate parameter.
	    query = q
	    if sites:
	        query += " " + " OR ".join(f"site:{site}" for site in sites)

	    # requests is blocking, so it runs in a worker thread to keep the event
	    # loop free. ``params`` takes care of URL-encoding the query, and the
	    # timeout prevents a stalled connection from hanging the request forever.
	    response = await asyncio.to_thread(
	        requests.get,
	        "https://www.google.com/search",
	        params={"q": query},
	        timeout=10,
	    )
	    soup = BeautifulSoup(response.content, "html.parser")

	    entries = []
	    # The first 24 divs of the page are skipped.
	    for div in soup.find_all("div")[24:]:
	        # Only divs with exactly eight ancestor divs are treated as entries.
	        if len(div.find_parents("div")) != 8:
	            continue

	        link = div.find(href=True, recursive=True)
	        text = div.find(string=True, recursive=False)
	        if link and text:
	            # Keep whatever follows Google's "/url?q=" redirect prefix.
	            text = f'[{text}]({link["href"].split("/url?q=")[-1]})'
	        if text is not None and text.strip():
	            entries.append(text)

	    return {"results": "".join(f"{entry}\n---\n" for entry in entries)}


	@app.get("/tiktok_details")
	async def read_item(username: str, video_id: str):
	    """Fetch a TikTok video page and extract its counters and caption.

	    The page is requested with a Googlebot user agent, converted to Markdown
	    with html2text and then parsed positionally, so any change in the page
	    layout surfaces as one of the exceptions listed below.

	    Args:
	        username: Account handle without the leading ``@``.
	        video_id: Video identifier as used in the video URL.

	    Returns:
	        A dict with ``insights`` (like, comment, bookmark and share counts,
	        each as the string found on the page), ``username``, ``date`` and
	        ``description``.

	    Raises:
	        IndexError: The converted page lacks the expected sections or
	            profile lines.
	        ValueError: The selected section does not contain exactly four
	            counters.
	        UnicodeDecodeError: The response body is not valid UTF-8.
	    """
	    user_agent = "Googlebot/2.1"
	    url = f"https://tiktok.com/@{username}/video/{video_id}"

	    # requests is blocking, so it runs in a worker thread to keep the event
	    # loop free; the timeout keeps a stalled connection from hanging forever.
	    res = await asyncio.to_thread(
	        requests.get,
	        url,
	        headers={"user-agent": user_agent},
	        timeout=10,
	    )

	    text_maker = html2text.HTML2Text()
	    text_maker.ignore_links = True
	    text_maker.ignore_images = True
	    text_maker.bypass_tables = False

	    docs = text_maker.handle(res.content.decode("utf-8"))

	    # Everything below is parsed from the sixth "###"-delimited chunk.
	    content_detail = docs.split("###")[5]

	    # html2text writes bold text as **...** (the same markers are stripped
	    # from the description below); the four counters are expected to be the
	    # only bold single-token values in this chunk, in this order.
	    likes, comments, bookmarks, shares = re.findall(
	        r"\*\*([\w.]+)\*\*", content_detail
	    )

	    # The profile lines follow the "Speed" line: at most seven pieces are
	    # split off and blank ones are discarded.
	    after_speed = content_detail.split("\n\nSpeed\n\n", 1)[1]
	    profile = [piece.strip() for piece in after_speed.split("\n", 6) if piece.strip()]

	    profile_name = profile[0]
	    date = profile[1].rsplit(" · ", 1)[-1]
	    desc = profile[-1].replace("**", "")

	    return {
	        "insights": {
	            "likeCount": likes,
	            "commentCount": comments,
	            "bookmarkCount": bookmarks,
	            "shareCount": shares,
	        },
	        "username": profile_name,
	        "date": date,
	        "description": desc,
	    }