# -*- coding: utf-8 -*-
"""
NOTE 1: Start command for running this FastAPI service on Render:
@see https://community.render.com/t/fastapi-python-web-service-deploying-for-hours/6662
uvicorn app:app --host 0.0.0.0 --port 10000
"""
import os
import sys
import datetime
import logging
import random
import socket
import time
import timeit
from functools import wraps

import fire
import requests
import simplejson as json
from apscheduler.schedulers.background import BackgroundScheduler
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import HTMLResponse, JSONResponse, PlainTextResponse, Response
from furl import furl
from pymongo import MongoClient
HOSTNAME = socket.gethostname()

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
    "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0",
    "Mozilla/5.0 (X11; Linux; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (X11; Linux ppc64le; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0"
]

BOT_AGENTS = [
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
    "Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
    "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
    "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
    "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)"
]

# MONGODB-ATLAS SETUP
MONGOATLAS_URI = os.environ.get('MONGOATLAS_URI') or None
AK = os.environ.get('AK') or None
##############################################################################
#
# LOGGING
#
##############################################################################
logging.basicConfig(level=logging.INFO, format='%(message)s')
logging.getLogger("requests").setLevel(logging.ERROR)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.propagate = False
console_logger = logging.StreamHandler()
console_logger.setLevel(logging.DEBUG)
console_logger.setFormatter(logging.Formatter('%(message)s'))
logger.addHandler(console_logger)
if not MONGOATLAS_URI:
    logger.warning('Could not read the database URI')
if not AK:
    logger.warning('Could not read the access key')
# Disable urllib3 warnings (sent by requests)
# requests.packages.urllib3.disable_warnings()
app = FastAPI(docs_url=None, redoc_url=None, openapi_url=None)
# app.config.from_pyfile('flaskapp.cfg')
port = 5000
scheduler = None
proxies = {}
# local_ip = socket.gethostbyname(HOSTNAME)
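
# A minimal sketch (an assumption, not in the original source): the
# BackgroundScheduler import and the module-level 'scheduler' global suggest
# the scrape was meant to run on a timer. The base URL, the '/faa' path and
# the 10-minute interval are all hypothetical; nothing calls start_scheduler()
# automatically.
def start_scheduler(base_url='http://127.0.0.1:7860'):
    global scheduler
    scheduler = BackgroundScheduler()
    scheduler.add_job(
        lambda: requests.get('{}/faa?AK={}'.format(base_url, AK), timeout=60),
        'interval', minutes=10)
    scheduler.start()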
# Custom decorator to check for the access key
def require_access_key(func):
    @wraps(func)  # preserve the wrapped signature so FastAPI still injects 'request'
    def wrapper(*args, **kwargs):
        request = kwargs.get('request')  # the endpoint must declare a 'request' parameter
        access_key = request.query_params.get('AK') if request else None
        # Check if the provided access key matches the secret
        if access_key != AK:
            return PlainTextResponse("ERROR: Unauthorized call", status_code=401)
        return func(*args, **kwargs)  # call the actual endpoint function
    return wrapper
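
# Example (a sketch, not in the original source) of guarding a route with the
# decorator above; the '/secure-ping' path is hypothetical. @app.get is applied
# on top so the guarded wrapper is what gets registered, and @wraps lets
# FastAPI still inject the 'request' parameter by name.
@app.get('/secure-ping')
@require_access_key
def secure_ping(request: Request):
    return PlainTextResponse('PONG (authorized)', status_code=200)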
# Route decorators are missing in the extracted source; the '/', '/ping' and
# '/ip' paths below are assumptions restored so the endpoints register.
@app.get('/')
def index():
    # return render_template('index.html')
    logger.info(f'hostname: {HOSTNAME}')
    return PlainTextResponse('OK', status_code=200)

@app.get('/ping')
def ping():
    return PlainTextResponse('PONG', status_code=200)

@app.get('/ip')
def remote_ip(request: Request):
    client_host = request.client.host
    return PlainTextResponse(client_host, status_code=200)
# The decorators below are assumptions: the route path '/faa' is hypothetical,
# and the commented-out inline key check suggests @require_access_key replaced it.
@app.get('/faa')
@require_access_key
def faa_scrap_sold_listings_featured_local(request: Request):
    global proxies
    timeit_request = 0
    timeit_parsing = 0
    timeit_mongo = 0
    response_body = '?'
    if not MONGOATLAS_URI:
        return PlainTextResponse("ERROR: MONGOATLAS_URI is undefined", status_code=500)
    # access_key = request.query_params['AK']
    # if access_key != AK:
    #     return PlainTextResponse("ERROR: Unauthorized call", status_code=401)
    cnt_dbs = 4
    headers = {
        'User-Agent': random.choice(USER_AGENTS)
    }
    site_url = "https://fineartamerica.com/recentprintsales.html?rand={}".format(random.randint(1000, 1000000))
    r = None
    try:
        start = time.time()
        r = requests.get(site_url, proxies=proxies, timeout=30, verify=False, headers=headers)
        timeit_request = time.time() - start
    except Exception as e:
        response_body = str(e)
    if r and r.status_code == 200:
        try:
            start = time.time()
            listings = parse_faa_sold_listings_page(r.text)
            timeit_parsing = time.time() - start
            d = dict()
            d['date_utc'] = datetime.datetime.utcnow()
            d['results'] = listings
            d['processed'] = False
            status = "ok"
            db_name = 'faa_scrap_' + str(random.randint(1, cnt_dbs))
            col_name = 'faa_sl'
            mongo_client = None
            try:
                start = time.time()
                mongo_client = MongoClient(MONGOATLAS_URI)
                db = mongo_client[db_name]
                col = db[col_name]
                insert_result = col.insert_one(d)
                timeit_mongo = time.time() - start
            except Exception as e:
                status = "error saving to mongodb ({})".format(str(e))
                logging.error(status)
            finally:
                try:
                    mongo_client.close()
                except Exception:
                    pass
            o = dict()
            o['site'] = "faa"
            o['status'] = status
            o['date'] = d['date_utc']
            o['results_count'] = len(listings)
            o['db_name'] = db_name
            o['timeit'] = {'request': timeit_request,
                           'parsing': timeit_parsing,
                           'db': timeit_mongo}
            # o['proxy'] = json.dumps(proxies)
            response_body = str(o)
        except Exception as e:
            response_body = str(e)
    return PlainTextResponse(response_body, status_code=200)
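
# Sketch of a downstream consumer (an assumption: the 'processed' flag implies
# one exists elsewhere; the function name is hypothetical). It drains
# unprocessed scrape batches from each shard database and marks them handled.
def drain_unprocessed(cnt_dbs=4):
    client = MongoClient(MONGOATLAS_URI)
    try:
        for i in range(1, cnt_dbs + 1):
            col = client['faa_scrap_' + str(i)]['faa_sl']
            for doc in col.find({'processed': False}):
                # ... handle doc['results'] here ...
                col.update_one({'_id': doc['_id']}, {'$set': {'processed': True}})
    finally:
        client.close()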
def parse_faa_sold_listings_page(html):
    soup = BeautifulSoup(html, 'lxml')  # or "html.parser"
    listings_els = soup.find_all('div', {'class': 'productImageDiv'})
    listings = []
    for i, listing_el in enumerate(listings_els):
        # if listing_el['style'].find('hidden') > -1:
        #     continue
        listing = dict()
        item_url = listing_el.find('a')['href']
        if not item_url.startswith('http'):
            item_url = 'https://fineartamerica.com/' + item_url
        item_page = furl(item_url)
        item_page.path.normalize()
        listing['item_page'] = item_page.url
        listing['image'] = listing_el.find('img', {'class': 'productImage'})['src']
        artist_url = listing_el.find('p', {'class': 'artistName'}).a['href']
        if not artist_url.startswith('http'):
            artist_url = 'https://fineartamerica.com/' + artist_url
        artist_page = furl(artist_url)
        artist_page.path.normalize()
        listing['artist_page'] = artist_page.url
        listing['artist'] = listing_el.find('p', {'class': 'artistName'}).text
        listing['sell_info'] = listing_el.find('p', {'class': 'orderLocation'}).text
        listings.append(listing)
    del soup
    return listings
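
# Quick sanity check for the parser (a sketch; the HTML below only mimics the
# structure the parser expects, it is not captured from the live site).
def _parse_faa_selftest():
    sample = (
        '<div class="productImageDiv">'
        '<a href="/featured/sunset-jane-doe.html">'
        '<img class="productImage" src="https://example.com/img.jpg"></a>'
        '<p class="artistName"><a href="/profiles/jane-doe.html">Jane Doe</a></p>'
        '<p class="orderLocation">Sold a print to a buyer from Paris, France</p>'
        '</div>')
    return parse_faa_sold_listings_page(sample)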
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)