# -*- coding: utf-8 -*-
"""
FastAPI service that scrapes the Fine Art America "recent print sales"
page and stores the parsed listings in MongoDB Atlas.

NOTE 1: Start Command starting a FastAPI on render:
@see https://community.render.com/t/fastapi-python-web-service-deploying-for-hours/6662
uvicorn app:app --host 0.0.0.0 --port 10000
"""
import datetime
import logging
import os
import random
import socket
import sys
import time
import timeit
from functools import wraps

import fire
import requests
import simplejson as json
from apscheduler.schedulers.background import BackgroundScheduler
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from fastapi.responses import (HTMLResponse, JSONResponse, PlainTextResponse,
                               Response)
from furl import furl
from pymongo import MongoClient
from starlette.requests import Request

HOSTNAME = socket.gethostname()

# Desktop browser user agents; one is picked at random per scrape request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
    "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0",
    "Mozilla/5.0 (X11; Linux; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (X11; Linux ppc64le; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0"
]

# Search-engine crawler user agents (not referenced by the current endpoints).
BOT_AGENTS = [
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
    "Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
    "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
    "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
    "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)"
]

# MONGODB-ATLAS SETUP
MONGOATLAS_URI = os.environ.get('MONGOATLAS_URI') or None
AK = os.environ.get('AK') or None

##############################################################################
#
# LOGGING
#
##############################################################################
logging.basicConfig(level=logging.INFO, format='%(message)s')
logging.getLogger("requests").setLevel(logging.ERROR)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.propagate = False
console_logger = logging.StreamHandler()
console_logger.setLevel(logging.DEBUG)
console_logger.setFormatter(logging.Formatter('%(message)s'))
logger.addHandler(console_logger)

if not MONGOATLAS_URI:
    logger.warning('Could not read the database URI')
# BUGFIX: this check previously re-tested MONGOATLAS_URI instead of AK.
if not AK:
    logger.warning('Could not read the access key')

# Disable urllib3 warnings (sent by requests)
# requests.packages.urllib3.disable_warnings()

app = FastAPI(docs_url=None, redoc_url=None, openapi_url=None)
# app.config.from_pyfile('flaskapp.cfg')
port = 5000
scheduler = None
proxies = {}
# local_ip = socket.gethostbyname(hostname)


def require_access_key(func):
    """Decorator: reject the call with HTTP 401 unless the endpoint's
    'AK' query parameter matches the AK environment secret.

    The wrapped endpoint must declare a ``request: Request`` keyword
    parameter so the query string can be inspected here.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Get the 'request' object from the endpoint's kwargs; guard against
        # endpoints that forgot to declare it (None would raise otherwise).
        request = kwargs.get('request')
        access_key = request.query_params.get('AK') if request is not None else None
        # Check if the provided ACCESS_KEY matches the secret
        if access_key != AK:
            return PlainTextResponse("ERROR: Unauthorized call", status_code=401)
        return func(*args, **kwargs)  # Call the actual endpoint function
    return wrapper


@app.get('/')
def index():
    """Liveness probe; logs the hostname and answers 'OK'."""
    # return render_template('index.html')
    logger.info(f'hostname: {HOSTNAME}')
    return PlainTextResponse('OK', 200)


@app.get('/ping')
def ping():  # BUGFIX: was also named `index`, shadowing the '/' handler
    """Trivial health check."""
    return PlainTextResponse('PONG', 200)


@app.get("/remote_ip")
@require_access_key
def remote_ip(request: Request):
    """Echo the caller's IP address (requires a valid access key)."""
    client_host = request.client.host
    return PlainTextResponse(client_host, 200)


@app.get("/task/faa_scrap_sold_listings_featured")
@require_access_key
def faa_scrap_sold_listings_featured_local(request: Request):
    """Scrape the FAA 'recent print sales' page and persist the listings.

    Flow: fetch page with a random user agent -> parse listings ->
    insert one document into a randomly chosen shard database
    (faa_scrap_1..faa_scrap_<cnt_dbs>), collection 'faa_sl'.

    Always returns HTTP 200; the body is a status summary string
    (or the error text when a step failed).
    """
    global proxies

    timeit_request = 0
    timeit_parsing = 0
    timeit_mongo = 0
    response_body = '?'

    if not MONGOATLAS_URI:
        return PlainTextResponse("ERROR: MONGOATLAS_URI is undefined", status_code=500)

    # Number of shard databases the result document may land in.
    cnt_dbs = 4

    headers = {'User-Agent': random.choice(USER_AGENTS)}
    # Random query parameter defeats intermediate caches.
    site_url = "https://fineartamerica.com/recentprintsales.html?rand={}".format(
        random.randint(1000, 1000000))

    page_response = None
    try:
        start = time.time()
        # SECURITY NOTE: verify=False disables TLS certificate validation;
        # kept deliberately (scraping through proxies), but worth revisiting.
        page_response = requests.get(site_url, proxies=proxies, timeout=30,
                                     verify=False, headers=headers)
        timeit_request = time.time() - start
    except Exception as e:
        response_body = str(e)

    if page_response and page_response.status_code == 200:
        try:
            start = time.time()
            listings = parse_faa_sold_listings_page(page_response.text)
            timeit_parsing = time.time() - start

            d = dict()
            d['date_utc'] = datetime.datetime.utcnow()
            d['results'] = listings
            d['processed'] = False

            status = "ok"
            db_name = 'faa_scrap_' + str(random.randint(1, cnt_dbs))
            col_name = 'faa_sl'
            mongo_client = None
            try:
                start = time.time()
                mongo_client = MongoClient(MONGOATLAS_URI)
                db = mongo_client[db_name]
                col = db[col_name]
                col.insert_one(d)
                timeit_mongo = time.time() - start
            except Exception as e:
                status = "error saving to mongodb ({})".format(str(e))
                logger.error(status)
            finally:
                # Best-effort close; mongo_client may be None if the
                # connection itself failed.
                try:
                    mongo_client.close()
                except Exception:
                    pass

            o = dict()
            o['site'] = "faa"
            o['status'] = status
            o['date'] = d['date_utc']
            o['results_count'] = len(listings)
            o['db_name'] = db_name
            o['timeit'] = {'request': timeit_request,
                           'parsing': timeit_parsing,
                           'db': timeit_mongo}
            # o['proxy'] = json.dumps(proxies)
            response_body = str(o)
        except Exception as e:
            response_body = str(e)

    return PlainTextResponse(response_body, 200)


def parse_faa_sold_listings_page(html):
    """Parse a FAA recent-sales HTML page into a list of listing dicts.

    Each dict has keys: item_page, image, artist_page, artist, sell_info.
    Relative URLs are made absolute and their paths normalized via furl.
    """
    soup = BeautifulSoup(html, 'lxml')  # "html.parser"
    listings_els = soup.find_all('div', {'class': 'productImageDiv'})

    listings = []
    for i, listing_el in enumerate(listings_els):
        # if listing_el['style'].find('hidden') > -1:
        #     continue
        l = dict()

        item_url = listing_el.find('a')['href']
        if not item_url.startswith('http'):
            item_url = 'https://fineartamerica.com/' + item_url
        item_page = furl(item_url)
        item_page.path.normalize()
        l['item_page'] = item_page.url

        l['image'] = listing_el.find('img', {'class': 'productImage'})['src']

        artist_url = listing_el.find('p', {'class': 'artistName'}).a['href']
        if not artist_url.startswith('http'):
            artist_url = 'https://fineartamerica.com/' + artist_url
        artist_page = furl(artist_url)
        artist_page.path.normalize()
        l['artist_page'] = artist_page.url

        l['artist'] = listing_el.find('p', {'class': 'artistName'}).text
        l['sell_info'] = listing_el.find('p', {'class': 'orderLocation'}).text

        listings.append(l)

    del soup
    return listings


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)