# -*- coding: utf-8 -*-
"""
NOTE 1: Start command for running this FastAPI app on Render:
@see https://community.render.com/t/fastapi-python-web-service-deploying-for-hours/6662
    uvicorn app:app --host 0.0.0.0 --port 10000
"""
import os
import datetime
import random
import logging
import time
import socket
from functools import wraps

import requests
import simplejson as json  # only needed if the commented-out json.dumps call below is re-enabled
from fastapi import FastAPI, Request
from fastapi.responses import PlainTextResponse
from bs4 import BeautifulSoup
from furl import furl
from pymongo import MongoClient
from apscheduler.schedulers.background import BackgroundScheduler
HOSTNAME = socket.gethostname()
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
"Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0",
"Mozilla/5.0 (X11; Linux; rv:74.0) Gecko/20100101 Firefox/74.0",
"Mozilla/5.0 (X11; Linux ppc64le; rv:75.0) Gecko/20100101 Firefox/75.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0"
]
BOT_AGENTS = [
"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
"Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
"Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)",
"Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
"DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
"Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
"Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)"
]
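# NOTE: BOT_AGENTS is defined but never referenced in this module; only USER_AGENTS
# is used (one entry is picked at random per scraping request below).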
# MONGODB-ATLAS SETUP
MONGOATLAS_URI = os.environ.get('MONGOATLAS_URI') or None
AK = os.environ.get('AK') or None
##############################################################################
#
# LOGGING
#
##############################################################################
logging.basicConfig(level=logging.INFO , format='%(message)s')
logging.getLogger("requests").setLevel(logging.ERROR)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.propagate=False
console_logger = logging.StreamHandler()
console_logger.setLevel(logging.DEBUG)
console_logger.setFormatter(logging.Formatter('%(message)s'))
logger.addHandler(console_logger)
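# propagate=False plus the dedicated StreamHandler above keeps this module's messages
# from also being emitted through the root logger configured by basicConfig.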
if not MONGOATLAS_URI:
    logger.warning('Could not read the database URI')
if not AK:
    logger.warning('Could not read the access key')
# Disable urllib3 warnings (sent by requests)
# requests.packages.urllib3.disable_warnings()
app = FastAPI(docs_url=None, redoc_url=None, openapi_url=None)  # auto-generated docs disabled
port = 5000
scheduler = None
proxies = {}
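# The BackgroundScheduler import and the 'scheduler' global suggest a periodic trigger
# was planned but is not wired up in this file. A minimal sketch of how it could look
# (hypothetical interval; kept commented out, it is not part of the running app):
#
#     def start_scheduler():
#         global scheduler
#         scheduler = BackgroundScheduler()
#         url = 'http://localhost:{}/task/faa_scrap_sold_listings_featured?AK={}'.format(port, AK)
#         scheduler.add_job(lambda: requests.get(url, timeout=60), 'interval', minutes=30)
#         scheduler.start()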
# Custom decorator to check the access key passed as the 'AK' query parameter
def require_access_key(func):
    @wraps(func)  # preserve the signature so FastAPI still injects the 'request' parameter
    def wrapper(*args, **kwargs):
        request = kwargs.get('request')  # endpoints must declare a 'request: Request' parameter
        access_key = request.query_params.get('AK') if request else None
        # Reject the call if the provided access key does not match the secret
        if access_key != AK:
            return PlainTextResponse("ERROR: Unauthorized call", status_code=401)
        return func(*args, **kwargs)  # call the actual endpoint function
    return wrapper
@app.get('/')
def index():
    logger.info(f'hostname: {HOSTNAME}')
    return PlainTextResponse('OK', 200)
@app.get('/ping')
def ping():
    return PlainTextResponse('PONG', 200)
@app.get("/remote_ip")
@require_access_key
def remote_ip(request:Request):
client_host = request.client.host
return PlainTextResponse(client_host , 200)
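# NOTE: request.client.host is the immediate TCP peer; behind a reverse proxy (as on
# most hosting platforms) it may be the proxy's address unless uvicorn is started
# with --proxy-headers.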
@app.get("/task/faa_scrap_sold_listings_featured")
@require_access_key
def faa_scrap_sold_listings_featured_local(request:Request):
global proxies
timeit_request = 0
timeit_parsing = 0
timeit_mongo = 0
response_body = '?'
if not MONGOATLAS_URI:
return PlainTextResponse("ERROR: MONGOATLAS_URI is undefined" , status_code=500)
    cnt_dbs = 4  # writes rotate across databases faa_scrap_1 .. faa_scrap_4
headers = {
'User-Agent': random.choice(USER_AGENTS)
}
site_url = "https://fineartamerica.com/recentprintsales.html?rand={}".format(random.randint(1000,1000000))
r=None
try:
start = time.time()
        # NOTE: verify=False disables TLS certificate verification for this request
        r = requests.get(site_url, proxies=proxies, timeout=30, verify=False, headers=headers)
timeit_request = time.time()-start
except Exception as e:
response_body = str(e)
if r and r.status_code==200:
try:
start = time.time()
listings = parse_faa_sold_listings_page(r.text)
timeit_parsing = time.time() - start
d = dict()
d['date_utc'] = datetime.datetime.utcnow()
d['results'] = listings
d['processed']= False
status = "ok"
            db_name = 'faa_scrap_' + str(random.randint(1, cnt_dbs))  # pick one of the rotating databases
col_name = 'faa_sl'
mongo_client = None
try:
start = time.time()
mongo_client = MongoClient(MONGOATLAS_URI)
db = mongo_client[db_name]
col = db[col_name]
                insert_result = col.insert_one(d)  # avoid reusing 'r', which still names the HTTP response
                timeit_mongo = time.time() - start
            except Exception as e:
                status = "error saving to mongodb ({})".format(str(e))
                logger.error(status)
finally:
try:
mongo_client.close()
except Exception:
pass
o = dict()
o['site']="faa"
o['status']=status
o['date'] = d['date_utc']
o['results_count'] = len(listings)
o['db_name'] = db_name
o['timeit'] = {'request':timeit_request,
'parsing':timeit_parsing,
'db':timeit_mongo}
# o['proxy'] = json.dumps(proxies)
response_body = str(o)
except Exception as e:
response_body = str(e)
    # errors are reported in the response body; the HTTP status stays 200 either way
    return PlainTextResponse(response_body, 200)
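# Shape of the document stored per run (a sketch derived from the code above):
#   {'date_utc': <datetime>, 'results': [<listing dicts>], 'processed': False}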
def parse_faa_sold_listings_page(html):
    """Extract the sold-listing entries from the fineartamerica.com recent-sales page."""
    soup = BeautifulSoup(html, 'lxml')  # 'html.parser' also works; lxml is faster
    listings_els = soup.find_all('div', {'class': 'productImageDiv'})
    listings = []
    for listing_el in listings_els:
        # skip listings hidden via inline style (check currently disabled)
        # if listing_el['style'].find('hidden') > -1:
        #     continue
        listing = dict()
        item_url = listing_el.find('a')['href']
        if not item_url.startswith('http'):
            item_url = 'https://fineartamerica.com/' + item_url
        item_page = furl(item_url)
        item_page.path.normalize()
        listing['item_page'] = item_page.url
        listing['image'] = listing_el.find('img', {'class': 'productImage'})['src']
        artist_url = listing_el.find('p', {'class': 'artistName'}).a['href']
        if not artist_url.startswith('http'):
            artist_url = 'https://fineartamerica.com/' + artist_url
        artist_page = furl(artist_url)
        artist_page.path.normalize()
        listing['artist_page'] = artist_page.url
        listing['artist'] = listing_el.find('p', {'class': 'artistName'}).text
        listing['sell_info'] = listing_el.find('p', {'class': 'orderLocation'}).text
        listings.append(listing)
    del soup
    return listings
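# Each listing dict produced above has the keys:
#   'item_page', 'image', 'artist_page', 'artist', 'sell_info'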
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860)