# -*- coding: utf-8 -*-
"""
NOTE 1: Start command for running this FastAPI service on Render:
@see https://community.render.com/t/fastapi-python-web-service-deploying-for-hours/6662
uvicorn app:app --host 0.0.0.0 --port 10000
"""
import os
import sys
import datetime
import logging
import random
import socket
import time
import timeit
from functools import wraps

import fire
import requests
import simplejson as json
from apscheduler.schedulers.background import BackgroundScheduler
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import HTMLResponse, JSONResponse, PlainTextResponse, Response
from furl import furl
from pymongo import MongoClient
HOSTNAME = socket.gethostname()

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
    "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0",
    "Mozilla/5.0 (X11; Linux; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (X11; Linux ppc64le; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0"
]

BOT_AGENTS = [
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
    "Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
    "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
    "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
    "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)"
]

# MONGODB-ATLAS SETUP
MONGOATLAS_URI = os.environ.get('MONGOATLAS_URI') or None
AK = os.environ.get('AK') or None
##############################################################################
#
# LOGGING
#
##############################################################################
logging.basicConfig(level=logging.INFO, format='%(message)s')
logging.getLogger("requests").setLevel(logging.ERROR)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.propagate = False
console_logger = logging.StreamHandler()
console_logger.setLevel(logging.DEBUG)
console_logger.setFormatter(logging.Formatter('%(message)s'))
logger.addHandler(console_logger)
if not MONGOATLAS_URI:
    logger.warning('Could not read the database URI')
if not AK:
    logger.warning('Could not read the access key')
# Disable urllib3 warnings (sent by requests)
# requests.packages.urllib3.disable_warnings()
app = FastAPI(docs_url=None, redoc_url=None, openapi_url=None)
# app.config.from_pyfile('flaskapp.cfg')
port = 5000
scheduler = None
proxies = {}
# local_ip = socket.gethostbyname(HOSTNAME)
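
# A minimal sketch (an assumption, not in the original source): the
# BackgroundScheduler import and the module-level 'scheduler' global suggest
# the scrape was meant to run on a timer. The base URL, the '/faa' path and
# the 10-minute interval are all hypothetical; nothing calls start_scheduler()
# automatically.
def start_scheduler(base_url='http://127.0.0.1:7860'):
    global scheduler
    scheduler = BackgroundScheduler()
    scheduler.add_job(
        lambda: requests.get('{}/faa?AK={}'.format(base_url, AK), timeout=60),
        'interval', minutes=10)
    scheduler.start()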
# Custom decorator to check for the access key
def require_access_key(func):
    @wraps(func)  # preserve the wrapped signature so FastAPI still injects 'request'
    def wrapper(*args, **kwargs):
        request = kwargs.get('request')  # the endpoint must declare a 'request' parameter
        access_key = request.query_params.get('AK') if request else None
        # Check if the provided access key matches the secret
        if access_key != AK:
            return PlainTextResponse("ERROR: Unauthorized call", status_code=401)
        return func(*args, **kwargs)  # call the actual endpoint function
    return wrapper
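
# Example (a sketch, not in the original source) of guarding a route with the
# decorator above; the '/secure-ping' path is hypothetical. @app.get is applied
# on top so the guarded wrapper is what gets registered, and @wraps lets
# FastAPI still inject the 'request' parameter by name.
@app.get('/secure-ping')
@require_access_key
def secure_ping(request: Request):
    return PlainTextResponse('PONG (authorized)', status_code=200)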
# Route decorators are missing in the extracted source; the '/', '/ping' and
# '/ip' paths below are assumptions restored so the endpoints register.
@app.get('/')
def index():
    # return render_template('index.html')
    logger.info(f'hostname: {HOSTNAME}')
    return PlainTextResponse('OK', status_code=200)

@app.get('/ping')
def ping():
    return PlainTextResponse('PONG', status_code=200)

@app.get('/ip')
def remote_ip(request: Request):
    client_host = request.client.host
    return PlainTextResponse(client_host, status_code=200)
# The decorators below are assumptions: the route path '/faa' is hypothetical,
# and the commented-out inline key check suggests @require_access_key replaced it.
@app.get('/faa')
@require_access_key
def faa_scrap_sold_listings_featured_local(request: Request):
    global proxies
    timeit_request = 0
    timeit_parsing = 0
    timeit_mongo = 0
    response_body = '?'
    if not MONGOATLAS_URI:
        return PlainTextResponse("ERROR: MONGOATLAS_URI is undefined", status_code=500)
    # access_key = request.query_params['AK']
    # if access_key != AK:
    #     return PlainTextResponse("ERROR: Unauthorized call", status_code=401)
    cnt_dbs = 4
    headers = {
        'User-Agent': random.choice(USER_AGENTS)
    }
    site_url = "https://fineartamerica.com/recentprintsales.html?rand={}".format(random.randint(1000, 1000000))
    r = None
    try:
        start = time.time()
        r = requests.get(site_url, proxies=proxies, timeout=30, verify=False, headers=headers)
        timeit_request = time.time() - start
    except Exception as e:
        response_body = str(e)
    if r and r.status_code == 200:
        try:
            start = time.time()
            listings = parse_faa_sold_listings_page(r.text)
            timeit_parsing = time.time() - start
            d = dict()
            d['date_utc'] = datetime.datetime.utcnow()
            d['results'] = listings
            d['processed'] = False
            status = "ok"
            db_name = 'faa_scrap_' + str(random.randint(1, cnt_dbs))
            col_name = 'faa_sl'
            mongo_client = None
            try:
                start = time.time()
                mongo_client = MongoClient(MONGOATLAS_URI)
                db = mongo_client[db_name]
                col = db[col_name]
                insert_result = col.insert_one(d)
                timeit_mongo = time.time() - start
            except Exception as e:
                status = "error saving to mongodb ({})".format(str(e))
                logging.error(status)
            finally:
                try:
                    mongo_client.close()
                except Exception:
                    pass
            o = dict()
            o['site'] = "faa"
            o['status'] = status
            o['date'] = d['date_utc']
            o['results_count'] = len(listings)
            o['db_name'] = db_name
            o['timeit'] = {'request': timeit_request,
                           'parsing': timeit_parsing,
                           'db': timeit_mongo}
            # o['proxy'] = json.dumps(proxies)
            response_body = str(o)
        except Exception as e:
            response_body = str(e)
    return PlainTextResponse(response_body, status_code=200)
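
# Sketch of a downstream consumer (an assumption: the 'processed' flag implies
# one exists elsewhere; the function name is hypothetical). It drains
# unprocessed scrape batches from each shard database and marks them handled.
def drain_unprocessed(cnt_dbs=4):
    client = MongoClient(MONGOATLAS_URI)
    try:
        for i in range(1, cnt_dbs + 1):
            col = client['faa_scrap_' + str(i)]['faa_sl']
            for doc in col.find({'processed': False}):
                # ... handle doc['results'] here ...
                col.update_one({'_id': doc['_id']}, {'$set': {'processed': True}})
    finally:
        client.close()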
def parse_faa_sold_listings_page(html):
    soup = BeautifulSoup(html, 'lxml')  # or "html.parser"
    listings_els = soup.find_all('div', {'class': 'productImageDiv'})
    listings = []
    for i, listing_el in enumerate(listings_els):
        # if listing_el['style'].find('hidden') > -1:
        #     continue
        listing = dict()
        item_url = listing_el.find('a')['href']
        if not item_url.startswith('http'):
            item_url = 'https://fineartamerica.com/' + item_url
        item_page = furl(item_url)
        item_page.path.normalize()
        listing['item_page'] = item_page.url
        listing['image'] = listing_el.find('img', {'class': 'productImage'})['src']
        artist_url = listing_el.find('p', {'class': 'artistName'}).a['href']
        if not artist_url.startswith('http'):
            artist_url = 'https://fineartamerica.com/' + artist_url
        artist_page = furl(artist_url)
        artist_page.path.normalize()
        listing['artist_page'] = artist_page.url
        listing['artist'] = listing_el.find('p', {'class': 'artistName'}).text
        listing['sell_info'] = listing_el.find('p', {'class': 'orderLocation'}).text
        listings.append(listing)
    del soup
    return listings
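
# Quick sanity check for the parser (a sketch; the HTML below only mimics the
# structure the parser expects, it is not captured from the live site).
def _parse_faa_selftest():
    sample = (
        '<div class="productImageDiv">'
        '<a href="/featured/sunset-jane-doe.html">'
        '<img class="productImage" src="https://example.com/img.jpg"></a>'
        '<p class="artistName"><a href="/profiles/jane-doe.html">Jane Doe</a></p>'
        '<p class="orderLocation">Sold a print to a buyer from Paris, France</p>'
        '</div>')
    return parse_faa_sold_listings_page(sample)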
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)