Spaces:
Running
Running
app.py
CHANGED
@@ -1,7 +1,232 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
def greet(name):
    """Return a greeting for *name*.

    Args:
        name: Value to greet. It is formatted with ``str()`` semantics, so
            non-string values (numbers, ``None``, ...) are accepted as well.

    Returns:
        The string ``"Hello <name>!!"``.
    """
    # An f-string avoids the TypeError that ``"Hello " + name`` raises for non-str input.
    return f"Hello {name}!!"
|
5 |
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
NOTE 1: Start command for running this FastAPI app on Render:
|
4 |
+
@see https://community.render.com/t/fastapi-python-web-service-deploying-for-hours/6662
|
5 |
+
uvicorn app:app --host 0.0.0.0 --port 10000
|
6 |
|
|
|
|
|
7 |
|
8 |
+
"""
|
9 |
+
|
10 |
+
import os , sys
|
11 |
+
import datetime , requests , random , logging , time , timeit
|
12 |
+
import simplejson as json
|
13 |
+
from fastapi import FastAPI
|
14 |
+
from fastapi.responses import PlainTextResponse , HTMLResponse , Response , JSONResponse
|
15 |
+
# from fastapi import Request
|
16 |
+
from starlette.requests import Request
|
17 |
+
|
18 |
+
from bs4 import BeautifulSoup
|
19 |
+
from furl import furl
|
20 |
+
# from apscheduler.schedulers.blocking import BlockingScheduler
|
21 |
+
# from apscheduler.schedulers.background import BackgroundScheduler
|
22 |
+
from pymongo import MongoClient
|
23 |
+
import fire
|
24 |
+
import socket
|
25 |
+
import requests
|
26 |
+
|
27 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
28 |
+
|
29 |
+
# Name of the machine this process runs on (used for host-specific settings).
HOSTNAME = socket.gethostname()

# Desktop-browser User-Agent strings; one is picked at random per scrape request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
    "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0",
    "Mozilla/5.0 (X11; Linux; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (X11; Linux ppc64le; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0",
]

# Search-engine crawler User-Agent strings.
BOT_AGENTS = [
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
    "Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
    "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
    "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
    "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)",
]

# MongoDB Atlas connection string, taken from the environment.
# An unset or empty variable is normalised to None.
MONGOATLAS_URI = os.getenv('MONGOATLAS_URI') or None
|
51 |
+
|
52 |
+
##############################################################################
|
53 |
+
#
|
54 |
+
# LOGGING
|
55 |
+
#
|
56 |
+
##############################################################################
|
57 |
+
|
58 |
+
# Root logger: bare messages at INFO; records from the "requests" logger
# are only let through at ERROR and above.
logging.basicConfig(format='%(message)s', level=logging.INFO)
logging.getLogger("requests").setLevel(logging.ERROR)

# Console handler used exclusively by this module's logger.
console_logger = logging.StreamHandler()
console_logger.setFormatter(logging.Formatter('%(message)s'))
console_logger.setLevel(logging.DEBUG)

# Module logger: DEBUG and up, written through its own handler only
# (propagation is off so the root handler does not print each record twice).
logger = logging.getLogger(__name__)
logger.propagate = False
logger.setLevel(logging.DEBUG)
logger.addHandler(console_logger)
|
70 |
+
|
71 |
+
if not MONGOATLAS_URI:
    # Surface the misconfiguration at import time; the app still starts,
    # and the scraping route answers with HTTP 500 while the URI is missing.
    logger.warning('Could not read the database URI')
|
73 |
+
|
74 |
+
# Disable urllib3 warnings (sent by requests)
|
75 |
+
# requests.packages.urllib3.disable_warnings()
|
76 |
+
|
77 |
+
# ASGI application object; the route decorators below register onto it.
app = FastAPI()

# Module-level settings.
# NOTE(review): `port` and `scheduler` are not referenced by the code visible
# in this file — confirm whether they are still needed.
port = 5000
scheduler = None

# Proxy mapping handed to `requests`; empty means "connect directly".
proxies = {}

if HOSTNAME == 'OCTOCORE':
    # Host-specific override: on this machine outbound requests go through a proxy.
    # NOTE(review): credentials are embedded in these URLs — consider moving them
    # to environment variables. Also, the 'http' entry uses an https:// URL while
    # the 'https' entry uses http:// — confirm this is intentional.
    proxies = dict(
        http='https://anonyland:[email protected]:8080',
        https='http://anonyland:[email protected]:8080',
    )
    proxy_ip = '192.168.1.43:80'
|
88 |
+
|
89 |
+
@app.get('/')
def index():
    """Root endpoint: log the serving host's name and reply with a plain-text ``OK``."""
    logger.info(f'hostname: {HOSTNAME}')
    ok_response = PlainTextResponse('OK', 200)
    return ok_response
|
94 |
+
|
95 |
+
@app.get('/ping')
def ping():
    """Liveness probe: reply with an empty HTTP 200 response.

    The handler was previously also named ``index``, which rebound the
    module-level name already used by the ``/`` handler. The route path is
    unchanged; only the Python function name differs.
    """
    return Response(status_code=200)
|
98 |
+
|
99 |
+
@app.get("/remote_ip")
def remote_ip(request: Request):
    """Return the address of the connecting peer as plain text.

    Args:
        request: The incoming Starlette request.

    Returns:
        PlainTextResponse: status 200 with the peer host, or the literal
        ``unknown`` when the server did not provide a client address.
    """
    client = request.client
    # `request.client` is None when the ASGI scope carries no peer information;
    # answer explicitly instead of failing with an AttributeError (HTTP 500).
    if client is None:
        return PlainTextResponse('unknown', 200)
    # NOTE(review): behind a reverse proxy this is presumably the proxy's address,
    # not the end user's — confirm whether a forwarded-for header should be used.
    return PlainTextResponse(client.host, 200)
|
103 |
+
|
104 |
+
def _faa_save_document(document, db_name, col_name):
    """Insert *document* into ``db_name.col_name`` on the MongoDB Atlas cluster.

    Args:
        document: Dict to store (``insert_one`` adds an ``_id`` key to it).
        db_name: Target database name.
        col_name: Target collection name.

    Returns:
        tuple: ``(status, seconds)`` where *status* is ``"ok"`` or an error
        description, and *seconds* is the time spent connecting and inserting
        (``0`` when the write failed).
    """
    status = "ok"
    elapsed = 0
    mongo_client = None
    try:
        start = time.time()
        mongo_client = MongoClient(MONGOATLAS_URI)
        mongo_client[db_name][col_name].insert_one(document)
        elapsed = time.time() - start
    except Exception as e:
        # Best effort: a storage failure is reported in the summary, not raised.
        status = "error saving to mongodb ({})".format(str(e))
        logger.error(status)
    finally:
        # The client only exists if the constructor succeeded.
        if mongo_client is not None:
            try:
                mongo_client.close()
            except Exception:
                logger.exception("error closing mongodb client")
    return status, elapsed


@app.get("/task/faa_scrap_sold_listings_featured")
def faa_scrap_sold_listings_featured_local():
    """Scrape the FineArtAmerica recent-print-sales page and store the listings.

    The page is fetched through the module-level ``proxies`` mapping with a
    randomly chosen browser User-Agent, parsed with
    ``parse_faa_sold_listings_page`` and saved as one document in the
    ``faa_sl`` collection of a randomly chosen ``faa_scrap_<n>`` database.

    Returns:
        PlainTextResponse: status 500 when ``MONGOATLAS_URI`` is not set.
        Otherwise status 200, whose body is either ``str()`` of a summary dict
        (site, status, date, results_count, db_name, timeit) or a message
        describing why the scrape failed.
    """
    if not MONGOATLAS_URI:
        return PlainTextResponse("ERROR: MONGOATLAS_URI is undefined", status_code=500)

    cnt_dbs = 4  # documents are spread at random over faa_scrap_1 .. faa_scrap_<cnt_dbs>
    timeit_request = 0
    timeit_parsing = 0
    timeit_mongo = 0

    headers = {'User-Agent': random.choice(USER_AGENTS)}

    # NOTE(review): the random query parameter presumably defeats caching — confirm.
    site_url = "https://fineartamerica.com/recentprintsales.html?rand={}".format(random.randint(1000, 1000000))

    try:
        start = time.time()
        # NOTE(review): verify=False disables TLS certificate verification;
        # confirm this is required (e.g. by the proxy) before keeping it.
        page = requests.get(site_url, proxies=proxies, timeout=30, verify=False, headers=headers)
        timeit_request = time.time() - start
    except Exception as e:
        # Failures are reported in the body with status 200, as before.
        return PlainTextResponse(str(e), 200)

    if page.status_code != 200:
        # Report what the site answered instead of returning a bare placeholder.
        return PlainTextResponse(
            "ERROR: unexpected HTTP status {} from {}".format(page.status_code, site_url), 200)

    try:
        start = time.time()
        listings = parse_faa_sold_listings_page(page.text)
        timeit_parsing = time.time() - start

        document = {
            'date_utc': datetime.datetime.utcnow(),
            'results': listings,
            # NOTE(review): presumably flipped by a downstream consumer — confirm.
            'processed': False,
        }
        scraped_at = document['date_utc']

        db_name = 'faa_scrap_' + str(random.randint(1, cnt_dbs))
        status, timeit_mongo = _faa_save_document(document, db_name, 'faa_sl')

        summary = {
            'site': "faa",
            'status': status,
            'date': scraped_at,
            'results_count': len(listings),
            'db_name': db_name,
            'timeit': {'request': timeit_request,
                       'parsing': timeit_parsing,
                       'db': timeit_mongo},
        }
        response_body = str(summary)
    except Exception as e:
        response_body = str(e)

    return PlainTextResponse(response_body, 200)
|
187 |
+
|
188 |
+
|
189 |
+
def _faa_absolute_url(href):
    """Return *href* as an absolute fineartamerica.com URL with a normalised path."""
    if not href.startswith('http'):
        href = 'https://fineartamerica.com/' + href
    page = furl(href)
    # Normalising the path also collapses the double slash produced when
    # a site-relative href already starts with "/".
    page.path.normalize()
    return page.url


def parse_faa_sold_listings_page(html):
    """Extract the sold listings from a recent-print-sales HTML page.

    Every ``<div class="productImageDiv">`` yields one dict with the keys
    ``item_page`` (href of the first link), ``image`` (``src`` of
    ``img.productImage``), ``artist_page`` and ``artist`` (link and text of
    ``p.artistName``) and ``sell_info`` (text of ``p.orderLocation``).

    Args:
        html: HTML source of the page.

    Returns:
        list[dict]: One entry per listing, in document order; empty when the
        page contains no listing element.

    Raises:
        TypeError, AttributeError, KeyError: When a listing lacks one of the
            expected elements or attributes.
    """
    soup = BeautifulSoup(html, 'lxml')  # "html.parser"

    listings = []
    for listing_el in soup.find_all('div', {'class': 'productImageDiv'}):
        # Looked up once; it provides both the artist link and the artist name.
        artist_el = listing_el.find('p', {'class': 'artistName'})
        listings.append({
            'item_page': _faa_absolute_url(listing_el.find('a')['href']),
            'image': listing_el.find('img', {'class': 'productImage'})['src'],
            'artist_page': _faa_absolute_url(artist_el.a['href']),
            'artist': artist_el.text,
            'sell_info': listing_el.find('p', {'class': 'orderLocation'}).text,
        })

    return listings
|
229 |
+
|
230 |
+
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces, port 7860.
    import uvicorn

    bind_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **bind_options)
|