# NOTE(review): the text below ("Spaces:" / "Sleeping" / "Sleeping") is an
# artifact of scraping this file from its hosting page (a Hugging Face Spaces
# status banner), not program code; kept as a comment so the module parses.
# Spaces: Sleeping / Sleeping
import re
import urllib.request
from io import BytesIO
from urllib.request import urlopen, Request

import bs4 as BeautifulSoup
import pandas as pd
from pyxlsb import open_workbook as open_xlsb
# Search endpoints used by the scraper functions below:
#   [0] hespareparts.com free-text part search (used by extrae_web)
#   [1] offroadeq.com parts search (used by extrae_dato_web / extrae_alternate)
rutas_websearch = ['https://en.hespareparts.com/search/?search=', 'https://offroadeq.com/parts-search/']
def extrae_dato_web(idx):
    """Scrape part data for part number *idx* from offroadeq.com.

    The part number is normalized (dashes/spaces stripped) and appended to the
    offroadeq search URL.  Returns a list containing, in page order, the
    <dd> values (converted to float where they parse as a price/weight,
    otherwise the raw element), followed by the part title taken from the
    first <h2> heading.

    Raises whatever ``urlopen`` raises on network failure, and IndexError if
    the page has no <h2> heading.
    """
    datx = []
    idxx = str(idx).replace('-', '').replace(' ', '')
    urlg = rutas_websearch[1] + idxx + '/'
    htmlg = urlopen(urlg).read()
    soup = BeautifulSoup.BeautifulSoup(htmlg, 'html.parser')
    lista0 = soup.find_all('h2')[0]
    lista1 = soup.find_all('dt')
    lista2 = soup.find_all('dd')
    # Pages without an "Alternate for" row have fewer than 3 <dt> entries;
    # pad with plain-string placeholders so downstream columns line up.
    if len(lista1) < 3:
        lista1 = ['Alt NA'] + lista1
        lista2 = ['Alternate NA'] + lista2
    for _, j in zip(lista1, lista2):
        try:
            # Strip unit/currency markers and keep numeric values as floats.
            datx.append(float(j.text.replace('lbs', '').replace('$', '')))
        except (AttributeError, ValueError):
            # AttributeError: j is a padding str (no .text);
            # ValueError: the <dd> text is not numeric. Keep the raw value.
            datx.append(j)
    # Title is everything after the first " - " separator in the <h2> text.
    datx.append(lista0.text.split('-')[1:][0])
    return datx
def extrae_web(idx):
    """Scrape name, dimensions, volume and description for part *idx*
    from hespareparts.com.

    Follows the first search-result link (class ``link-dark``) to the detail
    page.  Returns a 6-tuple ``(name, depth, width, height, volume,
    description)`` where dimensions are in meters (page value / 1000) and
    volume is their product.  On any scraping failure the sentinel tuple
    ``('Not Available', -1.0, -1.0, -1.0, -1.0, 'NA')`` is returned.
    A dimension missing from the page contributes -1.0/1000 = -0.001
    (preserved legacy behavior).
    """
    idxx = str(idx).replace('-', '').replace(' ', '')
    urlz = rutas_websearch[0] + idxx + '/'
    try:
        htmlz = urlopen(urlz).read()
        soup = BeautifulSoup.BeautifulSoup(htmlz, 'html.parser')
        lista = soup.find_all('a', {'class': 'link-dark'})
        ls = lista[0]
        page = urlopen(ls['href'])
        html = page.read()
        soup = BeautifulSoup.BeautifulSoup(html, 'html.parser')
        gg = soup.find_all('h1')
        print(gg)
        dd = []
        for typex in ['depth', 'width', 'height']:
            try:
                aa = soup.find_all('span', {'itemprop': typex})[0].text
                bb = re.findall(r'[0-9.]+', aa)
            except IndexError:
                # Dimension span missing on the page: use the -1 sentinel.
                bb = [-1.0]
            dd.append(float(bb[0]) / 1000)
        cc = soup.find_all('div', {'itemprop': 'description'})[0].text
        cc1 = cc.replace('\r', '').replace('\n', ' ')
        ggtx = gg[0].text
        # Part name is the <h1> text after the " - " separator.
        posx = ggtx.find(' - ')
        ggx = ggtx[posx + 3:]
        vol = dd[0] * dd[1] * dd[2]
        dd0, dd1, dd2 = dd[0], dd[1], dd[2]
    except Exception:
        # Best-effort scrape: any network/parse failure yields the sentinel
        # row.  (Was a bare except, which also swallowed KeyboardInterrupt.)
        ggx, dd0, dd1, dd2, vol, cc1 = 'Not Available', -1.0, -1.0, -1.0, -1.0, 'NA'
    return ggx, dd0, dd1, dd2, vol, cc1
def extrae_alternate(idx):
    """Return the alternate part number for *idx* from offroadeq.com.

    The id is normalized (dashes/spaces removed) and looked up on the
    offroadeq search page.  If the first <dt> label reads 'Alternate for',
    the matching first <dd> text is returned; otherwise None.
    """
    clean_id = str(idx).replace('-', '').replace(' ', '')
    search_url = rutas_websearch[1] + clean_id + '/'
    markup = urlopen(search_url).read()
    parsed = BeautifulSoup.BeautifulSoup(markup, 'html.parser')
    label = parsed.find_all('dt')[0].text
    print(label)
    value = parsed.find_all('dd')[0].text
    return value if label == 'Alternate for' else None
def convierte_excel(df):
    """Serialize DataFrame *df* to an in-memory .xlsx file.

    Writes *df* (without the index) to a sheet named 'data_extraida',
    applying a two-decimal number format to column A, and returns the
    finished workbook as raw bytes suitable for a download button.
    """
    output = BytesIO()
    # Context manager finalizes the workbook on exit.  The previous
    # writer.save() call was deprecated and removed in pandas 2.0, and the
    # code additionally called close() after save() — both replaced here.
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='data_extraida')
        workbook = writer.book
        worksheet = writer.sheets['data_extraida']
        format1 = workbook.add_format({'num_format': '0.00'})
        worksheet.set_column('A:A', None, format1)
    return output.getvalue()