Spaces (status: Sleeping)
jcmachicao committed
Commit b6c448a • 1 Parent(s): 9973f7b
Upload 6 files
Browse files:
- app.py +81 -0
- codigos_prueba.xlsx +0 -0
- encopartslogo.jpg +0 -0
- funcs.py +94 -0
- gdmklogo.png +0 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-

import streamlit as st
import pandas as pd
import base64
from pyxlsb import open_workbook as open_xlsb
from datetime import datetime
from funcs import extrae_dato_web, extrae_web, extrae_alternate, convierte_excel
import bs4 as BeautifulSoup
import urllib.request
from urllib.request import urlopen, Request
import re

# Header: company logo in the right-hand column
c1, c2 = st.columns([6, 6])
with c2:
    st.image('encopartslogo.jpg', width=300, caption='https://encoparts.com/')

rutas_websearch = ['https://en.hespareparts.com/search/?search=', 'https://offroadeq.com/parts-search/']
st.title('Generación de Tablas de Datos de Extracción')
st.subheader('Carga de Datos')
selec = st.radio('Seleccione: ', [None, 'Carga por Texto con Comas', 'Carga por Archivo Excel'])
items = None

if selec is None:
    st.write('Por favor seleccione una opción válida de carga.')
else:
    if selec == 'Carga por Texto con Comas' and items is None:
        st.write(selec)
        codigos = st.text_input('Escriba o pegue aquí el texto separando los códigos por comas: ')
        if st.button('Proceder'):
            items = list(codigos.split(','))
    else:
        # 'Carga por Archivo Excel': part codes are read from the first column
        st.write(selec)
        file = st.file_uploader('Seleccione un archivo: ')
        if file is not None:
            codigosf = pd.read_excel(file)
            st.write('Filas, Columnas de Data de Prueba: ', codigosf.shape)
            namcol = codigosf.columns[0]
            items = pd.Series(codigosf[namcol]).astype(str)

if selec is not None and items is not None:

    st.write(items)

    datos_tot = []
    st.write('Por favor espere mientras se extraen los datos...')
    for it in items:
        extrae_med = extrae_web(it)        # description, dimensions and volume
        extrae_dat = extrae_dato_web(it)   # alternate, price, weight, description
        itxx = it[:-4] + '-' + it[-4:]     # reinsert the hyphen before the last four characters
        datos = [it, itxx] + list(extrae_med) + list(extrae_dat)
        datos_tot.append(datos)

    dtdf = pd.DataFrame(datos_tot)
    dtdf.columns = ['part_no_', 'part_no',
                    'descrip_en', 'length_m', 'width_m', 'height_m', 'vol_m3', 'compatible',
                    'alternate', 'precio_bm_us', 'peso_lb', 'descr']
    now = datetime.now()
    date_time = now.strftime("%m/%d/%Y, %H:%M:%S").replace('/', '_').replace(':', '_').replace(', ', '_')
    dtdf['peso_kg'] = dtdf.peso_lb * 0.453592   # pounds to kilograms

    dtdf2 = dtdf[['part_no_', 'part_no', 'descr', 'length_m', 'width_m', 'height_m', 'vol_m3', 'peso_kg', 'precio_bm_us', 'alternate', 'compatible']]

    df_xlsx = convierte_excel(dtdf2)
    st.download_button(label='📩 Descargar XLSX', data=df_xlsx,
                       file_name='df_' + date_time + '.xlsx')

    csv = dtdf2.to_csv(index=False)
    st.download_button(label='📩 Descargar CSV', data=csv,
                       file_name='df_' + date_time + '.csv')

else:
    st.write('Cuando seleccione la opción, por favor cargue datos y proceda.')

c1, c2, c3 = st.columns([4, 4, 4])
with c3:
    st.image('gdmklogo.png', width=100, caption='Diseñado por GestioDinámica 2022')
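The part-number formatting and the timestamped filename above are easy to misread, so here is a minimal sketch of both transformations, using a hypothetical part code and a fixed example instant:

from datetime import datetime

it = '1R0750'                           # hypothetical part code, assumed format
itxx = it[:-4] + '-' + it[-4:]          # -> '1R-0750'
now = datetime(2022, 6, 28, 14, 30, 0)  # fixed instant for illustration
stamp = now.strftime("%m/%d/%Y, %H:%M:%S").replace('/', '_').replace(':', '_').replace(', ', '_')
print(itxx, 'df_' + stamp + '.xlsx')    # -> 1R-0750 df_06_28_2022_14_30_00.xlsx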
codigos_prueba.xlsx
ADDED
Binary file (8.6 kB)
encopartslogo.jpg
ADDED
funcs.py
ADDED
@@ -0,0 +1,94 @@
import bs4 as BeautifulSoup
import pandas as pd
import urllib.request
from io import BytesIO
from pyxlsb import open_workbook as open_xlsb
from urllib.request import urlopen, Request
import re

rutas_websearch = ['https://en.hespareparts.com/search/?search=', 'https://offroadeq.com/parts-search/']

def extrae_dato_web(idx):
    datx = []
    idxx = str(idx).replace('-', '').replace(' ', '')
    urlg = rutas_websearch[1] + idxx + '/'
    htmlg = urlopen(urlg).read()
    soup = BeautifulSoup.BeautifulSoup(htmlg, 'html.parser')
    lista0 = soup.find_all('h2')[0]
    lista1 = soup.find_all('dt')
    lista2 = soup.find_all('dd')
    if len(lista1) < 3:
        # no 'Alternate for' entry on the page: pad so the output keeps a fixed shape
        lista1 = ['Alt NA'] + lista1
        lista2 = ['Alternate NA'] + lista2
    for i, j in zip(lista1, lista2):
        try:
            datx.append(float(j.text.replace('lbs', '').replace('$', '')))
        except Exception:
            datx.append(j)
    datx.append(lista0.text.split('-')[1:][0])
    return datx

def extrae_web(idx):
    idxx = str(idx).replace('-', '').replace(' ', '')
    urlz = rutas_websearch[0] + idxx + '/'

    try:
        htmlz = urlopen(urlz).read()
        soup = BeautifulSoup.BeautifulSoup(htmlz, 'html.parser')
        lista = soup.find_all('a', {'class': 'link-dark'})
        ls = lista[0]
        page = urlopen(ls['href'])   # follow the first search result
        html = page.read()
        soup = BeautifulSoup.BeautifulSoup(html, 'html.parser')
        gg = soup.find_all('h1')
        print(gg)   # debug output

        dd = []
        for typex in ['depth', 'width', 'height']:
            try:
                aa = soup.find_all('span', {'itemprop': typex})[0].text
                bb = re.findall('[0-9.]+', aa)
            except Exception:
                bb = [-1.0]
            dd.append(float(bb[0]) / 1000)   # mm -> m

        cc = soup.find_all('div', {'itemprop': 'description'})[0].text
        cc1 = cc.replace('\r', '').replace('\n', ' ')

        ggtx = gg[0].text
        posx = ggtx.find(' - ')
        ggx = ggtx[posx + 3:]
        vol = dd[0] * dd[1] * dd[2]
        dd0, dd1, dd2 = dd[0], dd[1], dd[2]

    except Exception:
        ggx, dd0, dd1, dd2, vol, cc1 = 'Not Available', -1.0, -1.0, -1.0, -1.0, 'NA'

    return ggx, dd0, dd1, dd2, vol, cc1

def extrae_alternate(idx):
    idxx = str(idx).replace('-', '').replace(' ', '')
    urlg = rutas_websearch[1] + idxx + '/'
    htmlg = urlopen(urlg).read()
    soup = BeautifulSoup.BeautifulSoup(htmlg, 'html.parser')
    dt1 = soup.find_all('dt')[0].text
    print(dt1)   # debug output
    dt2 = soup.find_all('dd')[0].text
    if dt1 == 'Alternate for':
        return dt2
    return None

def convierte_excel(df):
    output = BytesIO()
    writer = pd.ExcelWriter(output, engine='xlsxwriter')
    df.to_excel(writer, index=False, sheet_name='data_extraida')
    workbook = writer.book
    worksheet = writer.sheets['data_extraida']
    format1 = workbook.add_format({'num_format': '0.00'})
    worksheet.set_column('A:A', None, format1)
    writer.close()   # close() finalizes and saves the workbook; a separate save() call is not needed
    processed_data = output.getvalue()
    return processed_data
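A minimal usage sketch for these helpers outside Streamlit (the part number is hypothetical, and both lookups hit live sites, so results will vary):

import pandas as pd
from funcs import extrae_web, extrae_dato_web, convierte_excel

it = '1R-0750'                                           # hypothetical part code
descr, largo, ancho, alto, vol, texto = extrae_web(it)   # description plus dimensions in metres
extras = extrae_dato_web(it)                             # alternate, price, weight, page description
df = pd.DataFrame([[it, descr, largo, ancho, alto, vol]],
                  columns=['part_no', 'descrip_en', 'length_m', 'width_m', 'height_m', 'vol_m3'])
xlsx_bytes = convierte_excel(df)                         # bytes ready for st.download_button or open('f.xlsx', 'wb')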
gdmklogo.png
ADDED
requirements.txt
ADDED
@@ -0,0 +1,4 @@
beautifulsoup4==4.11.1
pandas==1.3.4
openpyxl==3.0.10
pyxlsb==1.0.9
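Note that app.py imports streamlit and convierte_excel selects the xlsxwriter engine, yet neither package is pinned here. On Hugging Face Spaces the Streamlit SDK image supplies streamlit itself, but xlsxwriter most likely needs an explicit entry, e.g.:

xlsxwriter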