|
|
|
"""
Created on Sat Feb 19 20:23:31 2022

@author: nperuma
"""
|
|
|
|
import streamlit as st
|
|
import subprocess
|
|
from subprocess import STDOUT, check_call
|
|
import os
|
|
import base64
|
|
import camelot as cam
|
|
|
|
|
|
@st.cache
def gh():
    """Install Ghostscript on the Linux machine using ``apt-get``.

    The install is best-effort: the command's output is discarded and its
    exit status is ignored, so a failed install does not stop the app.
    Ghostscript is presumably needed by Camelot for PDF parsing -- confirm
    against the Camelot installation notes.

    Returns:
        None.
    """
    try:
        # An argument list (no shell) avoids spawning /bin/bash just to run a
        # fixed command, and subprocess.DEVNULL avoids leaking the file
        # handle that an unclosed open(os.devnull, "wb") would leave behind.
        subprocess.run(
            ["apt-get", "install", "-y", "ghostscript"],
            stdin=None,
            stdout=subprocess.DEVNULL,
            stderr=STDOUT,
            check=False,
        )
    except OSError:
        # apt-get could not be started (e.g. not a Debian-based host).
        # Stay best-effort, as a shell would simply have returned non-zero.
        pass
|
|
|
|
# Attempt the Ghostscript install before the rest of the page is built.
gh()

# ---- Page header ----------------------------------------------------------
_LOGO_URL = "https://raw.githubusercontent.com/camelot-dev/camelot/master/docs/_static/camelot.png"

st.title("PDF Table Extractor")
st.subheader("Extract the contents in ease")
st.image(_LOGO_URL, width=150)

# ---- User inputs ----------------------------------------------------------
# The uploader only accepts files with the 'pdf' extension.
input_pdf = st.file_uploader(
    label="upload your pdf here",
    type="pdf",
)

st.markdown("### Page Number")

# Free-text page selector; the initial value shown is 1.
page_number = st.text_input(
    "Enter the page # from where you want to extract the PDF eg: 3",
    value=1,
)
|
|
|
|
|
|
|
|
if input_pdf is not None:
    # Persist the upload to disk because Camelot is handed a file path below.
    # getvalue() returns the entire buffer regardless of the stream's current
    # read position, and the bytes are written as-is (the previous base64
    # encode/decode round trip produced exactly the same bytes).
    with open("input.pdf", "wb") as f:
        f.write(input_pdf.getvalue())

    # The selected value is forwarded to Camelot as the parsing flavor.
    Ddlist_selection = st.selectbox(
        "Does the pdf contain a proper table structure?",
        ['lattice', 'stream'],
    )

    table = cam.read_pdf("input.pdf", pages=page_number, flavor=Ddlist_selection)

    st.markdown("### Number of Tables")
    st.write(table)

    if len(table) > 0:
        # Offer 1-based table numbers 1..len(table). The earlier
        # range(len(table) + 1) also offered 0, which indexed table[-1] and
        # therefore duplicated the last table under two different choices.
        option = st.selectbox(
            label="Select the Table to be displayed",
            options=range(1, len(table) + 1),
        )

        # Convert the 1-based choice back to a 0-based index.
        selected_df = table[int(option) - 1].df

        st.markdown('### Output Table')
        st.dataframe(selected_df)

        @st.cache
        def convert_df(df):
            """Return *df* serialised to CSV and encoded as UTF-8 bytes."""
            return df.to_csv().encode('utf-8')

        csv = convert_df(selected_df)

        st.download_button(
            label="Download data as CSV",
            data=csv,
            file_name='Data_table.csv',
            mime='text/csv',
        )
|
|
|
|
|