Spaces:
Sleeping
Sleeping
File size: 1,760 Bytes
66e260e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import streamlit as st
import pdfplumber
import docx
import pylatexenc
class docLoader():
    """Extract plain text from a document uploaded through Streamlit.

    Supported extensions are ``pdf``, ``txt``, ``docx`` and ``tex``; each one
    is handled by the matching ``load_<ext>`` method.
    """

    def load(self, uploaded_file):
        """Extract the text of *uploaded_file* and show it in the page.

        Args:
            uploaded_file: The object returned by ``st.file_uploader``, or
                ``None`` when nothing has been uploaded yet.

        Returns:
            The extracted text, or ``''`` when there is no upload or the
            file extension is not supported.
        """
        if uploaded_file is None:
            return ''

        st.write("File uploaded successfully!")

        # Lower-case the extension so "REPORT.PDF" dispatches like "report.pdf".
        file_extension = uploaded_file.name.rsplit(".", 1)[-1].lower()
        load_functions = {
            "pdf": self.load_pdf,
            "txt": self.load_txt,
            "docx": self.load_docx,
            "tex": self.load_tex,
        }

        loader = load_functions.get(file_extension)
        if loader is None:
            st.write("Unsupported file format")
            # Return an empty string so callers always receive a str.
            return ''

        text = loader(uploaded_file)
        st.text_area("Extracted From Document", value=text)
        return text

    def load_pdf(self, uploaded_file):
        """Return the concatenated text of every page of a PDF upload."""
        with pdfplumber.open(uploaded_file) as pdf:
            # extract_text() can yield None for a page without a text layer
            # (e.g. a scanned image); treat such pages as empty.
            return "".join(page.extract_text() or "" for page in pdf.pages)

    def load_txt(self, uploaded_file):
        """Return the content of a plain-text upload decoded as UTF-8."""
        # "utf-8-sig" decodes plain UTF-8 and also strips a leading BOM.
        return uploaded_file.getvalue().decode("utf-8-sig")

    def load_docx(self, uploaded_file):
        """Return the paragraphs of a .docx upload joined by newlines."""
        document = docx.Document(uploaded_file)
        return "\n".join(para.text for para in document.paragraphs)

    def load_tex(self, uploaded_file):
        """Return the text of a LaTeX upload with the markup converted away."""
        # Imported here because the module only does `import pylatexenc`,
        # which does not expose the converter class directly.
        from pylatexenc.latex2text import LatexNodes2Text

        # The upload lives in memory, not on disk, so read its bytes directly
        # instead of opening `uploaded_file.name` as a local path.
        tex_content = uploaded_file.getvalue().decode("utf-8-sig")
        return LatexNodes2Text().latex_to_text(tex_content)
def load_doc():
    """Render the document uploader and return the uploaded file's text.

    Returns:
        Whatever ``docLoader.load`` returns for the current upload.
    """
    # "tex" must be listed here, otherwise the uploader widget rejects .tex
    # files and docLoader's LaTeX branch can never be reached.
    uploaded_file = st.file_uploader(
        "Choose a document file",
        type=["pdf", "txt", "docx", "tex"],
    )
    loader = docLoader()
    return loader.load(uploaded_file)
# Script entry point: build the uploader page when run directly.
if __name__ == "__main__":
    load_doc()