import streamlit as st import pdfplumber import docx import pylatexenc class docLoader(): def __init__(self): pass def load(self, uploaded_file): if uploaded_file is not None: st.write("File uploaded successfully!") file_extension = uploaded_file.name.split(".")[-1] load_functions = { "pdf": self.load_pdf, "txt": self.load_txt, "docx": self.load_docx, "tex": self.load_tex } if file_extension in load_functions: text = load_functions[file_extension](uploaded_file) st.text_area("Extracted From Document", value=text) else: st.write("Unsupported file format") else: text = '' return text def load_pdf(self, uploaded_file): with pdfplumber.open(uploaded_file) as pdf: pages = pdf.pages text = "" for page in pages: text += page.extract_text() return text def load_txt(self, uploaded_file): return uploaded_file.getvalue().decode("utf-8") def load_docx(self, uploaded_file): docx_text = docx.Document(uploaded_file) full_text = [para.text for para in docx_text.paragraphs] return "\n".join(full_text) def load_tex(self, uploaded_file): with open(uploaded_file.name, 'r') as tex_file: tex_content = tex_file.read() return pylatexenc.latex2text(tex_content) def load_doc(): uploaded_file = st.file_uploader("Choose a document file", type=["pdf", "txt", "docx"]) loader = docLoader() return loader.load(uploaded_file) if __name__ == "__main__": load_doc()