File size: 1,760 Bytes
66e260e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import streamlit as st
import pdfplumber
import docx
import pylatexenc

class docLoader():
    """Extract plain text from an uploaded document and display it in Streamlit.

    The file's extension selects one of the ``load_*`` methods below
    (pdf, txt, docx, tex).
    """

    def __init__(self):
        pass

    def load(self, uploaded_file):
        """Return the text extracted from ``uploaded_file``.

        Args:
            uploaded_file: The uploaded file object, or ``None`` when nothing
                has been uploaded. Must expose a ``name`` attribute; the
                individual loaders additionally use ``getvalue()`` or pass
                the object to the parsing library.

        Returns:
            The extracted text, or ``''`` when no file was given or the
            extension is not supported.
        """
        if uploaded_file is None:
            return ''

        st.write("File uploaded successfully!")

        # Lower-case the extension so "REPORT.PDF" is treated like "report.pdf".
        file_extension = uploaded_file.name.rsplit(".", 1)[-1].lower()

        load_functions = {
            "pdf": self.load_pdf,
            "txt": self.load_txt,
            "docx": self.load_docx,
            "tex": self.load_tex,
        }

        loader = load_functions.get(file_extension)
        if loader is None:
            st.write("Unsupported file format")
            # Return explicitly: there is no extracted text on this path, and
            # falling through to a shared `return text` would hit an unbound name.
            return ''

        text = loader(uploaded_file)
        st.text_area("Extracted From Document", value=text)
        return text

    def load_pdf(self, uploaded_file):
        """Return the concatenated text of every page of a PDF."""
        with pdfplumber.open(uploaded_file) as pdf:
            # extract_text() may yield None for a page without a text layer
            # (e.g. a scanned image); treat that as an empty page.
            return "".join(page.extract_text() or "" for page in pdf.pages)

    def load_txt(self, uploaded_file):
        """Return the uploaded bytes decoded as UTF-8."""
        return uploaded_file.getvalue().decode("utf-8")

    def load_docx(self, uploaded_file):
        """Return the paragraphs of a .docx document joined by newlines."""
        document = docx.Document(uploaded_file)
        return "\n".join(para.text for para in document.paragraphs)

    def load_tex(self, uploaded_file):
        """Return the plain-text rendering of an uploaded LaTeX source file."""
        # `pylatexenc.latex2text` is a submodule, not a function; the
        # converter class lives inside it.
        from pylatexenc.latex2text import LatexNodes2Text

        # Read the uploaded bytes directly (as load_txt does) rather than
        # opening a path named after the upload, which need not exist on disk.
        tex_content = uploaded_file.getvalue().decode("utf-8")
        return LatexNodes2Text().latex_to_text(tex_content)


def load_doc():
    """Show a file uploader and return the text extracted from the chosen file.

    Returns:
        Whatever ``docLoader.load`` returns for the uploader's current value.
    """
    uploaded_file = st.file_uploader(
        "Choose a document file",
        # Must list every extension docLoader.load dispatches on; without
        # "tex" the LaTeX loader is unreachable from this widget.
        type=["pdf", "txt", "docx", "tex"],
    )
    loader = docLoader()
    return loader.load(uploaded_file)

# Run the upload-and-extract flow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    load_doc()