File size: 1,254 Bytes
f8883b1
 
 
 
 
 
 
 
e588742
f8883b1
 
 
e588742
 
f8883b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f377843
 
 
 
 
 
 
 
f8883b1
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from pathlib import Path

import PyPDF2
import pytesseract
import streamlit as st
from pdf2image import convert_from_bytes, convert_from_path
from PIL import Image

def pdf_to_text(pdf_file):
    """Extract the text of a PDF, falling back to OCR when it has no text layer.

    The embedded text is read first with PyPDF2. If that yields nothing but
    whitespace (typical for scanned documents), each page is rendered to an
    image with pdf2image and passed through Tesseract instead.

    Args:
        pdf_file: Either a filesystem path to the PDF or a seekable binary
            file-like object (for example the object returned by
            ``st.file_uploader``).

    Returns:
        The extracted text as one string. It is empty when neither the text
        layer nor OCR produced anything.
    """
    reader = PyPDF2.PdfReader(pdf_file)

    # ``or ""`` keeps the join safe should a page yield None instead of a str.
    text = "".join(page.extract_text() or "" for page in reader.pages)

    # Whitespace-only output means there is no usable text layer.
    if text.strip():
        return text

    if hasattr(pdf_file, "read"):
        # A stream cannot be handed to convert_from_path (it expects a path);
        # rewind past whatever PdfReader consumed and rasterize from bytes.
        pdf_file.seek(0)
        images = convert_from_bytes(pdf_file.read())
    else:
        images = convert_from_path(pdf_file)

    return "".join(pytesseract.image_to_string(image) for image in images)

def main():
    """Render the Streamlit page: accept a PDF upload and offer its text for download.

    The extracted text is handed to the download button directly from memory,
    so nothing is written to the server's filesystem.
    """
    st.title("PDF Text Extractor")
    st.write("Upload a PDF file to extract the text")

    pdf_file = st.file_uploader("Upload PDF file", type=["pdf"])
    if pdf_file is None:
        return

    text = pdf_to_text(pdf_file)

    # Swap only the final extension (whatever its case) and drop any directory
    # components from the client-supplied name.
    txt_name = Path(pdf_file.name).with_suffix(".txt").name

    st.download_button(
        label="Download Extracted Text",
        # Encode explicitly so the download does not depend on the host locale.
        data=text.encode("utf-8"),
        file_name=txt_name,
        mime="text/plain",
    )

# Launch the app only when this file is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()