File size: 2,069 Bytes
4b0678e
 
c982cf8
5f1077a
 
 
 
4b0678e
c982cf8
 
 
39f86d4
5f1077a
c982cf8
 
 
 
0bce450
 
 
5f1077a
0bce450
 
5f1077a
0bce450
5f1077a
c982cf8
 
 
5f1077a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import gradio as gr
from pdf2image import convert_from_path
import pdfplumber
from docx import Document
import subprocess
import os


def convert_pdf_to_image(file):
    """Render an uploaded PDF through ``pdf2image.convert_from_path``.

    Args:
        file: The uploaded PDF, handed unchanged to ``convert_from_path``.

    Returns:
        Whatever ``convert_from_path`` produces for ``file`` (presumably one
        image per page -- confirm against the pdf2image documentation).
    """
    return convert_from_path(file)


def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: The uploaded PDF, handed unchanged to ``pdfplumber.open``.

    Returns:
        The text of all pages concatenated in page order, with a newline
        appended after each page. A page for which pdfplumber reports no
        text contributes only that newline.
    """
    with pdfplumber.open(file) as pdf:
        # extract_text() may return None for a page without any text (older
        # pdfplumber releases); fall back to "" so the concatenation does not
        # raise TypeError on scanned or blank pages.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )


def extract_text_from_docx(file):
    """Collect the paragraph text of a ``.docx`` upload.

    Args:
        file: Upload object whose ``name`` attribute is passed to
            ``docx.Document``.

    Returns:
        Every paragraph's text in document order, each followed by a newline.
    """
    document = Document(file.name)
    return "".join(para.text + "\n" for para in document.paragraphs)


def convert_doc_to_text(doc_path):
    """Convert a document to plain text by shelling out to ``unoconv``.

    Runs ``unoconv --format txt`` on ``doc_path``, reads the ``.txt`` file
    expected next to the input (same path with its final extension swapped),
    strips any leading byte-order marks and deletes the intermediate file.

    Args:
        doc_path: Filesystem path of the document to convert.

    Returns:
        The extracted text, or ``""`` when ``unoconv`` exits with a non-zero
        status (the error is printed).

    Raises:
        OSError: If ``unoconv`` cannot be started or the expected ``.txt``
            file cannot be read.
    """
    # Keep the try body down to the one call whose failure is handled here.
    try:
        subprocess.run(
            ["unoconv", "--format", "txt", doc_path],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error converting {doc_path} to text: {e}")
        return ""

    # Swap only the final extension. A plain str.replace(".doc", ".txt")
    # would also rewrite ".doc" occurring elsewhere in the path (for example
    # a directory called "my.docs"), pointing at a file that does not exist.
    txt_file_path = os.path.splitext(doc_path)[0] + ".txt"
    try:
        # Opened with the default text encoding, as before.
        with open(txt_file_path, "r") as f:
            text = f.read()
    finally:
        # Clean up the intermediate file even if reading failed, but never
        # delete the input itself when both paths happen to coincide.
        if txt_file_path != doc_path and os.path.exists(txt_file_path):
            os.remove(txt_file_path)
    return text.lstrip("\ufeff")


def extract_text_from_doc_or_docx(file):
    """Pick the text extractor matching the upload's file extension.

    Args:
        file: Upload object exposing the file path as ``file.name``.

    Returns:
        The result of ``extract_text_from_docx`` for names ending in
        ``.docx``, the result of ``convert_doc_to_text`` for names ending in
        ``.doc``, and otherwise a message saying the type is unsupported.
    """
    filename = file.name
    # ".docx" is tested first; a ".docx" name can never end in ".doc".
    if filename.endswith(".docx"):
        return extract_text_from_docx(file)
    if filename.endswith(".doc"):
        return convert_doc_to_text(filename)
    return "Unsupported file type. Please upload a .doc or .docx file."


# One Gradio interface per conversion: a file upload in, the converted
# result out. Each interface is registered under its own api_name.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)

pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

doc_or_docx_to_text = gr.Interface(
    fn=extract_text_from_doc_or_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(
        placeholder="Extracted text from DOC or DOCX will appear here"
    ),
    api_name="doc_or_docx_to_text",
)

# Group the three interfaces as tabs; tab titles follow the list order.
demo = gr.TabbedInterface(
    interface_list=[pdf_to_img, pdf_to_text, doc_or_docx_to_text],
    tab_names=["PDF to Image", "Extract PDF Text", "Extract DOC/DOCX Text"],
)

# Start the app as soon as this module is executed.
demo.launch(debug=True)