File size: 2,069 Bytes
4b0678e c982cf8 5f1077a 4b0678e c982cf8 39f86d4 5f1077a c982cf8 0bce450 5f1077a 0bce450 5f1077a 0bce450 5f1077a c982cf8 5f1077a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import gradio as gr
from pdf2image import convert_from_path
import pdfplumber
from docx import Document
import subprocess
import os
def convert_pdf_to_image(file):
    """Render an uploaded PDF through pdf2image.

    Args:
        file: The PDF to render; handed unchanged to ``convert_from_path``.

    Returns:
        The value produced by ``convert_from_path`` (presumably one image
        per page -- confirm against the pdf2image documentation).
    """
    return convert_from_path(file)
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: The PDF to read; handed unchanged to ``pdfplumber.open``.

    Returns:
        The text of all pages in order, each page followed by a newline.
        Pages that yield no text contribute only the newline.
    """
    with pdfplumber.open(file) as pdf:
        # extract_text() may return None for a page with no text layer
        # (e.g. a scanned image) in some pdfplumber releases; treating it
        # as "" avoids a TypeError from `None + "\n"`.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )
def extract_text_from_docx(file):
    """Return the paragraph text of a .docx upload.

    Args:
        file: Object whose ``name`` attribute is the path of the .docx file.

    Returns:
        Every paragraph's text in document order, each followed by a newline.
    """
    paragraphs = Document(file.name).paragraphs
    return "".join(f"{para.text}\n" for para in paragraphs)
def convert_doc_to_text(doc_path):
    """Convert a legacy .doc file to plain text by shelling out to unoconv.

    unoconv is asked to write a ``.txt`` file; that file is read, deleted,
    and its content returned.

    Args:
        doc_path: Path of the .doc file to convert.

    Returns:
        The converted text with any leading byte-order mark removed, or an
        empty string when unoconv exits with a non-zero status.
    """
    try:
        subprocess.run(
            ["unoconv", "--format", "txt", doc_path],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error converting {doc_path} to text: {e}")
        return ""

    # Swap only the final extension. A plain str.replace(".doc", ".txt")
    # would also rewrite ".doc" inside directory names or earlier in the
    # file name and point at a file that does not exist.
    txt_file_path = os.path.splitext(doc_path)[0] + ".txt"
    try:
        with open(txt_file_path, "r") as f:
            text = f.read()
    finally:
        # Remove the intermediate file even if reading it failed, so
        # converted output never accumulates next to the uploads.
        if os.path.exists(txt_file_path):
            os.remove(txt_file_path)

    # The converted file may start with a UTF-8 byte-order mark.
    return text.lstrip("\ufeff")
def extract_text_from_doc_or_docx(file):
    """Route a Word upload to the extractor matching its extension.

    Args:
        file: Object whose ``name`` attribute is the uploaded file's path.

    Returns:
        The extracted text for ``.docx`` and ``.doc`` files, otherwise a
        message saying the file type is unsupported. The extension check
        is case-sensitive.
    """
    filename = file.name

    # Test ".docx" first so the two formats are told apart explicitly.
    if filename.endswith(".docx"):
        return extract_text_from_docx(file)

    if filename.endswith(".doc"):
        return convert_doc_to_text(filename)

    return "Unsupported file type. Please upload a .doc or .docx file."
# One Gradio interface per conversion; each takes a single uploaded file.

# PDF -> gallery of rendered images.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)

# PDF -> extracted text.
pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

# DOC / DOCX -> extracted text.
doc_or_docx_to_text = gr.Interface(
    fn=extract_text_from_doc_or_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(
        placeholder="Extracted text from DOC or DOCX will appear here"
    ),
    api_name="doc_or_docx_to_text",
)

# Present the three interfaces as tabs; titles are listed in the same
# order as the interfaces they label.
demo = gr.TabbedInterface(
    interface_list=[pdf_to_img, pdf_to_text, doc_or_docx_to_text],
    tab_names=["PDF to Image", "Extract PDF Text", "Extract DOC/DOCX Text"],
)

demo.launch(debug=True)
|