Spaces:

not-lain
/

utils

Sleeping

App Files Files Community

utils / app.py

not-lain

docx and doc

5f1077a about 1 month ago

raw

history blame

2.07 kB

	import gradio as gr
	from pdf2image import convert_from_path
	import pdfplumber
	from docx import Document
	import subprocess
	import os


	def convert_pdf_to_image(file):
	    """Render a PDF as images.

	    Args:
	        file: Passed unchanged to ``pdf2image.convert_from_path``.

	    Returns:
	        Whatever ``convert_from_path`` returns for ``file``.
	    """
	    return convert_from_path(file)


	def extract_text_from_pdf(file):
	    """Extract the text of every page of a PDF.

	    Args:
	        file: Passed unchanged to ``pdfplumber.open``.

	    Returns:
	        str: The text of each page in order, each followed by a newline.
	        A page that yields no text contributes only its newline.
	    """
	    with pdfplumber.open(file) as pdf:
	        # extract_text() may yield None for a page without extractable text
	        # (e.g. a scanned image) in some pdfplumber releases; treat that as
	        # empty text instead of failing on ``None + str``.
	        return "".join(
	            (page.extract_text() or "") + "\n" for page in pdf.pages
	        )


	def extract_text_from_docx(file):
	    """Collect the paragraph text of a .docx document.

	    Args:
	        file: Object whose ``name`` attribute is handed to ``Document``.

	    Returns:
	        str: Every paragraph's text in document order, each followed by a
	        newline.
	    """
	    document = Document(file.name)
	    return "".join(para.text + "\n" for para in document.paragraphs)


	def convert_doc_to_text(doc_path):
	    """Convert a legacy .doc file to plain text using the ``unoconv`` CLI.

	    ``unoconv`` is run with ``--format txt``; the resulting ``.txt`` file is
	    expected next to the input (same path, extension swapped). It is read,
	    stripped of any leading byte-order mark, and then deleted.

	    Args:
	        doc_path: Path of the document to convert.

	    Returns:
	        str: The converted text, or ``""`` if ``unoconv`` could not be run
	        or exited with a non-zero status (the error is printed).
	    """
	    try:
	        subprocess.run(
	            ["unoconv", "--format", "txt", doc_path],
	            capture_output=True,
	            text=True,
	            check=True,
	        )
	    except (subprocess.CalledProcessError, OSError) as e:
	        # OSError covers a missing/non-executable unoconv binary, which
	        # should degrade the same way as a failed conversion.
	        print(f"Error converting {doc_path} to text: {e}")
	        return ""

	    # Swap only the final extension. A plain str.replace(".doc", ".txt")
	    # would also rewrite ".doc" inside directory names, and would leave an
	    # upper-case ".DOC" path unchanged (reading and deleting the input).
	    txt_file_path = os.path.splitext(doc_path)[0] + ".txt"
	    try:
	        with open(txt_file_path, "r", encoding="utf-8") as f:
	            text = f.read()
	    finally:
	        # Always clean up the intermediate file, even if reading failed.
	        try:
	            os.remove(txt_file_path)
	        except FileNotFoundError:
	            # Nothing was produced, so there is nothing to clean up; let
	            # the original error (if any) propagate unmasked.
	            pass
	    return text.lstrip("\ufeff")


	def extract_text_from_doc_or_docx(file):
	    """Route an uploaded Word document to the matching text extractor.

	    Args:
	        file: Object exposing the uploaded file's path as ``name``.

	    Returns:
	        str: Extracted text for ``.docx`` / ``.doc`` names, otherwise a
	        message saying the file type is unsupported. The suffix match is
	        case-sensitive.
	    """
	    filename = file.name
	    if filename.endswith(".docx"):
	        return extract_text_from_docx(file)
	    if filename.endswith(".doc"):
	        return convert_doc_to_text(filename)
	    return "Unsupported file type. Please upload a .doc or .docx file."


	# --- Gradio UI wiring -------------------------------------------------
	# Each Interface pairs one function above with a file-upload input and an
	# output component, and is given an explicit api_name.

	# PDF upload -> gallery of the images returned by convert_pdf_to_image.
	pdf_to_img = gr.Interface(
	convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
	)
	# PDF upload -> textbox holding the text from extract_text_from_pdf.
	pdf_to_text = gr.Interface(
	extract_text_from_pdf,
	gr.File(),
	gr.Textbox(placeholder="Extracted text will appear here"),
	api_name="pdf_to_text",
	)

	# DOC/DOCX upload -> textbox holding the extracted text (or the
	# "Unsupported file type" message for any other extension).
	doc_or_docx_to_text = gr.Interface(
	extract_text_from_doc_or_docx,
	gr.File(),
	gr.Textbox(placeholder="Extracted text from DOC or DOCX will appear here"),
	api_name="doc_or_docx_to_text",
	)

	# Combine the three interfaces into one app; the second list supplies the
	# tab titles in the same order as the interfaces.
	demo = gr.TabbedInterface(
	[pdf_to_img, pdf_to_text, doc_or_docx_to_text],
	["PDF to Image", "Extract PDF Text", "Extract DOC/DOCX Text"],
	)

	# Starts the app as a module-level side effect (runs on import as well).
	# NOTE(review): debug=True is passed through to launch(); confirm its exact
	# effect against the Gradio version in use.
	demo.launch(debug=True)