Spaces:

vuvko
/

from_pdf

Sleeping

from_pdf / app.py

Andrei Shadrikov

fix

2d5d236 over 1 year ago

3.39 kB

	import gradio as gr
	from time import time
	from pathlib import Path
	from gradio_pdf import PDF
	from pdf2image import convert_from_path
	import shutil
	import tempfile
	from transformers import pipeline
	import subprocess as sp

	# Maximum gap, in seconds of time() difference, between an archive and the
	# newest one before the older archive is purged by handle_files.
	FILE_TIMEOUT = 1_000
	# Upper bound on the number of archives handle_files keeps in the list.
	MAX_FILES = 10

	# State component carrying the list of (timestamp, archive path) entries;
	# it is wired as both an input and an output of the converter interface.
	out_files = gr.State([])

	# Document question-answering pipeline used by interact_with_pdf.
	p = pipeline(
	    "document-question-answering",
	    model="impira/layoutlm-document-qa",
	)

	def handle_files(cur_files, timeout=None, max_files=None):
	    """
	    Prune expired and surplus archives from a list of tracked files.

	    The last entry of ``cur_files`` is treated as the current archive and is
	    never removed by the expiry pass. Every earlier entry whose timestamp
	    differs from the current one by more than ``timeout`` has its parent
	    directory deleted and is dropped from the list. If more than
	    ``max_files`` entries remain afterwards, the leading (oldest-appended)
	    ones are deleted as well.

	    Args:
	        cur_files: list of ``(timestamp, path)`` pairs, where ``path`` is a
	            ``pathlib.Path`` whose parent directory belongs to that entry.
	        timeout: maximum allowed timestamp difference; defaults to the
	            module-level ``FILE_TIMEOUT``.
	        max_files: maximum number of entries to keep; defaults to the
	            module-level ``MAX_FILES``.

	    Returns:
	        A new list with the surviving entries in their original order.
	        An empty input yields an empty list.
	    """
	    if not cur_files:
	        return []
	    # Resolve the defaults lazily so existing single-argument callers keep
	    # using the module constants.
	    if timeout is None:
	        timeout = FILE_TIMEOUT
	    if max_files is None:
	        max_files = MAX_FILES

	    newest_time = cur_files[-1][0]
	    kept = []
	    for entry in cur_files[:-1]:
	        entry_time, entry_path = entry
	        if abs(newest_time - entry_time) > timeout:
	            # Cleanup is best-effort: a directory that is already gone must
	            # not fail the request that triggered the pruning.
	            shutil.rmtree(entry_path.parent, ignore_errors=True)
	        else:
	            kept.append(entry)
	    kept.append(cur_files[-1])

	    if len(kept) > max_files:
	        for _, entry_path in kept[:-max_files]:
	            shutil.rmtree(entry_path.parent, ignore_errors=True)
	        kept = kept[-max_files:]
	    return kept


	def extract_text(pdf_file):
	    """
	    Return a text rendering of a PDF file as a single string.

	    Runs the external ``pdftotext`` command with ``-layout`` and ``-`` as
	    the output target, and returns whatever it writes to stdout. The
	    command's stderr is discarded.

	    Args:
	        pdf_file: path of the PDF to read, or ``None`` when no file was
	            supplied.

	    Returns:
	        The captured stdout text, or an empty string when ``pdf_file`` is
	        ``None``.

	    Raises:
	        subprocess.CalledProcessError: if ``pdftotext`` exits with a
	            non-zero status.
	    """
	    # Nothing uploaded: return empty text instead of letting subprocess
	    # fail on a None argument.
	    if pdf_file is None:
	        return ""
	    args = ['pdftotext', '-layout', pdf_file, '-']
	    cp = sp.run(
	        args, stdout=sp.PIPE, stderr=sp.DEVNULL,
	        check=True, text=True
	    )
	    return cp.stdout

	def process_pdf(pdf_file, cur_files):
	    """
	    Convert a PDF into page images and pack them into a ZIP archive.

	    The pages are rendered into a throw-away directory, archived into a
	    fresh temporary directory under the PDF's stem name, and the archive is
	    appended to ``cur_files`` with the current time before older archives
	    are pruned via ``handle_files``.

	    Args:
	        pdf_file: path of the uploaded PDF, or ``None`` when nothing was
	            uploaded.
	        cur_files: list of ``(timestamp, archive path)`` entries; the new
	            archive is appended to it.

	    Returns:
	        ``(archive path as str, updated file list)``, or
	        ``(None, cur_files)`` unchanged when ``pdf_file`` is ``None``.
	    """
	    if pdf_file is None:
	        return None, cur_files

	    archive_dir = Path(tempfile.mkdtemp())
	    try:
	        with tempfile.TemporaryDirectory() as pages_dir:
	            convert_from_path(pdf_file, output_folder=str(pages_dir))
	            # Use the name make_archive reports instead of rebuilding it
	            # with with_suffix(): for a stem containing a dot (e.g.
	            # "report.v2") with_suffix('.zip') would point at "report.zip",
	            # which is not the file that was written.
	            archive_path = Path(shutil.make_archive(
	                str(archive_dir / Path(pdf_file).stem), 'zip', pages_dir
	            ))
	    except Exception:
	        # The directory is not tracked in cur_files yet, so nothing else
	        # would ever remove it if conversion or archiving fails.
	        shutil.rmtree(archive_dir, ignore_errors=True)
	        raise

	    cur_files.append((time(), archive_path))
	    cur_files = handle_files(cur_files)

	    return str(archive_path), cur_files


	def interact_with_pdf(doc, question):
	    """
	    Answer a question about a PDF using the document-QA pipeline ``p``.

	    Each page is rendered to an image, the pipeline is queried once per
	    page, and the answer with the highest ``score`` across all pages is
	    returned.

	    Args:
	        doc: path of the PDF, or ``None`` when nothing was uploaded.
	        question: the query passed to the pipeline for every page.

	    Returns:
	        The ``'answer'`` field of the best-scoring candidate, or an empty
	        string when there is no document or no candidate was produced.
	    """
	    if doc is None:
	        return ""

	    with tempfile.TemporaryDirectory() as path:
	        images = convert_from_path(doc, output_folder=path)
	        candidates = []
	        # Query inside the with-block so the rendered page files still
	        # exist while the pipeline reads the images.
	        for img in images:
	            candidates.extend(p(img, question))

	    # No pages or no answers: avoid indexing into an empty result.
	    if not candidates:
	        return ""
	    # max() returns the first best-scoring item, the same element the
	    # previous descending stable sort put at index 0.
	    return max(candidates, key=lambda item: item["score"])['answer']


	# Tab 1: feed an uploaded PDF to extract_text and show the result in a
	# text box.
	text_interface = gr.Interface(
	    title="PDF extractor",
	    description="Extracts text from the PDF container.",
	    fn=extract_text,
	    inputs=PDF(label="Upload PDF"),
	    outputs=gr.Textbox(label="Extracted Text"),
	)

	# Tab 2: run process_pdf on an uploaded PDF and offer the resulting ZIP
	# for download. out_files is passed in and back out so the list of
	# generated archives is carried between calls.
	pdf_interface = gr.Interface(
	    title="PDF to Image Converter",
	    description="Converts PDF pages to images and outputs a ZIP file.",
	    fn=process_pdf,
	    inputs=[PDF(label="Upload PDF"), out_files],
	    outputs=[gr.File(label="Download ZIP"), out_files],
	)

	# Tab 3: pass an uploaded PDF and a free-text query to interact_with_pdf
	# and display the answer it returns.
	image_interface = gr.Interface(
	    title="Ask Your PDF",
	    description="Searches for text in the uploaded image based on the provided query.",
	    fn=interact_with_pdf,
	    inputs=[PDF(label="Upload PDF"), gr.Textbox(label="Text Query")],
	    outputs=gr.Textbox(label="Possible Answer"),
	)

	# Combine the three interfaces into one app; tab_names are given in the
	# same order as the interface list.
	tabbed_interface = gr.TabbedInterface(
	    [text_interface, pdf_interface, image_interface],
	    tab_names=["Text extractor", "Converter", "Interaction"],
	    title="PDF interaction",
	)

	# Start serving the app.
	tabbed_interface.launch()