|
import gradio as gr |
|
from time import time |
|
from pathlib import Path |
|
from gradio_pdf import PDF |
|
from pdf2image import convert_from_path |
|
import shutil |
|
import tempfile |
|
from transformers import pipeline |
|
import subprocess as sp |
|
|
|
# Maximum gap, in seconds, between the newest archive's time() stamp and an
# older one's before the older archive is deleted by handle_files().
FILE_TIMEOUT = 1000

# Upper bound on the number of archives handle_files() keeps in a file list.
MAX_FILES = 10

# State component, starting as an empty list, that carries the
# (timestamp, archive path) entries in and out of process_pdf().
out_files = gr.State([])

# Document question-answering pipeline called by interact_with_pdf().
p = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
|
|
|
def _remove_parent_dir(file_path):
    """Delete the directory containing *file_path*, tolerating its absence."""
    try:
        shutil.rmtree(file_path.parent)
    except FileNotFoundError:
        # The directory is already gone, which is the state we want anyway.
        pass


def handle_files(cur_files):
    """
    Prune a list of generated files and delete the pruned ones from disk.

    Each entry is a ``(timestamp, path)`` tuple whose path exposes
    ``.parent`` (process_pdf() stores ``pathlib.Path`` objects). The last
    entry is treated as the current one and is never expired by age.

    Two rules are applied in order:

    1. Every earlier entry whose timestamp differs from the current entry's
       by more than ``FILE_TIMEOUT`` is dropped.
    2. If more than ``MAX_FILES`` entries remain, only the last
       ``MAX_FILES`` are kept.

    For each dropped entry the whole parent directory of its path is removed.

    Args:
        cur_files: list of ``(timestamp, path)`` tuples; may be empty.

    Returns:
        A new list with the surviving entries in their original order.
    """
    if not cur_files:
        # Nothing to prune; the original indexing would raise IndexError here.
        return []

    newest_entry = cur_files[-1]
    newest_time = newest_entry[0]

    kept = []
    for entry in cur_files[:-1]:
        created, file_path = entry
        if abs(newest_time - created) > FILE_TIMEOUT:
            _remove_parent_dir(file_path)
        else:
            kept.append(entry)
    kept.append(newest_entry)

    if len(kept) > MAX_FILES:
        for _, file_path in kept[:-MAX_FILES]:
            _remove_parent_dir(file_path)
        kept = kept[-MAX_FILES:]

    return kept
|
|
|
|
|
def extract_text(pdf_file):
    """
    Return the text rendering of a PDF file as a single string.

    The external ``pdftotext`` program is invoked with the ``-layout`` flag
    and ``-`` as its output argument; whatever it writes to standard output
    is captured and returned. Its standard error is discarded.

    Args:
        pdf_file: path of the PDF to read. A falsy value (``None`` or an
            empty string, e.g. when nothing was uploaded) yields ``''``
            without running the tool.

    Returns:
        The captured standard output of ``pdftotext`` as ``str``.

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits with a
            non-zero status (``check=True``).
    """
    if not pdf_file:
        # Avoid passing None into the argument list, which raises TypeError.
        return ''

    args = ['pdftotext', '-layout', pdf_file, '-']
    cp = sp.run(
        args,
        stdout=sp.PIPE,
        stderr=sp.DEVNULL,
        check=True,
        text=True,
    )
    return cp.stdout
|
|
|
|
|
def process_pdf(pdf_file, cur_files):
    """
    Convert a PDF's pages to images and bundle them into a ZIP archive.

    The pages are rendered by ``convert_from_path`` into a scratch directory
    that is removed afterwards. The archive is written into its own freshly
    created temporary directory and named after the PDF's stem. The new
    archive is then recorded in ``cur_files`` and the list is pruned with
    handle_files().

    Args:
        pdf_file: path of the PDF to convert.
        cur_files: list of ``(timestamp, pathlib.Path)`` entries for
            previously generated archives; the new entry is appended to it.

    Returns:
        A ``(archive_path, cur_files)`` tuple: the archive path as ``str``
        and the pruned file list.
    """
    # Resolve the name first so a bad path fails before any directory exists.
    stem = Path(pdf_file).stem

    # Dedicated directory for the archive; handle_files() later removes it
    # as a whole through the archive's parent.
    output_dir = Path(tempfile.mkdtemp())
    try:
        with tempfile.TemporaryDirectory() as pages_dir:
            convert_from_path(pdf_file, output_folder=str(pages_dir))
            # Build the archive while the page directory still exists, and
            # use the name make_archive() reports: with_suffix('.zip') would
            # replace part of a stem containing a dot ('report.v2' ->
            # 'report.zip') and point at a file that was never written.
            archive_name = shutil.make_archive(
                str(output_dir / stem), 'zip', pages_dir
            )
    except Exception:
        # Do not leave an orphaned output directory behind on failure.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise

    zip_output = Path(archive_name)

    cur_time = time()
    cur_files.append((cur_time, zip_output))
    cur_files = handle_files(cur_files)

    return str(zip_output), cur_files
|
|
|
|
|
def interact_with_pdf(doc, question):
    """
    Answer a question about a PDF with the document-QA pipeline.

    Every page of the PDF is rendered to an image with
    ``convert_from_path`` and passed, together with the question, to the
    module-level pipeline ``p``. The answers from all pages are pooled and
    the one with the highest ``score`` wins.

    Args:
        doc: path of the PDF to query.
        question: the question text handed to the pipeline.

    Returns:
        The ``answer`` field of the highest-scoring result, or ``''`` when
        no page produced any result.
    """
    candidates = []
    with tempfile.TemporaryDirectory() as path:
        # Run the pipeline inside the block so the rendered pages are still
        # on disk, in case the images are backed by files in this folder.
        for page in convert_from_path(doc, output_folder=path):
            candidates.extend(p(page, question))

    if not candidates:
        # Indexing an empty result list would raise IndexError.
        return ''

    # max() returns the first top-scoring entry, as the sorted()[0] form did.
    best = max(candidates, key=lambda item: item["score"])
    return best['answer']
|
|
|
|
|
# Text-extraction interface: passes an uploaded PDF to extract_text() and
# shows the returned text in a textbox.
text_interface = gr.Interface(
    title="PDF extractor",
    description="Extracts text from the PDF container.",
    fn=extract_text,
    inputs=PDF(label="Upload PDF"),
    outputs=gr.Textbox(label="Extracted Text"),
)
|
|
|
# Converter interface: passes an uploaded PDF and the out_files state to
# process_pdf(), which returns the ZIP to download and the updated state.
pdf_interface = gr.Interface(
    title="PDF to Image Converter",
    description="Converts PDF pages to images and outputs a ZIP file.",
    fn=process_pdf,
    inputs=[PDF(label="Upload PDF"), out_files],
    outputs=[gr.File(label="Download ZIP"), out_files],
)
|
|
|
# Question-answering interface: passes an uploaded PDF and a text query to
# interact_with_pdf() and shows the returned answer in a textbox.
image_interface = gr.Interface(
    title="Ask Your PDF",
    description="Searches for text in the uploaded image based on the provided query.",
    fn=interact_with_pdf,
    inputs=[PDF(label="Upload PDF"), gr.Textbox(label="Text Query")],
    outputs=gr.Textbox(label="Possible Answer"),
)
|
|
|
|
|
# Combine the three interfaces into one tabbed app; tab_names lists the tab
# labels in the same order as the interfaces.
tabbed_interface = gr.TabbedInterface(
    [text_interface, pdf_interface, image_interface],
    tab_names=["Text extractor", "Converter", "Interaction"],
    title="PDF interaction",
)

# Start the app as soon as this module is executed.
tabbed_interface.launch()
|
|