Spaces:

vuvko
/

from_pdf

Sleeping

File size: 3,388 Bytes

77eac00
 
 
 
 
 
 
 
d15e89e
77eac00
 
8ae9c70
77eac00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d15e89e
 
 
 
 
1482457
d15e89e
 
 
 
 
 
77eac00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d5d236
 
23c0f1b
 
 
 
 
77eac00
 
2d5d236
d15e89e
 
e7740fc
 
d15e89e
 
 
 
77eac00
 
e7740fc
 
77eac00
 
 
 
 
2d5d236
77eac00
6095336
77eac00
 
2d5d236
23c0f1b
77eac00
 
 
 
 
d15e89e
77eac00
d15e89e
77eac00

import gradio as gr
from time import time
from pathlib import Path
from gradio_pdf import PDF
from pdf2image import convert_from_path
import shutil
import tempfile
from transformers import pipeline
import subprocess as sp

# Gradio State component initialised with an empty list. It is wired as both
# an input and an output of the converter interface, so the list of
# (creation time, zip path) entries built by process_pdf is carried from one
# call to the next.
out_files = gr.State([])
# Maximum difference, in time() units (seconds), between the newest archive
# and an older one before handle_files deletes the older one's directory.
FILE_TIMEOUT = 10 ** 3
# Maximum number of archives handle_files keeps on disk; the oldest entries
# beyond this count are deleted.
MAX_FILES = 10

# Document question-answering pipeline, created once at import time and
# called for every page image in interact_with_pdf.
p = pipeline(
    "document-question-answering",
    model="impira/layoutlm-document-qa",
)

def _remove_parent_dir(file_path):
    """Delete the directory holding *file_path*; a missing directory is not an error."""
    try:
        shutil.rmtree(file_path.parent)
    except FileNotFoundError:
        # The temporary directory was already cleaned up by someone else;
        # stale bookkeeping must not fail the current request.
        pass


def handle_files(cur_files, file_timeout=None, max_files=None):
    """
    Prune old generated archives from disk and from the bookkeeping list.

    Two rules are applied, in order:

    1. Every entry except the last one whose timestamp differs from the last
       entry's timestamp by more than ``file_timeout`` is dropped.
    2. If more than ``max_files`` entries remain, the oldest ones (front of
       the list) are dropped until only ``max_files`` are left.

    Dropping an entry deletes the parent directory of its file.

    Args:
        cur_files: list of ``(timestamp, pathlib.Path)`` tuples ordered from
            oldest to newest; the last entry is treated as the newest file.
        file_timeout: maximum allowed timestamp difference to the newest
            entry. Defaults to the module-level ``FILE_TIMEOUT``.
        max_files: maximum number of entries to keep. Defaults to the
            module-level ``MAX_FILES``.

    Returns:
        A new list with the surviving entries, in their original order.
        The input list is not modified. An empty input yields ``[]``.
    """
    if not cur_files:
        return []
    if file_timeout is None:
        file_timeout = FILE_TIMEOUT
    if max_files is None:
        max_files = MAX_FILES

    *older_entries, newest_entry = cur_files
    newest_time = newest_entry[0]

    kept = []
    for entry in older_entries:
        entry_time, entry_file = entry
        if abs(newest_time - entry_time) > file_timeout:
            _remove_parent_dir(entry_file)
        else:
            kept.append(entry)
    kept.append(newest_entry)

    # Count the surplus explicitly: slicing with ``[:-max_files]`` silently
    # selects nothing when max_files is 0.
    excess = len(kept) - max_files
    if excess > 0:
        for _, entry_file in kept[:excess]:
            _remove_parent_dir(entry_file)
        kept = kept[excess:]
    return kept


def extract_text(pdf_file):
    """
    Extract the text of a PDF file by running the ``pdftotext`` command.

    ``pdftotext`` is invoked with ``-layout`` and writes to stdout (the ``-``
    output argument). UTF-8 output is requested explicitly and decoded as
    UTF-8, so the result does not depend on the process locale.

    Args:
        pdf_file: path of the PDF file to read. A falsy value (``None`` or
            an empty string, e.g. nothing was uploaded) yields ``""``.

    Returns:
        The complete standard output of ``pdftotext`` as a single string.

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits with a
            non-zero status.
        FileNotFoundError: if the ``pdftotext`` executable is not installed.
    """
    if not pdf_file:
        return ""
    args = ['pdftotext', '-layout', '-enc', 'UTF-8', pdf_file, '-']
    cp = sp.run(
        args,
        stdout=sp.PIPE,
        stderr=sp.DEVNULL,
        check=True,
        encoding='utf-8',
    )
    return cp.stdout

def process_pdf(pdf_file, cur_files):
    """
    Render every page of a PDF to an image and bundle the images in a ZIP.

    The page images are written to a throw-away directory, archived into a
    fresh temporary directory as ``<pdf stem>.zip`` and the throw-away
    directory is removed. The new archive is then recorded in the list of
    generated files, which is pruned with ``handle_files``.

    Args:
        pdf_file: path of the uploaded PDF. A falsy value (nothing uploaded)
            produces no archive.
        cur_files: list of ``(timestamp, pathlib.Path)`` entries for the
            archives generated so far; ``None`` is treated as an empty list.
            The list passed in is not modified.

    Returns:
        A ``(zip_path, files)`` tuple: the archive path as a string (``None``
        when no PDF was given) and the updated list of entries.
    """
    cur_files = list(cur_files) if cur_files else []
    if not pdf_file:
        return None, cur_files

    out_dir = Path(tempfile.mkdtemp())
    try:
        with tempfile.TemporaryDirectory() as pages_dir:
            convert_from_path(pdf_file, output_folder=pages_dir)
            # make_archive appends ".zip" itself and returns the real path.
            # Deriving the name with Path.with_suffix would be wrong for
            # stems that contain a dot ("report.v2" -> "report.zip").
            archive_path = shutil.make_archive(
                str(out_dir / Path(pdf_file).stem), 'zip', pages_dir
            )
    except Exception:
        # Do not leave an orphaned output directory behind on failure.
        shutil.rmtree(out_dir, ignore_errors=True)
        raise

    zip_output = Path(archive_path)
    cur_files.append((time(), zip_output))
    cur_files = handle_files(cur_files)

    return str(zip_output), cur_files


def interact_with_pdf(doc, question):
    """
    Answer a question about a PDF with the document-QA pipeline ``p``.

    Every page of the PDF is rendered to an image, the pipeline is run on
    each page image with the question, and the answer with the highest
    ``score`` across all pages is returned.

    Args:
        doc: path of the PDF file. A falsy value (nothing uploaded) yields
            ``""``.
        question: the question text passed to the pipeline.

    Returns:
        The ``answer`` string of the best-scoring candidate, or ``""`` when
        the PDF produced no pages or the pipeline returned no candidates.
    """
    if not doc:
        return ""

    candidates = []
    # The rendered page files live in a temporary directory, so the pipeline
    # has to run before the directory is removed.
    with tempfile.TemporaryDirectory() as path:
        for page_image in convert_from_path(doc, output_folder=path):
            candidates.extend(p(page_image, question))

    if not candidates:
        return ""
    # max() returns the first best-scoring candidate, the same element a
    # stable descending sort would put first, without sorting everything.
    return max(candidates, key=lambda item: item["score"])['answer']


# Tab 1: takes an uploaded PDF, passes it to extract_text and shows the
# returned text in a textbox.
text_interface = gr.Interface(
    fn=extract_text,
    inputs=PDF(label="Upload PDF"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="PDF extractor",
    description="Extracts text from the PDF container."
)

# Tab 2: takes an uploaded PDF, passes it to process_pdf and offers the
# resulting ZIP for download. out_files appears in both inputs and outputs
# so the list of generated archives returned by process_pdf is fed back into
# the next call.
pdf_interface = gr.Interface(
    fn=process_pdf,
    inputs=[PDF(label="Upload PDF"), out_files],
    outputs=[gr.File(label="Download ZIP"), out_files],
    title="PDF to Image Converter",
    description="Converts PDF pages to images and outputs a ZIP file."
)

# Tab 3: takes an uploaded PDF and a text query, passes both to
# interact_with_pdf and shows the returned answer.
# NOTE(review): the description below speaks of an "uploaded image", but the
# input component is a PDF upload — consider rewording the user-facing text.
image_interface = gr.Interface(
    fn=interact_with_pdf,
    inputs=[
        PDF(label="Upload PDF"),
        gr.Textbox(label="Text Query")
    ],
    outputs=gr.Textbox(label="Possible Answer"),
    title="Ask Your PDF",
    description="Searches for text in the uploaded image based on the provided query."
)

# Combine the three interfaces into a single tabbed app; tab_names are listed
# in the same order as the interfaces.
tabbed_interface = gr.TabbedInterface(
    [text_interface, pdf_interface, image_interface],
    title="PDF interaction",
    tab_names=["Text extractor", "Converter", "Interaction"],
    # description="Choose a tab to perform the desired task."
)

# Launch the app.
# NOTE: this call is at module level without a __main__ guard, so it runs
# whenever the module is executed or imported.
tabbed_interface.launch()