DocumentQA / DiT_Extractor /base_utils.py
Epoching's picture
init
c14d9ad
raw
history blame
13.3 kB
# Copyright (c) 2022, Lawrence Livermore National Security, LLC.
# All rights reserved.
# See the top-level LICENSE and NOTICE files for details.
# LLNL-CODE-838964
# SPDX-License-Identifier: Apache-2.0-with-LLVM-exception
from pdfminer.pdfpage import PDFParser
from pdfminer.pdfpage import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal
from pdfminer.layout import LTTextLineHorizontal
from pdfminer.layout import LTChar
from pdfminer.layout import LAParams
from pdfminer.layout import LTRect
from pdfminer.layout import LTFigure
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer import pdfinterp
from collections.abc import Iterable
from collections import Counter
from collections import OrderedDict
import os
# This is used for highlighting in PDFs
from PyPDF2.generic import (
DictionaryObject,
NumberObject,
FloatObject,
NameObject,
TextStringObject,
ArrayObject
)
# Used to extract pages
from PyPDF2 import PdfFileReader, PdfFileWriter
def get_page_sizes(document):
    """
    Collect the media box of every page in a PDF file.

    :param document: string path of the PDF file to read
    :returns: A list with one entry per page, in document order. Each entry
              is the page's ``mediabox``, i.e. the page size as a list of
              4 numbers x0 y0 x1 y1.
    """
    # Build the whole list inside the ``with`` block: create_pages() reads
    # from the file while it is iterated, so the handle must stay open until
    # the last page has been visited, and is closed right after.
    with open(document, 'rb') as pdf_file:
        doc = PDFDocument(PDFParser(pdf_file))
        return [page.mediabox for page in PDFPage.create_pages(doc)]
def get_page_count(document):
    """
    Read the number of pages recorded in a PDF's page tree.

    :param document: open binary file object of the PDF (this is handed
                     straight to PDFParser, so it is not a path)
    :returns: the ``Count`` entry of the catalog's resolved ``Pages`` object
    """
    # Is there a better way of getting the page count than doing this?
    catalog = PDFDocument(PDFParser(document)).catalog
    pages_root = pdfinterp.resolve1(catalog['Pages'])
    return pages_root['Count']
def get_pdf_page_count(filename):
    """
    Return the page count of the PDF stored at ``filename``.

    :param filename: string path of the PDF file
    :returns: whatever get_page_count() reports for the opened file
    """
    with open(filename, 'rb') as pdf_file:
        page_count = get_page_count(pdf_file)
    return page_count
def get_pages(document, page_numbers=None):
    """
    Run PDFMiner layout analysis on pages of an open PDF, one page at a time.

    :param document: open binary file object of the PDF
    :param page_numbers: iterable of zero-based page indexes to process, in
                         any order and with or without duplicates.
                         ``None`` (the default) processes every page.
    :returns: a generator of ``(layout, page_number)`` tuples in document
              order, where ``layout`` is the aggregator's result for the
              page and ``page_number`` is that page's zero-based index.
    """
    wanted = None if page_numbers is None else set(page_numbers)
    if wanted is not None and not wanted:
        # An explicit but empty selection yields nothing.
        return

    # Create resource manager
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Pages are produced in document order no matter how the caller ordered
    # page_numbers, so each page is paired with its own index instead of
    # being zipped against the caller's sequence (which mislabelled pages
    # for unsorted or out-of-range input).
    for page_number, page in enumerate(PDFPage.get_pages(document)):
        if wanted is not None and page_number not in wanted:
            continue
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        yield layout, page_number
def partial_overlaps(box, other):
    """
    Determine if ``other`` crosses an edge of ``box`` while overlapping it.

    The result is True only when ``other`` strictly straddles at least one
    of the four edge coordinates of ``box`` (left/right on x, bottom/top on
    y) AND the two boxes overlap according to overlaps().

    TODO: Really should just use a standard Python library for this.

    box -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    """
    def straddles(low, high, edge):
        #   low    edge    high
        #    <------|------>
        return low < edge and high > edge

    crosses_x = (straddles(other[0], other[2], box[0])
                 or straddles(other[0], other[2], box[2]))
    crosses_y = (straddles(other[1], other[3], box[1])
                 or straddles(other[1], other[3], box[3]))

    # TODO: Simplify?
    if not (crosses_x or crosses_y):
        return False
    return overlaps(box, other)
def overlaps(box, other):
    """
    Determine if the two bounding boxes overlap each other.

    Boxes that merely touch along an edge or at a corner count as
    overlapping, because only strict separation rules an overlap out.

    TODO: Really should just use a standard Python library for this.

    box -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    """
    separated_on_x = box[0] > other[2] or box[2] < other[0]
    separated_on_y = box[1] > other[3] or box[3] < other[1]
    if separated_on_x or separated_on_y:
        return False
    return True
def union(src, other):
    """
    Compute the smallest bounding box that contains both inputs.

    Neither argument is modified; a new list is returned.

    src -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    returns [xmin, ymin, xmax, ymax] covering src and other
    """
    return [
        min(src[0], other[0]),
        min(src[1], other[1]),
        max(src[2], other[2]),
        max(src[3], other[3]),
    ]
# See: https://gist.github.com/agentcooper/4c55133f5d95866acdee5017cd318558#file-pypdf2highlight-py
# x1, y1 starts in bottom left corner
def createHighlight(x1, y1, x2, y2, meta, color = [1, 0, 0]):
    """
    Build a PDF highlight annotation dictionary for a rectangle.

    x1, y1 -- first corner of the rectangle (coordinates start in the
              bottom left corner)
    x2, y2 -- opposite corner of the rectangle
    meta -- mapping with an "author" entry (stored as /T) and a "contents"
            entry (stored as /Contents)
    color -- iterable of numbers stored as the annotation's /C array
             (presumably RGB components; the default is [1, 0, 0])
    returns the populated DictionaryObject; it is not attached to any page
    """
    def float_array(*values):
        # Wrap plain numbers as a PDF array of FloatObjects.
        return ArrayObject([FloatObject(value) for value in values])

    author = TextStringObject(meta["author"])
    contents = TextStringObject(meta["contents"])
    tint = ArrayObject([FloatObject(c) for c in color])
    rect = float_array(x1, y1, x2, y2)
    # Corner order written here: (x1,y2), (x2,y2), (x1,y1), (x2,y1).
    quad_points = float_array(x1, y2, x2, y2, x1, y1, x2, y1)

    annotation = DictionaryObject()
    annotation.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): author,
        NameObject("/Contents"): contents,
        NameObject("/C"): tint,
        NameObject("/Rect"): rect,
        NameObject("/QuadPoints"): quad_points,
    })
    return annotation
def addHighlightToPage(highlight, page, output):
    """
    Register a highlight annotation with a writer and attach it to a page.

    highlight -- annotation dictionary, e.g. from createHighlight()
    page -- page object whose "/Annots" entry is extended, or created when
            the page has none yet
    output -- writer object; its _addObject() is used to obtain the
              reference that is stored on the page
    """
    annotation_ref = output._addObject(highlight)
    annots_key = NameObject("/Annots")
    if "/Annots" not in page:
        page[annots_key] = ArrayObject([annotation_ref])
        return
    page[annots_key].append(annotation_ref)
def get_pdf_words(document, page_numbers=None):
    """
    Get all words from LTChar or LTTextLineHorizontal objects from the document.

    Only the direct children of iterable top-level layout elements are
    inspected; top-level elements that are not iterable are skipped.

    :param document: string path of the PDF file to process
    :param page_numbers: optional iterable of page indexes, passed through
                         to get_pages(); ``None`` processes every page
    :returns: A map of page #'s containing lists of coordinates and PDFMiner
              objects. Ex.: {page_number: [[x1, y1, x2, y2, <LTTextLineHorizontal>],]}
              Every processed page gets an entry, possibly an empty list.
    """
    bboxes = {}
    # get_pages() reads lazily from the file, so the handle has to stay open
    # for the whole loop; the ``with`` block closes it afterwards.
    with open(document, 'rb') as pdf_doc:
        for layout, page in get_pages(pdf_doc, page_numbers):
            page_items = bboxes[page] = []
            for element in layout:
                if not isinstance(element, Iterable):
                    continue  # not iterable
                for subElement in element:
                    # TODO: Handle word deliminator (an LTChar whose text
                    # is ' '); for now such characters are kept like any
                    # other.
                    if isinstance(subElement, (LTChar, LTTextLineHorizontal)):
                        item = list(subElement.bbox)
                        item.append(subElement)
                        page_items.append(item)
    return bboxes
def get_paragraphs(words):
    """
    Group the text lines of each page into paragraph strings.

    Per page, the nominal font height is the most common integer element
    height. While walking the page's elements in order:

    * an element whose height differs from the nominal height by more than
      ``max_height_diff`` ends the current paragraph and is skipped;
    * a drop in y of more than 1.5 element heights relative to the previous
      element ends the current paragraph and starts a new one with this
      element's text;
    * elements that are not LTTextLineHorizontal are otherwise ignored;
    * a line that starts further right than the reference line by more than
      ``paragraph_tolerance`` is treated as the indented first line of a
      new paragraph.

    Progress messages are printed to stdout.

    :param words: map of page number to a list of ``[x1, y1, x2, y2, obj]``
                  items (the shape returned by get_pdf_words); ``obj`` must
                  provide ``get_text()``
    :returns: tuple ``(paragraphs, indexes)`` where ``paragraphs`` is a list
              of strings and ``indexes[i]`` is the total number of
              characters in ``paragraphs[0..i]``
    """
    paragraph_tolerance = 0.1
    max_height_diff = 1
    paragraphs = []

    for page, elements in words.items():
        if not elements:
            # A page without elements has no font height to vote on;
            # most_common(1) would be empty and indexing it would raise.
            continue

        # Find nominal font size
        # Round to int
        freq = Counter(int(element[3] - element[1]) for element in elements)
        nominal_font = freq.most_common(1)[0][0]
        print("Nominal font is:", nominal_font)
        print("Page:", page)

        x_offset_prev_line = None
        prev_y_offset = None
        paragraph_content = ""
        first_line = False
        processed_first_line = False
        # Only read further down when x_offset_prev_line is set, in which
        # case it has been refreshed at the top of the same iteration.
        large_x_offset = False

        for element in elements:
            x_offset = element[0]
            y_offset = element[1]
            height = int(element[3] - element[1])
            text = element[4].get_text()

            if x_offset_prev_line is not None:
                large_x_offset = (abs(x_offset_prev_line - x_offset) > paragraph_tolerance)

            # Font size mismatch?
            if abs(height - nominal_font) > max_height_diff:
                if len(paragraph_content) > 0:
                    print("Content append:", len(paragraph_content))
                    paragraphs.append(paragraph_content)
                    paragraph_content = ""
                print("Continue due to height != nominal_font")
                continue

            print("ELEMENT:", element[0:4], text[0:15])

            # Large vertical gap below the previous element: close the
            # running paragraph and start the next one with this text.
            if prev_y_offset is not None and len(paragraph_content) > 0:
                if y_offset < prev_y_offset - height * 1.5:
                    print("Content append:", len(paragraph_content))
                    paragraphs.append(paragraph_content)
                    paragraph_content = text
                    prev_y_offset = None
                    continue
            prev_y_offset = y_offset

            if not isinstance(element[4], LTTextLineHorizontal):
                continue

            # Find first paragraph
            if x_offset_prev_line is None:
                x_offset_prev_line = x_offset
                if not processed_first_line:
                    first_line = True
                    processed_first_line = True
                if height == nominal_font:
                    paragraph_content += text
                continue

            # Check case if first line was indented
            if x_offset_prev_line > x_offset and first_line:
                first_line = False
                paragraph_content += text
                x_offset_prev_line = x_offset
                continue

            # is this indented?
            # and ignore small changes
            if x_offset_prev_line < x_offset and large_x_offset:
                if height == nominal_font and len(paragraph_content) > 0:
                    paragraphs.append(paragraph_content)
                # NOTE(review): the indented line always becomes the start
                # of the next paragraph here, even when nothing was flushed
                # above -- confirm this matches the intended nesting.
                paragraph_content = text
                # Reset at next line read
                # What if next paragraph is also indented???
                x_offset_prev_line = None
                continue

            if height == nominal_font:
                paragraph_content += text

        # Flush what is left of this page before the next page resets
        # paragraph_content.
        # TODO: Remove redundant space
        if paragraph_content != "":
            paragraphs.append(paragraph_content)

    # Find paragraph indexes
    c = 0
    indexes = []
    for p in paragraphs:
        c += len(p)
        indexes.append(c)
    return paragraphs, indexes
def get_pdf_elements(document, element_type, page_numbers=None):
    """
    Collect the bounding boxes of top-level layout elements of a given type.

    :param document: string path of the PDF file to process
    :param element_type: class (or tuple of classes) used with isinstance()
                         to select top-level layout elements
    :param page_numbers: optional iterable of page indexes, passed through
                         to get_pages(); ``None`` processes every page
    :returns: A map of page number to a list of items. Each item is
              ``[x1, y1, x2, y2]`` from the element's bbox, followed by the
              element's ``non_stroking_color`` when it has that attribute.
              Every processed page gets an entry, possibly an empty list.
              The map is also printed to stdout before it is returned.
    """
    items = {}
    # get_pages() reads lazily from the file, so the handle has to stay open
    # for the whole loop; the ``with`` block closes it afterwards.
    with open(document, 'rb') as pdf_doc:
        for layout, page in get_pages(pdf_doc, page_numbers):
            page_items = items[page] = []
            for element in layout:
                if isinstance(element, element_type):
                    item = list(element.bbox)
                    if hasattr(element, 'non_stroking_color'):
                        item.append(element.non_stroking_color)
                    page_items.append(item)
    print(items)
    return items
def get_large_colored_background_rectangles(document, page_numbers=None):
    """
    Find the large LTRect elements on each page of a PDF.

    A rectangle is kept when its width exceeds 288.0 and its height exceeds
    72.0 (both comparisons are strict). Diagnostic output is printed to
    stdout for every page and every rectangle examined.

    :param document: string path of the PDF file to process
    :param page_numbers: optional iterable of page indexes, passed through
                         to get_pdf_elements()
    :returns: A map of page number to the list of kept rectangle items, in
              the form produced by get_pdf_elements(). Pages without a
              large rectangle have no entry.
    """
    # Only include rectangles that are at least 4" x 1" in size
    min_width, min_height = 288.0, 72.0

    rects_out = {}
    per_page = get_pdf_elements(document, LTRect, page_numbers)
    for page, rects in per_page.items():
        print("Rects:", rects)
        for rect in rects:
            width = rect[2] - rect[0]
            height = rect[3] - rect[1]
            print("Dimensions:", width, height)
            if not (width > min_width and height > min_height):
                continue
            rects_out.setdefault(page, []).append(rect)
    return rects_out
def extract_pages(document, output, page_numbers=None):
    """
    Copy selected pages of a PDF into a new PDF file.

    :param document: source PDF, in whatever form PdfFileReader accepts
    :param output: string path of the PDF file to write (opened with "wb",
                   so an existing file is overwritten)
    :param page_numbers: iterable of page indexes as understood by
                         ``PdfFileReader.getPage``; pages are written in
                         the order given. ``None`` (the default) copies
                         every page of the source.
    """
    pdf = PdfFileReader(document)
    if page_numbers is None:
        # The default used to fail with a TypeError in the loop below;
        # treat it as "all pages" instead.
        page_numbers = range(pdf.getNumPages())

    pdf_writer = PdfFileWriter()
    for page in page_numbers:
        pdf_writer.addPage(pdf.getPage(page))

    with open(output, "wb") as out:
        pdf_writer.write(out)