# Copyright (c) 2022, Lawrence Livermore National Security, LLC.
# All rights reserved.
# See the top-level LICENSE and NOTICE files for details.
# LLNL-CODE-838964
# SPDX-License-Identifier: Apache-2.0-with-LLVM-exception

from pdfminer.pdfpage import PDFParser
from pdfminer.pdfpage import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal
from pdfminer.layout import LTTextLineHorizontal
from pdfminer.layout import LTChar
from pdfminer.layout import LAParams
from pdfminer.layout import LTRect
from pdfminer.layout import LTFigure
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer import pdfinterp

from collections.abc import Iterable
from collections import Counter
from collections import OrderedDict

import os

# This is used for highlighting in PDFs
from PyPDF2.generic import (
    DictionaryObject,
    NumberObject,
    FloatObject,
    NameObject,
    TextStringObject,
    ArrayObject
)

# Used to extract pages
from PyPDF2 import PdfFileReader, PdfFileWriter


def get_page_sizes(document):
    """
    Return the media box of every page in a PDF file.

    document -- string path of the PDF file to read

    returns a list with one entry per page, in document order; each entry is
    the page's ``mediabox`` (the page size as x0 y0 x1 y1)
    """
    # The file must stay open while the pages are being enumerated, so the
    # list is built inside the ``with`` block.
    with open(document, 'rb') as pdf_file:
        doc = PDFDocument(PDFParser(pdf_file))
        return [page.mediabox for page in PDFPage.create_pages(doc)]


def get_page_count(document):
    """
    Return the page count recorded in the PDF catalog.

    document -- open binary file object of the PDF (not a path)

    returns the ``Count`` entry of the catalog's ``Pages`` object
    """
    # Is there a better way of getting the page count than doing this?
    parser = PDFParser(document)
    tmpdoc = PDFDocument(parser)
    page_count = pdfinterp.resolve1(tmpdoc.catalog['Pages'])['Count']
    return page_count


def get_pdf_page_count(filename):
    """
    Return the page count of the PDF stored at the given path.

    filename -- string path of the PDF file
    """
    with open(filename, 'rb') as document:
        return get_page_count(document)


def get_pages(document, page_numbers = None):
    """
    Generate the analysed layout of the requested pages of a PDF.

    document -- open binary file object of the PDF (not a path)
    page_numbers -- iterable of zero-based page numbers to process, or None
                    to process every page

    yields (layout, page_number) tuples in ascending page order, where layout
    is the result of the PDFPageAggregator for that page
    """
    # Create resource manager
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    if page_numbers is None:
        page_numbers = range(get_page_count(document))
    else:
        # PDFPage.get_pages yields the selected pages in document order, no
        # matter how the page numbers were ordered by the caller.  Sort and
        # de-duplicate them (and drop negative numbers, which can never
        # match) so each layout is paired with its real page number.  Numbers
        # past the end of the document sort last and are cut off by zip().
        page_numbers = sorted({number for number in page_numbers if number >= 0})
        if not page_numbers:
            return

    selected_pages = PDFPage.get_pages(document, page_numbers)
    for page, page_number in zip(selected_pages, page_numbers):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        yield layout, page_number


def partial_overlaps(box, other):
    """
    Determine if the two bounding boxes overlap without box being covered
    by other on both axes.

    True when the boxes overlap (see overlaps) and, on at least one axis, the
    extent of other strictly contains the lower or the upper edge of box.

    TODO: Really should just use a standard Python library for this.

    box -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    """
    #  a1   x1       a2   x2
    #  <------------------>
    x_intersects = (other[0] < box[0] and other[2] > box[0]) or (
        other[0] < box[2] and other[2] > box[2])
    y_intersects = (other[1] < box[1] and other[3] > box[1]) or (
        other[1] < box[3] and other[3] > box[3])
    intersects = x_intersects or y_intersects
    # TODO: Simplify?
    return intersects and overlaps(box, other)


def overlaps(box, other):
    """
    Determine if the two bounding boxes overlap each other.

    Boxes that merely touch along an edge count as overlapping.

    TODO: Really should just use a standard Python library for this.

    box -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    """
    # The boxes are disjoint when one lies entirely to one side of the other
    # on either axis.
    x_disjoint = box[0] > other[2] or box[2] < other[0]
    y_disjoint = box[1] > other[3] or box[3] < other[1]
    return not (x_disjoint or y_disjoint)


def union(src, other):
    """
    Expand src by union of other bbox

    src -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)

    returns union of src and other as a new list [xmin, ymin, xmax, ymax]
    """
    xmin = min(src[0], other[0])
    ymin = min(src[1], other[1])
    xmax = max(src[2], other[2])
    ymax = max(src[3], other[3])
    return [xmin, ymin, xmax, ymax]


# See: https://gist.github.com/agentcooper/4c55133f5d95866acdee5017cd318558#file-pypdf2highlight-py
# x1, y1 starts in bottom left corner
def createHighlight(x1, y1, x2, y2, meta, color = (1, 0, 0)):
    """
    Build a PDF highlight annotation dictionary for the given rectangle.

    x1, y1 -- bottom left corner of the highlighted rectangle
    x2, y2 -- opposite corner of the highlighted rectangle
    meta -- mapping with the keys "author" (stored as /T) and "contents"
            (stored as /Contents)
    color -- sequence of colour components stored as /C; an immutable tuple
             is used as the default so it cannot be shared and mutated
             between calls

    returns a PyPDF2 DictionaryObject describing the annotation
    """
    newHighlight = DictionaryObject()

    newHighlight.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),

        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),

        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([
            FloatObject(x1),
            FloatObject(y1),
            FloatObject(x2),
            FloatObject(y2)
        ]),
        NameObject("/QuadPoints"): ArrayObject([
            FloatObject(x1),
            FloatObject(y2),
            FloatObject(x2),
            FloatObject(y2),
            FloatObject(x1),
            FloatObject(y1),
            FloatObject(x2),
            FloatObject(y1)
        ]),
    })

    return newHighlight


def addHighlightToPage(highlight, page, output):
    """
    Register a highlight annotation with the writer and attach it to a page.

    highlight -- annotation dictionary, e.g. from createHighlight
    page -- page object whose "/Annots" array receives the annotation
    output -- PDF writer that will own the annotation object
    """
    highlight_ref = output._addObject(highlight)

    if "/Annots" in page:
        page[NameObject("/Annots")].append(highlight_ref)
    else:
        page[NameObject("/Annots")] = ArrayObject([highlight_ref])


def get_pdf_words(document, page_numbers=None):
    """
    Get all words from LTChar or LTTextLineHorizontal objects from the document.

    :param document: string path of the PDF file to process
    :param page_numbers: iterable of zero-based page numbers, or None for all
    :returns: A map of page #'s containing lists of coordinates and PDFMiner objects.
              Ex.: {page_number: [[x1, y1, x2, y2, element], ...]}
    """
    bboxes = {}
    # get_pages reads lazily from the file, so it has to stay open for the
    # whole loop; the context manager closes it afterwards.
    with open(document, 'rb') as pdf_doc:
        for layout, page in get_pages(pdf_doc, page_numbers):
            page_items = bboxes[page] = []
            for element in layout:
                if not isinstance(element, Iterable):
                    continue  # not iterable
                for subElement in element:
                    # TODO: Handle word deliminator (an LTChar whose text
                    # is ' ')
                    if isinstance(subElement, (LTChar, LTTextLineHorizontal)):
                        item = list(subElement.bbox)
                        item.append(subElement)
                        page_items.append(item)
    return bboxes


def get_paragraphs(words):
    """
    Group the text elements of each page into paragraphs.

    Heuristics used, per page:
      * the nominal font height is the most frequent integer element height;
      * an element whose height differs from it by more than one unit ends
        the current paragraph and is skipped;
      * a vertical drop of more than 1.5 element heights starts a new
        paragraph;
      * a text line indented relative to the previous line's left edge
        starts a new paragraph.

    words -- map of page number to a list of [x1, y1, x2, y2, element]
             entries, as returned by get_pdf_words

    returns (paragraphs, indexes): the paragraph strings of all pages in
    order, and for each paragraph the cumulative character count up to and
    including it
    """
    paragraph_tolerance = 0.1
    max_height_diff = 1
    paragraphs = []

    for page, elements in words.items():
        # A page without any element has no nominal font and no text.
        if not elements:
            continue

        # Find nominal font size
        # Round to int
        freq = Counter(int(element[3] - element[1]) for element in elements)
        nominal_font = freq.most_common(1)[0][0]
        print("Nominal font is:", nominal_font)
        print("Page:", page)

        x_offset_prev_line = None
        prev_y_offset = None
        paragraph_content = ""
        first_line = False
        processed_first_line = False

        for element in elements:
            x_offset = element[0]
            y_offset = element[1]
            height = int(element[3] - element[1])
            text = element[4].get_text()

            # Font size mismatch?
            if abs(height - nominal_font) > max_height_diff:
                if paragraph_content:
                    print("Content append:", len(paragraph_content))
                    paragraphs.append(paragraph_content)
                    paragraph_content = ""
                print("Continue due to height != nominal_font")
                continue

            print("ELEMENT:", element[0:4], text[0:15])

            # A large vertical gap below the previous element closes the
            # current paragraph and starts a new one with this element.
            if prev_y_offset is not None and paragraph_content:
                if y_offset < prev_y_offset - height * 1.5:
                    print("Content append:", len(paragraph_content))
                    paragraphs.append(paragraph_content)
                    paragraph_content = text
                    prev_y_offset = None
                    continue
            prev_y_offset = y_offset

            # Only whole text lines take part in the indentation logic.
            if not isinstance(element[4], LTTextLineHorizontal):
                continue

            # Find first paragraph
            if x_offset_prev_line is None:
                x_offset_prev_line = x_offset
                if not processed_first_line:
                    first_line = True
                    processed_first_line = True
                if height == nominal_font:
                    paragraph_content += text
                continue

            # Check case if first line was indented
            if x_offset_prev_line > x_offset and first_line:
                first_line = False
                paragraph_content += text
                x_offset_prev_line = x_offset
                continue

            # is this indented?
            # and ignore small changes
            if x_offset - x_offset_prev_line > paragraph_tolerance:
                # NOTE(review): the finished paragraph is flushed only when
                # there is one, and the indented line always opens the new
                # paragraph, so neither accumulated text nor the indented
                # line can be dropped.  Lines whose height is not exactly
                # the nominal one are ignored, as everywhere else.
                if height == nominal_font:
                    if paragraph_content:
                        paragraphs.append(paragraph_content)
                    paragraph_content = text
                # Reset at next line read
                # What if next paragraph is also indented???
                x_offset_prev_line = None
                continue

            if height == nominal_font:
                paragraph_content += text

        # TODO: Remove redundant space
        # Flush whatever is left at the end of the page.
        if paragraph_content != "":
            paragraphs.append(paragraph_content)

    # Find paragraph indexes
    indexes = []
    total_length = 0
    for paragraph in paragraphs:
        total_length += len(paragraph)
        indexes.append(total_length)

    return paragraphs, indexes


def get_pdf_elements(document, element_type, page_numbers=None):
    """
    Collect the bounding boxes of all top-level layout elements of one type.

    document -- string path of the PDF file to process
    element_type -- class (or tuple of classes) the elements are matched
                    against with isinstance
    page_numbers -- iterable of zero-based page numbers, or None for all

    returns a map of page number to a list of [x1, y1, x2, y2] entries; when
    the element has a ``non_stroking_color`` attribute its value is appended
    as a fifth item
    """
    items = {}
    # get_pages reads lazily from the file, so it has to stay open for the
    # whole loop; the context manager closes it afterwards.
    with open(document, 'rb') as pdf_doc:
        for layout, page in get_pages(pdf_doc, page_numbers):
            page_items = items[page] = []
            for element in layout:
                if not isinstance(element, element_type):
                    continue
                item = list(element.bbox)
                if hasattr(element, 'non_stroking_color'):
                    item.append(element.non_stroking_color)
                page_items.append(item)
    print(items)
    return items


def get_large_colored_background_rectangles(document, page_numbers=None):
    """
    Find the large LTRect elements on each page.

    document -- string path of the PDF file to process
    page_numbers -- iterable of zero-based page numbers, or None for all

    returns a map of page number to the rectangles (as produced by
    get_pdf_elements) that are strictly wider than 288 and strictly taller
    than 72 units; pages without such a rectangle are left out
    """
    # Only include rectangles that are larger than 4" x 1" in size
    min_size = (288.0, 72.0)
    elements = get_pdf_elements(document, LTRect, page_numbers)
    rects_out = {}
    for page, rects in elements.items():
        print("Rects:", rects)
        for rect in rects:
            width = rect[2] - rect[0]
            height = rect[3] - rect[1]
            print("Dimensions:", width, height)
            if width > min_size[0] and height > min_size[1]:
                rects_out.setdefault(page, []).append(rect)
    return rects_out


def extract_pages(document, output, page_numbers=None):
    """
    Write a selection of pages from one PDF into a new PDF file.

    document -- source PDF, passed straight to PdfFileReader
    output -- string path of the PDF file to write
    page_numbers -- iterable of zero-based page numbers to copy, in the
                    order they should appear; None copies every page
    """
    pdf = PdfFileReader(document)
    pdf_writer = PdfFileWriter()

    # The default used to crash when iterated; treat it as "all pages".
    if page_numbers is None:
        page_numbers = range(pdf.getNumPages())

    for page in page_numbers:
        pdf_writer.addPage(pdf.getPage(page))

    with open(output, "wb") as out:
        pdf_writer.write(out)