File size: 13,305 Bytes
c14d9ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
# Copyright (c) 2022, Lawrence Livermore National Security, LLC. 
# All rights reserved.
# See the top-level LICENSE and NOTICE files for details.
# LLNL-CODE-838964

# SPDX-License-Identifier: Apache-2.0-with-LLVM-exception

from pdfminer.pdfpage import PDFParser
from pdfminer.pdfpage import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal
from pdfminer.layout import LTTextLineHorizontal
from pdfminer.layout import LTChar
from pdfminer.layout import LAParams
from pdfminer.layout import LTRect
from pdfminer.layout import LTFigure

from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer import pdfinterp

from collections.abc import Iterable 
from collections import Counter
from collections import OrderedDict

import os

# These are used for highlighting in PDFs
from PyPDF2.generic import (
    DictionaryObject,
    NumberObject,
    FloatObject,
    NameObject,
    TextStringObject,
    ArrayObject
)

# Used to extract pages
from PyPDF2 import PdfFileReader, PdfFileWriter

def get_page_sizes(document):
    """
    Return the media box of every page in a PDF file.

    document -- string path of the PDF file to read

    returns a list with one entry per page, in document order; each entry is
    the page's media box (x0, y0, x1, y1)
    """
    # Use a context manager so the file handle is released even if parsing
    # fails; the original left the file open.
    with open(document, 'rb') as pdf_file:
        doc = PDFDocument(PDFParser(pdf_file))
        # create_pages() is lazy, so the list must be built while the file
        # is still open.
        return [page.mediabox for page in PDFPage.create_pages(doc)]
    
def get_page_count(document):
    """
    Return the page count recorded in the PDF catalog.

    document -- file object of the PDF, opened in binary mode

    The value is read from the 'Count' entry of the catalog's 'Pages' node
    rather than by iterating over the pages.
    """
    # NOTE: the original author wondered whether there is a better way of
    # obtaining the page count than parsing the document here.
    catalog = PDFDocument(PDFParser(document)).catalog
    pages_node = pdfinterp.resolve1(catalog['Pages'])
    return pages_node['Count']
    
def get_pdf_page_count(filename):
    """
    Return the number of pages of the PDF stored at `filename`.

    filename -- string path of the PDF file

    The file is opened in binary mode and closed again before returning.
    """
    with open(filename, 'rb') as pdf_file:
        page_total = get_page_count(pdf_file)
    return page_total
        
def get_pages(document, page_numbers = None):
    """
    Generate the analysed layout of the requested pages of a PDF.

    document -- file object of the PDF, opened in binary mode
    page_numbers -- iterable of zero-based page indexes to process, or None
                    to process every page

    yields (layout, page_number) tuples, where layout is the LTPage produced
    by the page aggregator for that page. Pages are yielded in ascending
    page order.
    """
    # Create resource manager
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    if page_numbers is None:
        # Only parse the document for its page count when it is needed.
        page_numbers = range(get_page_count(document))
    else:
        # PDFPage.get_pages() walks the document front to back and yields the
        # selected pages in document order, each at most once. Normalise the
        # request the same way so that zip() pairs every layout with the
        # page number it really belongs to, even when the caller passes an
        # unsorted or duplicated selection.
        page_numbers = sorted(set(page_numbers))

    for page, page_number in zip(PDFPage.get_pages(document, page_numbers), page_numbers):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        yield layout, page_number
                
def partial_overlaps(box, other):
    """
    Determine if `other` overlaps `box` while extending across one of its edges.

    The check is true when the two boxes overlap (see overlaps()) and, on the
    x or the y axis, `other` strictly spans the lower or the upper edge of
    `box`. A box `other` lying completely inside `box` is therefore not a
    partial overlap.
    TODO: Really should just use a standard Python library for this.

    box -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    """
    def _spans(low, high, edge):
        # True when `edge` lies strictly between low and high.
        return low < edge and high > edge

    crosses_x_edge = (_spans(other[0], other[2], box[0]) or
                      _spans(other[0], other[2], box[2]))
    crosses_y_edge = (_spans(other[1], other[3], box[1]) or
                      _spans(other[1], other[3], box[3]))

    crosses_any_edge = crosses_x_edge or crosses_y_edge
    if not crosses_any_edge:
        return crosses_any_edge
    # TODO: Simplify?
    return overlaps(box, other)

def overlaps(box, other):
    """
    Determine if the two bounding boxes overlap each other.

    Boxes that merely touch along an edge or at a corner count as overlapping.
    TODO: Really should just use a standard Python library for this.

    box -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    """
    left, bottom, right, top = box[0], box[1], box[2], box[3]
    o_left, o_bottom, o_right, o_top = other[0], other[1], other[2], other[3]

    # The boxes are disjoint as soon as a gap exists along either axis.
    separated_in_x = left > o_right or right < o_left
    separated_in_y = bottom > o_top or top < o_bottom

    return not separated_in_x and not separated_in_y

def union(src, other):
    """
    Compute the smallest bounding box enclosing both src and other.

    src -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)

    returns the enclosing box as a new list [xmin, ymin, xmax, ymax];
    neither argument is modified
    """
    lower_corner = [min(src[axis], other[axis]) for axis in (0, 1)]
    upper_corner = [max(src[axis], other[axis]) for axis in (2, 3)]
    return lower_corner + upper_corner



# See: https://gist.github.com/agentcooper/4c55133f5d95866acdee5017cd318558#file-pypdf2highlight-py
# x1, y1 starts in bottom left corner
def createHighlight(x1, y1, x2, y2, meta, color = [1, 0, 0]):
    """
    Build a PDF highlight annotation dictionary for a rectangular area.

    x1, y1 -- first corner of the area (the bottom left one)
    x2, y2 -- opposite corner of the area
    meta -- mapping providing the "author" and "contents" texts stored in
            the annotation's /T and /Contents entries
    color -- iterable of colour components stored in the /C entry

    returns a DictionaryObject describing the annotation; it is not attached
    to any page yet (see addHighlightToPage)
    """
    def _float_array(values):
        # Wrap plain numbers into a PDF array of float objects.
        return ArrayObject([FloatObject(value) for value in values])

    annotation = DictionaryObject()

    annotation[NameObject("/F")] = NumberObject(4)
    annotation[NameObject("/Type")] = NameObject("/Annot")
    annotation[NameObject("/Subtype")] = NameObject("/Highlight")

    annotation[NameObject("/T")] = TextStringObject(meta["author"])
    annotation[NameObject("/Contents")] = TextStringObject(meta["contents"])

    annotation[NameObject("/C")] = _float_array(color)
    annotation[NameObject("/Rect")] = _float_array((x1, y1, x2, y2))
    # Quad points are listed as: top left, top right, bottom left,
    # bottom right.
    annotation[NameObject("/QuadPoints")] = _float_array(
        (x1, y2, x2, y2, x1, y1, x2, y1))

    return annotation

def addHighlightToPage(highlight, page, output):
    """
    Register a highlight annotation with the writer and attach it to a page.

    highlight -- annotation dictionary, e.g. from createHighlight()
    page -- page object whose /Annots array receives the annotation
    output -- PDF writer that will own the annotation object

    The page is modified in place; nothing is returned.
    """
    annotation_ref = output._addObject(highlight)
    annots_key = NameObject("/Annots")

    if "/Annots" not in page:
        # First annotation on this page: create the array.
        page[annots_key] = ArrayObject([annotation_ref])
        return

    page[annots_key].append(annotation_ref)
    
def get_pdf_words(document, page_numbers=None):
    """
    Get all words from LTChar or LTTextLineHorizontal objects from the document.

    :param document: string path of the PDF file to process
    :param page_numbers: iterable of zero-based page indexes to process, or
        None to process every page
    :returns: A map of page #'s containing lists of coordinates and PDFMiner
    objects. Ex.: {page_number: [[x1, y1, x2, y2, <LTTextLineHorizontal>],]}
    Every processed page has an entry, possibly an empty list.
    """
    bboxes = {}
    # Keep the file open only while the page generator is being consumed and
    # make sure it is closed afterwards; the original leaked the handle.
    with open(document, 'rb') as pdf_doc:
        for layout, page in get_pages(pdf_doc, page_numbers):
            page_items = bboxes[page] = []
            for element in layout:
                if not isinstance(element, Iterable):
                    continue  # not a container, so it has no children to scan
                for sub_element in element:
                    # Characters and horizontal text lines are recorded the
                    # same way; every other child type is ignored.
                    # TODO: Handle word delimiter (LTChar whose text is ' ')
                    if isinstance(sub_element, (LTChar, LTTextLineHorizontal)):
                        page_items.append(list(sub_element.bbox) + [sub_element])
    return bboxes

def get_paragraphs(words):
    """
    Group the text elements of each page into paragraph strings.

    words -- map of page number to a list of [x1, y1, x2, y2, element]
             entries, in the format returned by get_pdf_words()

    A paragraph break is detected from a font height that differs from the
    page's most common ("nominal") height, from a large vertical gap to the
    previous line, or from a line that is indented relative to the previous
    one. Pages without any element are skipped.

    returns (paragraphs, indexes): the list of paragraph strings over all
    pages and, for each paragraph, the cumulative number of characters up to
    and including it
    """
    paragraph_tolerance = 0.1
    max_height_diff = 1
    paragraphs = []

    for page, elements in words.items():
        # A page without extracted elements (e.g. a blank page) has neither a
        # nominal font nor text. most_common(1) would be empty and indexing
        # it would raise IndexError, so skip such pages.
        if not elements:
            continue

        # Find nominal font size: the most frequent element height,
        # truncated to int.
        freq = Counter(int(element[3] - element[1]) for element in elements)
        nominal_font = freq.most_common(1)[0][0]
        print("Nominal font is:", nominal_font)

        print("Page:", page)
        x_offset_prev_line = None
        prev_y_offset = None
        paragraph_content = ""
        first_line = False
        processed_first_line = False

        for element in elements:
            x_offset = element[0]
            y_offset = element[1]
            height = int(element[3] - element[1])
            text = element[4].get_text()

            # Font size mismatch? Close the running paragraph and drop the
            # element itself.
            if abs(height - nominal_font) > max_height_diff:
                if len(paragraph_content) > 0:
                    print("Content append:", len(paragraph_content))
                    paragraphs.append(paragraph_content)
                    paragraph_content = ""
                print("Continue due to height != nominal_font")
                continue

            print("ELEMENT:", element[0:4], text[0:15])
            # Vertical gap of more than 1.5 line heights below the previous
            # line: the running paragraph ends and this text starts a new one.
            if prev_y_offset is not None and len(paragraph_content) > 0:
                if y_offset < prev_y_offset - height * 1.5:
                    print("Content append:", len(paragraph_content))
                    paragraphs.append(paragraph_content)
                    paragraph_content = text
                    prev_y_offset = None
                    continue

            prev_y_offset = y_offset

            # Only whole text lines take part in the indentation logic below.
            if not isinstance(element[4], LTTextLineHorizontal):
                continue

            # No reference line yet (page start, or directly after an
            # indented line): this line becomes the reference.
            if x_offset_prev_line is None:
                x_offset_prev_line = x_offset
                if not processed_first_line:
                    first_line = True
                    processed_first_line = True
                if height == nominal_font:
                    paragraph_content += text
                continue

            # Check case if first line was indented: the following line
            # starts further left and still belongs to the same paragraph.
            if x_offset_prev_line > x_offset and first_line:
                first_line = False
                paragraph_content += text
                x_offset_prev_line = x_offset
                continue

            # Is this indented? Ignore small changes.
            large_x_offset = abs(x_offset_prev_line - x_offset) > paragraph_tolerance
            if x_offset_prev_line < x_offset and large_x_offset:
                if height == nominal_font and len(paragraph_content) > 0:
                    paragraphs.append(paragraph_content)

                paragraph_content = text
                # Reset at next line read
                # What if next paragraph is also indented???
                x_offset_prev_line = None
                continue

            if height == nominal_font:
                paragraph_content += text

            # TODO: Remove redundant space
        # Flush whatever is left at the end of the page.
        if paragraph_content != "":
            paragraphs.append(paragraph_content)

    # Find paragraph indexes: running total of paragraph lengths.
    c = 0
    indexes = []
    for p in paragraphs:
        c += len(p)
        indexes.append(c)

    return paragraphs, indexes
    
def get_pdf_elements(document, element_type, page_numbers=None):
    """
    Collect the bounding boxes of all top-level layout elements of one type.

    document -- string path of the PDF file to process
    element_type -- layout class (or tuple of classes) to select, as accepted
                    by isinstance()
    page_numbers -- iterable of zero-based page indexes to process, or None
                    to process every page

    returns a map of page number to a list of [x1, y1, x2, y2] entries; when
    the element has a non_stroking_color attribute, its value is appended as
    a fifth item. Every processed page has an entry, possibly an empty list.
    """
    items = {}
    # Keep the file open only while the page generator is being consumed and
    # make sure it is closed afterwards; the original leaked the handle.
    with open(document, 'rb') as pdf_doc:
        for layout, page in get_pages(pdf_doc, page_numbers):
            page_items = items[page] = []
            for element in layout:
                if not isinstance(element, element_type):
                    continue
                item = list(element.bbox)
                if hasattr(element, 'non_stroking_color'):
                    item.append(element.non_stroking_color)
                page_items.append(item)
    # NOTE(review): debug output kept for compatibility with existing runs.
    print(items)
    return items
                
def get_large_colored_background_rectangles(document, page_numbers=None):
    """
    Find the large LTRect elements on the requested pages of a PDF.

    document -- string path of the PDF file to process
    page_numbers -- iterable of zero-based page indexes to process, or None
                    to process every page

    returns a map of page number to the list of rectangles that are both
    wider than 288.0 and taller than 72.0; pages without such a rectangle
    have no entry. Only the size is checked here, the colour stored with a
    rectangle is not inspected.
    """
    # Only include rectangles that are at least 4" x 1" in size
    min_width, min_height = 288.0, 72.0

    rects_by_page = get_pdf_elements(document, LTRect, page_numbers)
    large_rects = {}
    for page, rects in rects_by_page.items():
        print("Rects:", rects)
        for rect in rects:
            width = rect[2] - rect[0]
            height = rect[3] - rect[1]
            print("Dimensions:", width, height)
            if not (width > min_width and height > min_height):
                continue
            large_rects.setdefault(page, []).append(rect)
    return large_rects

def extract_pages(document, output, page_numbers=None):
    """
    Copy selected pages of a PDF into a new PDF file.

    document -- PDF to read, passed straight to PdfFileReader
    output -- string path of the PDF file to write
    page_numbers -- iterable of zero-based page indexes to copy, in the order
                    they should appear in the output; None copies every page
    """
    pdf = PdfFileReader(document)

    # The default of None used to fail with "TypeError: 'NoneType' object is
    # not iterable"; treat it as "all pages", like the other helpers in this
    # module do.
    if page_numbers is None:
        page_numbers = range(pdf.getNumPages())

    pdf_writer = PdfFileWriter()
    for page in page_numbers:
        pdf_writer.addPage(pdf.getPage(page))

    with open(output, "wb") as out:
        pdf_writer.write(out)