Alpaca233's picture
Upload 18 files
52d0cfd
raw
history blame contribute delete
No virus
605 Bytes
from PyPDF2 import PdfReader
class Paper(object):
def __init__(self, pdf_obj: PdfReader) -> None:
self._pdf_obj = pdf_obj
self._paper_meta = self._pdf_obj.metadata
def iter_pages(self, iter_text_len: int = 3000):
page_idx = 0
for page in self._pdf_obj.pages:
txt = page.extract_text()
for i in range((len(txt) // iter_text_len) + 1):
yield page_idx, i, txt[i * iter_text_len:(i + 1) * iter_text_len]
page_idx += 1
if __name__ == '__main__':
reader = PdfReader('../alexnet.pdf')
paper = Paper(reader)