File size: 2,022 Bytes
0d99179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from io import BytesIO
from pathlib import Path
from typing import Union

from PIL import Image
from pydantic import BaseModel

import categories
import extract
import processing

def categorize_and_parse_text(text: str) -> BaseModel:
    """Categorizes the text and parses the information from it.

    The text is first classified with ``categories.categorize_text``; the
    resulting category and the same text are then handed to
    ``categories.run_category_chain``.

    Args:
        text(str): The text to categorize and parse information from.

    Returns: The value produced by ``categories.run_category_chain`` for the
        detected category (annotated as a pydantic ``BaseModel``), not the
        category itself.
    """
    category = categories.categorize_text(text)
    return categories.run_category_chain(category, text)

def process_pdf(
    filename: Path, extract_only=False, *, min_text_length: int = 20
) -> Union[BaseModel, str]:
    """Processes the given PDF file and extracts information from it.

    The text embedded in the PDF is read first with
    ``extract.extract_text_from_pdf_pypdf``. If that yields fewer than
    ``min_text_length`` characters, the PDF is converted to images with
    ``processing.preprocess_pdf_pdf2image`` and the text is extracted from
    those images with ``extract.extract_text_from_images_pyocr_tesseract``.

    Args:
        filename(Path): The PDF file to process.
        extract_only(bool): When true, return the extracted text as-is and
            skip categorization and parsing. Defaults to False.
        min_text_length(int): Minimum length of the embedded text for it to
            be trusted; shorter text triggers the image-based fallback.
            Keyword-only. Defaults to 20.

    Returns: The extracted text (``str``) when ``extract_only`` is true,
        otherwise the result of ``categorize_and_parse_text`` for that text.
    """
    pdf_bytes = Path(filename).read_bytes()

    text = extract.extract_text_from_pdf_pypdf(BytesIO(pdf_bytes))
    # If the encoded text is too short, a pdf scanner probably added a
    # watermark and the real content only exists as page images.
    if len(text) < min_text_length:
        # Try to extract text from images
        images = processing.preprocess_pdf_pdf2image(pdf_bytes)
        text = extract.extract_text_from_images_pyocr_tesseract(images)

    if extract_only:
        return text
    return categorize_and_parse_text(text)

def process_image(filename: Path, extract_only=False) -> Union[BaseModel, str]:
    """Processes the given image file and extracts information from it.

    The image is opened with PIL, passed through
    ``processing.preprocess_image`` and then through
    ``extract.extract_text_from_image_pyocr_tesseract`` to obtain its text.

    Args:
        filename(Path): The image file to process.
        extract_only(bool): When true, return the extracted text as-is and
            skip categorization and parsing. Defaults to False.

    Returns: The extracted text (``str``) when ``extract_only`` is true,
        otherwise the result of ``categorize_and_parse_text`` for that text.
    """
    # The context manager guarantees the file-backed image is closed even if
    # preprocessing or text extraction raises.
    with Image.open(filename) as source_image:
        image = processing.preprocess_image(source_image)
        try:
            text = extract.extract_text_from_image_pyocr_tesseract(image)
        finally:
            # preprocess_image may hand back a new image object; close it
            # separately so neither the original nor the derived image leaks.
            if image is not source_image:
                image.close()

    if extract_only:
        return text
    return categorize_and_parse_text(text)

if __name__ == "__main__":
    # Script entry point: run the PDF pipeline on the example file at a
    # hard-coded relative path and print the result as indented JSON.
    example_pdf = Path("examples/example1.pdf")
    print(process_pdf(example_pdf).json(indent=4))