ankur-bohra's picture
Add basic structure
0d99179
from io import BytesIO
from pathlib import Path
from typing import Union

from PIL import Image
from pydantic import BaseModel

import categories
import extract
import processing
def categorize_and_parse_text(text: str) -> BaseModel:
    """Categorizes the text and parses the information from it.

    The text is first categorized with ``categories.categorize_text``; the
    resulting category and the same text are then passed to
    ``categories.run_category_chain``.

    Args:
        text(str): The text to categorize and parse information from.

    Returns: The value produced by ``categories.run_category_chain`` for
        the detected category.
    """
    detected_category = categories.categorize_text(text)
    return categories.run_category_chain(detected_category, text)
def process_pdf(
    filename: Path, extract_only=False, min_text_length: int = 20
) -> Union[BaseModel, str]:
    """Processes the given PDF file and extracts information from it.

    The text embedded in the PDF is extracted first. If it is shorter than
    ``min_text_length`` characters, the PDF bytes are converted to images
    via ``processing.preprocess_pdf_pdf2image`` and the text is extracted
    from those images instead.

    Args:
        filename(Path): The PDF file to process.
        extract_only(bool): If True, return the extracted text as-is
            without categorizing or parsing it.
        min_text_length(int): Embedded text shorter than this many
            characters is discarded in favour of the image-based
            extraction. Defaults to 20.

    Returns: The extracted text (``str``) when ``extract_only`` is True,
        otherwise the result of ``categorize_and_parse_text``.
    """
    with open(filename, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    text = extract.extract_text_from_pdf_pypdf(BytesIO(pdf_bytes))

    # If the encoded text is too short, a pdf scanner probably added a
    # watermark and the real content only exists as page images.
    if len(text) < min_text_length:
        images = processing.preprocess_pdf_pdf2image(pdf_bytes)
        text = extract.extract_text_from_images_pyocr_tesseract(images)

    if extract_only:
        return text

    return categorize_and_parse_text(text)
def process_image(filename: Path, extract_only=False) -> Union[BaseModel, str]:
    """Processes the given image file and extracts information from it.

    The image is opened, passed through ``processing.preprocess_image`` and
    then handed to ``extract.extract_text_from_image_pyocr_tesseract``.

    Args:
        filename(Path): The image file to process.
        extract_only(bool): If True, return the extracted text as-is
            without categorizing or parsing it.

    Returns: The extracted text (``str``) when ``extract_only`` is True,
        otherwise the result of ``categorize_and_parse_text``.
    """
    # The context manager closes the opened image even if preprocessing or
    # text extraction raises.
    with Image.open(filename) as source_image:
        prepared_image = processing.preprocess_image(source_image)
        try:
            text = extract.extract_text_from_image_pyocr_tesseract(prepared_image)
        finally:
            # preprocess_image may hand back a different object than the one
            # it was given; close that one too so neither image is leaked.
            if prepared_image is not source_image:
                prepared_image.close()

    if extract_only:
        return text

    return categorize_and_parse_text(text)
if __name__ == "__main__":
    # Script entry point: process the example PDF and print the parsed
    # result as indented JSON.
    example_pdf = Path("examples/example1.pdf")
    parsed = process_pdf(example_pdf)
    print(parsed.json(indent=4))