Spaces:

cjber
/

planning-ai

Sleeping

File size: 2,717 Bytes

4f904d6
 
 
 
 
 
8bcdd18
 
 
4f904d6
 
 
 
8bcdd18
 
4f904d6
 
 
 
8bcdd18
0fdcea5
8bcdd18
 
 
0fdcea5
8bcdd18
 
0fdcea5
8bcdd18
4f904d6
0fdcea5
 
 
 
 
 
 
 
4f904d6
0fdcea5
4f904d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8bcdd18
4f904d6
 
 
 
0fdcea5
 
 
 
 
ef249f5
0fdcea5
 
 
 
 
 
ef249f5
 
 
 
 
 
 
 
 
0fdcea5
 
8ef72b8

import os
from pathlib import Path

from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeOutputOption, AnalyzeResult
from azure.core.credentials import AzureKeyCredential
from dotenv import load_dotenv
from pypdf import PdfReader, PdfWriter
from pypdf.errors import PdfReadError
from tqdm import tqdm

from planning_ai.common.utils import Paths

# Load variables from a local .env file (if any) into the process environment
# before the Azure settings are read below.
load_dotenv()

# Unset variables fall back to empty strings, so the client is always
# constructed at import time, whether or not the settings are present.
endpoint = os.getenv("AZURE_API_ENDPOINT", "") or ""
credential = AzureKeyCredential(os.getenv("AZURE_API_KEY", "") or "")
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=endpoint,
    credential=credential,
)


def read_pdf(pdf_path):
    """Extract the text of every page of a PDF with pypdf.

    Args:
        pdf_path: Location of the PDF to open.

    Returns:
        A ``(text, reader)`` tuple, where ``text`` is the per-page text joined
        by blank lines and ``reader`` is the open ``PdfReader``. If pypdf
        raises ``PdfReadError`` while opening or extracting, a message is
        printed and ``(None, None)`` is returned instead.
    """
    try:
        document = PdfReader(pdf_path)
        # Extraction stays inside the guard: a read error raised while the
        # pages are being processed is handled the same way as one raised
        # when the file is opened.
        page_texts = [page.extract_text() for page in document.pages]
    except PdfReadError:
        print("Not a pdf file...")
        return None, None
    return "\n\n".join(page_texts), document


def write_pdf(reader, out_pdf):
    """Copy all pages of an already-open PDF into a new file.

    Args:
        reader: Object exposing the source pages via ``.pages`` (the
            ``PdfReader`` returned by ``read_pdf`` in this module).
        out_pdf: Path of the PDF file to create or overwrite.
    """
    pdf_writer = PdfWriter()
    for source_page in reader.pages:
        pdf_writer.add_page(source_page)

    with Path(out_pdf).open("wb") as destination:
        pdf_writer.write(destination)

    print("Written PDF text to file.")


def analyze_document_with_azure(pdf_path, out_pdf, failed_txt):
    """Analyze a PDF with Azure Document Intelligence and save the returned PDF.

    The file is submitted to the ``prebuilt-read`` model with PDF output
    requested. When the operation completes, the generated PDF is fetched and
    written to ``out_pdf``.

    Args:
        pdf_path: Path of the PDF to upload.
        out_pdf: Destination for the PDF returned by the service.
        failed_txt: Path of an empty marker file that is (re)written when
            waiting for, fetching, or saving the result fails.

    Errors raised while submitting the document are not handled here and
    propagate to the caller. Errors raised afterwards are printed and recorded
    through ``failed_txt`` rather than re-raised.
    """
    with open(pdf_path, "rb") as source:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-read",
            body=source,
            output=[AnalyzeOutputOption.PDF],
        )

    try:
        analysis: AnalyzeResult = poller.result()
        # The operation id from the poller identifies which finished analysis
        # the PDF should be fetched for.
        result_id = poller.details["operation_id"]
        pdf_chunks = document_intelligence_client.get_analyze_result_pdf(
            model_id=analysis.model_id,
            result_id=result_id,
        )
        with open(out_pdf, "wb") as sink:
            sink.writelines(pdf_chunks)
        print("Written Azure text to file.")
    except Exception as e:
        # Leave an empty marker file next to where the output would have been.
        Path(failed_txt).write_text("")
        print(f"Error occurred in result. {e}")


def azure_process_pdfs():
    """Stage every PDF in ``Paths.RAW / "pdfs"`` under ``./data/staging/pdfs_azure``.

    For each ``*.pdf`` file:

    * If pypdf extracts at least 1,000 characters of text, the pages are
      copied unchanged to ``<stem>.pdf`` in the output directory.
    * Otherwise (too little text, or pypdf could not read the file) the
      document is sent to Azure Document Intelligence, unless it is larger
      than 1,000,000 bytes, in which case it is reported as failed and
      skipped. An Azure error leaves an empty ``<stem>.txt`` marker and/or a
      printed message; the loop then moves on to the next file.

    The output directory is created if it does not exist yet.
    """
    output_dir = Path("./data/staging/pdfs_azure")
    # open(..., "wb") does not create missing parent directories. Without
    # this, the first write raises FileNotFoundError: the text-PDF branch
    # would abort the whole batch, and the Azure branch would fail to save
    # both its result and its failure marker after the analysis has run.
    output_dir.mkdir(parents=True, exist_ok=True)

    pdfs = (Paths.RAW / "pdfs").glob("*.pdf")

    # Materialise the glob so tqdm knows the total and can show real progress.
    for pdf_path in tqdm(list(pdfs)):
        print(f"Processing {pdf_path}")

        out_pdf = output_dir / f"{pdf_path.stem}.pdf"
        failed_txt = output_dir / f"{pdf_path.stem}.txt"

        text, reader = read_pdf(pdf_path)
        # NOTE(review): len(text) also counts the "\n\n" separators that
        # read_pdf inserts between pages -- confirm this threshold behaves as
        # intended for long documents whose pages have no extractable text.
        if text is None or len(text) < 1_000:
            # Files over 1,000,000 bytes are never sent to Azure.
            if pdf_path.stat().st_size > 1_000_000:
                print(f"Processing {pdf_path} failed.")
                continue
            try:
                analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
            except Exception as e:
                # Best effort: one failing document must not stop the batch.
                print(f"Failed to use Azure: {e}")
        else:
            write_pdf(reader, out_pdf)


# Run the batch conversion only when this file is executed as a script.
if __name__ == "__main__":
    azure_process_pdfs()