File size: 2,717 Bytes
4f904d6
 
 
 
 
 
8bcdd18
 
 
4f904d6
 
 
 
8bcdd18
 
4f904d6
 
 
 
8bcdd18
0fdcea5
8bcdd18
 
 
0fdcea5
8bcdd18
 
0fdcea5
8bcdd18
4f904d6
0fdcea5
 
 
 
 
 
 
 
4f904d6
0fdcea5
4f904d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8bcdd18
4f904d6
 
 
 
0fdcea5
 
 
 
 
ef249f5
0fdcea5
 
 
 
 
 
ef249f5
 
 
 
 
 
 
 
 
0fdcea5
 
8ef72b8
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
from pathlib import Path

from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeOutputOption, AnalyzeResult
from azure.core.credentials import AzureKeyCredential
from dotenv import load_dotenv
from pypdf import PdfReader, PdfWriter
from pypdf.errors import PdfReadError
from tqdm import tqdm

from planning_ai.common.utils import Paths

# Load settings from a .env file (if any) before the environment is read below.
load_dotenv()

# Both settings fall back to an empty string when the variable is unset or
# empty, so nothing on these lines checks that they were actually provided.
# NOTE(review): presumably a missing endpoint/key only surfaces on the first
# service call — confirm.
endpoint = os.getenv("AZURE_API_ENDPOINT") or ""
credential = AzureKeyCredential(os.getenv("AZURE_API_KEY") or "")
# Module-level client, created at import time and used by
# analyze_document_with_azure.
document_intelligence_client = DocumentIntelligenceClient(endpoint, credential)


def read_pdf(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Location of the PDF to open.

    Returns:
        A ``(text, reader)`` pair: the page texts joined by blank lines and
        the ``PdfReader`` they came from. When pypdf raises ``PdfReadError``
        a message is printed and ``(None, None)`` is returned instead.
    """
    try:
        document = PdfReader(pdf_path)
        page_texts = []
        for page in document.pages:
            page_texts.append(page.extract_text())
        combined = "\n\n".join(page_texts)
    except PdfReadError:
        print("Not a pdf file...")
        return None, None
    return combined, document


def write_pdf(reader, out_pdf):
    """Copy all pages of ``reader`` into a new PDF stored at ``out_pdf``.

    Args:
        reader: Object exposing the source pages through ``.pages``.
        out_pdf: Destination path; it is opened in binary write mode.
    """
    output = PdfWriter()
    for source_page in reader.pages:
        output.add_page(source_page)

    with open(out_pdf, "wb") as destination:
        output.write(destination)

    print("Written PDF text to file.")


def analyze_document_with_azure(pdf_path, out_pdf, failed_txt):
    """Run the ``prebuilt-read`` model on a PDF and save the PDF it returns.

    Args:
        pdf_path: Path of the PDF to upload.
        out_pdf: Destination for the PDF produced by the service.
        failed_txt: Path of an empty marker file that is (re)written when
            waiting for, downloading, or saving the result fails.

    Raises:
        Anything raised while opening ``pdf_path`` or by
        ``begin_analyze_document``; those calls run outside the ``try``
        block. Later failures are caught, reported with ``print`` and
        recorded through ``failed_txt``.
    """
    with open(pdf_path, "rb") as f:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-read",
            body=f,
            output=[AnalyzeOutputOption.PDF],
        )
    try:
        result: AnalyzeResult = poller.result()
        operation_id = poller.details["operation_id"]

        response = document_intelligence_client.get_analyze_result_pdf(
            model_id=result.model_id, result_id=operation_id
        )
        # Drain the whole download before opening out_pdf: if the stream
        # breaks part-way, no truncated PDF is left next to the failure
        # marker (and an existing out_pdf is not clobbered).
        pdf_bytes = b"".join(response)
        with open(out_pdf, "wb") as writer:
            writer.write(pdf_bytes)
        print("Written Azure text to file.")
    except Exception as e:
        # Best-effort: leave an empty marker so the failure is visible on disk.
        with open(failed_txt, "w") as f:
            f.write("")
        print(f"Error occurred in result. {e}")


def azure_process_pdfs():
    """Stage every ``*.pdf`` in ``Paths.RAW / "pdfs"`` under ``./data/staging/pdfs_azure``.

    Each input is first read with ``read_pdf``:

    * At least 1,000 characters of text: the pages are copied as-is with
      ``write_pdf``.
    * Less text (or unreadable) and at most 1,000,000 bytes on disk: the
      file is handed to ``analyze_document_with_azure``.
    * Less text and larger than that: the file is skipped with a message.

    Errors raised by the Azure step are printed and do not stop the batch.
    """
    pdfs = (Paths.RAW / "pdfs").glob("*.pdf")

    staging_dir = Path("./data/staging/pdfs_azure")
    # Every output below lands in this directory. Create it up front so a
    # fresh checkout does not fail with FileNotFoundError on the first write.
    staging_dir.mkdir(parents=True, exist_ok=True)

    for pdf_path in tqdm(list(pdfs)):
        print(f"Processing {pdf_path}")

        out_pdf = staging_dir / f"{pdf_path.stem}.pdf"
        failed_txt = staging_dir / f"{pdf_path.stem}.txt"

        text, reader = read_pdf(pdf_path)
        if text is not None and len(text) >= 1_000:
            # Enough embedded text already: no OCR needed.
            write_pdf(reader, out_pdf)
            continue

        if pdf_path.stat().st_size > 1_000_000:
            # Files above the size cut-off are not sent to Azure.
            print(f"Processing {pdf_path} failed.")
            continue

        try:
            analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
        except Exception as e:
            print(f"Failed to use Azure: {e}")

# Run the full staging pass when this file is executed as a script.
if __name__ == "__main__":
    azure_process_pdfs()