# cjber's picture
# fix: change data files
# ef249f5
import os
from pathlib import Path
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeOutputOption, AnalyzeResult
from azure.core.credentials import AzureKeyCredential
from dotenv import load_dotenv
from pypdf import PdfReader, PdfWriter
from pypdf.errors import PdfReadError
from tqdm import tqdm
from planning_ai.common.utils import Paths
# Run dotenv's loader before the environment variables below are read.
load_dotenv()

# A variable that is unset (or empty) becomes "" so that the Azure
# constructors below always receive a str rather than None.
endpoint = os.environ.get("AZURE_API_ENDPOINT") or ""
credential = AzureKeyCredential(os.environ.get("AZURE_API_KEY") or "")

# Module-level client shared by analyze_document_with_azure.
document_intelligence_client = DocumentIntelligenceClient(endpoint, credential)
def read_pdf(pdf_path):
    """Extract the text of every page of a PDF using pypdf.

    Args:
        pdf_path: Location of the file to open with ``PdfReader``.

    Returns:
        A ``(text, reader)`` tuple. ``text`` is the extracted text of each
        page joined with a blank line between pages, and ``reader`` is the
        ``PdfReader`` that produced it. When pypdf raises ``PdfReadError``
        the function prints a notice and returns ``(None, None)`` instead.
    """
    try:
        pdf = PdfReader(pdf_path)
        page_texts = []
        for page in pdf.pages:
            page_texts.append(page.extract_text())
        return "\n\n".join(page_texts), pdf
    except PdfReadError:
        print("Not a pdf file...")
        return None, None
def write_pdf(reader, out_pdf):
    """Copy all pages of ``reader`` into a new PDF written to ``out_pdf``.

    Args:
        reader: Object exposing a ``pages`` iterable (as returned by
            ``read_pdf``).
        out_pdf: Destination path; opened in binary write mode.
    """
    pdf_out = PdfWriter()
    for source_page in reader.pages:
        pdf_out.add_page(source_page)

    with open(out_pdf, "wb") as sink:
        pdf_out.write(sink)

    print("Written PDF text to file.")
def analyze_document_with_azure(pdf_path, out_pdf, failed_txt):
    """Run the Azure ``prebuilt-read`` model on a PDF and save the PDF result.

    The file at ``pdf_path`` is submitted with PDF output requested. Once the
    analysis finishes, the resulting PDF is fetched with
    ``get_analyze_result_pdf`` and written to ``out_pdf``.

    Args:
        pdf_path: Path of the PDF to submit.
        out_pdf: Path the returned PDF bytes are written to.
        failed_txt: Path of an empty marker file that is written when
            fetching or saving the result raises.

    Any exception raised while waiting for, downloading, or saving the result
    is caught, reported with ``print`` and recorded via ``failed_txt``.
    Errors raised while opening or submitting the file are not caught here.
    """
    with open(pdf_path, "rb") as source:
        analysis_poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-read",
            body=source,
            output=[AnalyzeOutputOption.PDF],
        )

    try:
        result: AnalyzeResult = analysis_poller.result()
        # The operation id identifies which analysis run's PDF to download.
        result_id = analysis_poller.details["operation_id"]
        pdf_chunks = document_intelligence_client.get_analyze_result_pdf(
            model_id=result.model_id,
            result_id=result_id,
        )
        with open(out_pdf, "wb") as sink:
            sink.writelines(pdf_chunks)
        print("Written Azure text to file.")
    except Exception as err:
        # Leave an empty marker file so the failure is visible on disk.
        with open(failed_txt, "w") as marker:
            marker.write("")
        print(f"Error occurred in result. {err}")
def azure_process_pdfs():
    """Produce a text-bearing PDF for every file in ``Paths.RAW / "pdfs"``.

    For each ``*.pdf``:

    * If pypdf extracts at least 1,000 characters, the pages are copied
      unchanged to the output directory with ``write_pdf``.
    * Otherwise (unreadable, or too little text) the file is sent to Azure via
      ``analyze_document_with_azure`` -- unless it is larger than 1,000,000
      bytes, in which case it is reported and skipped.

    Outputs go to ``./data/staging/pdfs_azure/<stem>.pdf``; a failed Azure
    result leaves ``<stem>.txt`` there instead.
    """
    out_dir = Path("./data/staging/pdfs_azure")
    # Create the output directory up front. Without it the first write raises
    # FileNotFoundError: write_pdf would abort the whole batch, and the Azure
    # branch would only fail after the remote analysis had already run.
    out_dir.mkdir(parents=True, exist_ok=True)

    # Materialise the glob so tqdm knows the total and can show progress.
    pdf_paths = list((Paths.RAW / "pdfs").glob("*.pdf"))
    for pdf_path in tqdm(pdf_paths):
        print(f"Processing {pdf_path}")
        out_pdf = out_dir / f"{pdf_path.stem}.pdf"
        failed_txt = out_dir / f"{pdf_path.stem}.txt"

        text, reader = read_pdf(pdf_path)

        # Enough embedded text already: just copy the pages across.
        if text is not None and len(text) >= 1_000:
            write_pdf(reader, out_pdf)
            continue

        # Files over 1,000,000 bytes are not sent to Azure.
        if pdf_path.stat().st_size > 1_000_000:
            print(f"Processing {pdf_path} failed.")
            continue

        try:
            analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
        except Exception as e:
            # Best effort: report and move on to the next file.
            print(f"Failed to use Azure: {e}")
# Run the batch conversion only when this file is executed as a script.
if __name__ == "__main__":
    azure_process_pdfs()