# Spaces: Sleeping  (page-status text captured along with this file; not part of the program)
import os | |
from pathlib import Path | |
from azure.ai.documentintelligence import DocumentIntelligenceClient | |
from azure.ai.documentintelligence.models import AnalyzeOutputOption, AnalyzeResult | |
from azure.core.credentials import AzureKeyCredential | |
from dotenv import load_dotenv | |
from pypdf import PdfReader, PdfWriter | |
from pypdf.errors import PdfReadError | |
from tqdm import tqdm | |
from planning_ai.common.utils import Paths | |
# Load dotenv-provided settings first so the Azure lookups below can see them.
load_dotenv()

# An unset (or empty) variable falls back to "" rather than None.
endpoint = os.environ.get("AZURE_API_ENDPOINT") or ""
credential = AzureKeyCredential(os.environ.get("AZURE_API_KEY") or "")

# Single shared client used by analyze_document_with_azure().
document_intelligence_client = DocumentIntelligenceClient(
    endpoint,
    credential,
)
def read_pdf(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Location of the PDF to open with ``PdfReader``.

    Returns:
        A ``(text, reader)`` tuple, where ``text`` is the per-page text joined
        by blank lines and ``reader`` is the open ``PdfReader``. If pypdf
        raises ``PdfReadError``, a message is printed and ``(None, None)`` is
        returned instead.
    """
    try:
        reader = PdfReader(pdf_path)
        page_texts = (page.extract_text() for page in reader.pages)
        text = "\n\n".join(page_texts)
    except PdfReadError:
        print("Not a pdf file...")
        return None, None
    return text, reader
def write_pdf(reader, out_pdf):
    """Copy all pages of an already-opened PDF into a new file.

    Args:
        reader: Object exposing ``.pages`` (the reader returned by
            ``read_pdf``).
        out_pdf: Destination path; opened in binary write mode, so an
            existing file is overwritten.
    """
    pdf_writer = PdfWriter()
    for source_page in reader.pages:
        pdf_writer.add_page(source_page)

    with open(out_pdf, "wb") as out_file:
        pdf_writer.write(out_file)

    print("Written PDF text to file.")
def analyze_document_with_azure(pdf_path, out_pdf, failed_txt):
    """Run the Azure ``prebuilt-read`` model on a PDF and save the PDF it returns.

    The file at ``pdf_path`` is submitted with PDF output requested. Once the
    operation completes, the generated PDF is fetched and written to
    ``out_pdf``.

    Args:
        pdf_path: Input PDF to submit.
        out_pdf: Where the PDF returned by the service is written.
        failed_txt: Path of an empty marker file created when fetching or
            saving the result fails.

    Errors raised while waiting for, downloading, or saving the result are
    caught: the marker file is written and the error is printed. Errors raised
    while opening or submitting the input file are not caught here.
    """
    with open(pdf_path, "rb") as pdf_file:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-read",
            body=pdf_file,
            output=[AnalyzeOutputOption.PDF],
        )

    try:
        analysis: AnalyzeResult = poller.result()
        # The operation id identifies which finished analysis to download.
        operation_id = poller.details["operation_id"]
        pdf_chunks = document_intelligence_client.get_analyze_result_pdf(
            model_id=analysis.model_id,
            result_id=operation_id,
        )
        with open(out_pdf, "wb") as out_file:
            for chunk in pdf_chunks:
                out_file.write(chunk)
        print("Written Azure text to file.")
    except Exception as e:
        # Leave an empty marker next to the expected output for this document.
        Path(failed_txt).write_text("")
        print(f"Error occurred in result. {e}")
def azure_process_pdfs():
    """Stage every PDF found under ``Paths.RAW / "pdfs"``.

    For each input file the text is first extracted with ``read_pdf``:

    * If at least 1,000 characters were extracted, the pages are copied
      as-is to the staging directory with ``write_pdf``.
    * Otherwise (unreadable file or too little text) the file is sent to
      ``analyze_document_with_azure``, unless it is larger than 1,000,000
      bytes, in which case it is skipped.

    Outputs are written to ``./data/staging/pdfs_azure/<stem>.pdf``; the
    sibling ``<stem>.txt`` path is handed to the Azure step as its failure
    marker. An exception escaping the Azure step is printed and the loop moves
    on to the next file.
    """
    staging_dir = Path("./data/staging/pdfs_azure")
    # Both writers use plain open(), which raises FileNotFoundError when the
    # target directory is missing; create it once up front so a fresh
    # checkout does not abort the whole batch.
    staging_dir.mkdir(parents=True, exist_ok=True)

    pdfs = (Paths.RAW / "pdfs").glob("*.pdf")
    # Materialise the glob so tqdm knows the total and can show progress.
    for pdf_path in tqdm(list(pdfs)):
        print(f"Processing {pdf_path}")
        out_pdf = staging_dir / f"{pdf_path.stem}.pdf"
        failed_txt = staging_dir / f"{pdf_path.stem}.txt"

        text, reader = read_pdf(pdf_path)
        if text is not None and len(text) >= 1_000:
            # Enough embedded text: keep the original pages.
            write_pdf(reader, out_pdf)
            continue

        # Too little (or no) extractable text. Files over ~1 MB are not
        # submitted to Azure and are skipped.
        if pdf_path.stat().st_size > 1_000_000:
            print(f"Processing {pdf_path} failed.")
            continue

        try:
            analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
        except Exception as e:
            print(f"Failed to use Azure: {e}")
# Run the batch only when executed as a script, not on import.
if __name__ == "__main__":
    azure_process_pdfs()