|
from typing import Dict
|
|
import os
|
|
import base64
|
|
from magic_pdf.user_api import parse_union_pdf
|
|
from magic_pdf.rw import S3ReaderWriter
|
|
from loguru import logger
|
|
|
|
def get_s3_writer():
    """Build the S3 reader/writer from environment configuration.

    Reads ``S3_ACCESS_KEY``, ``S3_SECRET_KEY``, ``S3_ENDPOINT`` and
    ``S3_BUCKET`` from the process environment and passes them to
    ``S3ReaderWriter`` as ``ak``, ``sk``, ``endpoint`` and ``bucket``.

    Returns:
        The constructed ``S3ReaderWriter`` instance.

    Raises:
        Exception: If any of the required environment variables is unset.
            The underlying ``KeyError`` is attached as ``__cause__``.
    """
    # Resolve the settings before constructing the client so that only a
    # genuinely missing variable is reported as one; a KeyError raised inside
    # the constructor must not be mislabelled as a configuration problem.
    try:
        settings = {
            "ak": os.environ["S3_ACCESS_KEY"],
            "sk": os.environ["S3_SECRET_KEY"],
            "endpoint": os.environ["S3_ENDPOINT"],
            "bucket": os.environ["S3_BUCKET"],
        }
    except KeyError as e:
        raise Exception(f"Missing environment variable: {str(e)}") from e

    # NOTE(review): keyword names are forwarded unchanged; confirm they match
    # the constructor signature of the installed S3ReaderWriter.
    return S3ReaderWriter(**settings)
|
|
|
|
def inference(inputs: Dict):
    """Serverless API entry point: parse a base64-encoded PDF.

    Args:
        inputs: Request payload. ``"pdf_bytes"`` (required) holds the
            base64-encoded document. The optional keys ``"lang"`` (default
            ``"zh"``), ``"layout_model"``, ``"formula_enable"`` and
            ``"table_enable"`` (each default ``True``) are forwarded to
            ``parse_union_pdf``.

    Returns:
        ``{"status": "success", "data": <parse result>}`` on success, or
        ``{"status": "error", "message": <text>}`` on failure. Any
        ``Exception`` raised while processing is logged and reported in the
        error form instead of propagating.
    """
    try:
        # A missing or empty payload (including None) has nothing to parse.
        if not inputs or "pdf_bytes" not in inputs:
            return {"status": "error", "message": "No PDF data provided"}

        try:
            pdf_bytes = base64.b64decode(inputs["pdf_bytes"])
        except (ValueError, TypeError) as e:
            # binascii.Error (bad padding) is a ValueError; a non-string,
            # non-bytes value raises TypeError.
            return {"status": "error", "message": f"Invalid PDF data: {str(e)}"}

        # b64decode drops non-alphabet characters by default, so garbage or
        # empty input can decode to nothing; reject it before doing any work.
        if not pdf_bytes:
            return {
                "status": "error",
                "message": "Invalid PDF data: decoded payload is empty",
            }

        image_writer = get_s3_writer()

        # NOTE(review): "layout_model" defaults to True here; confirm the
        # parser expects a boolean rather than a model name.
        parse_options = {
            "lang": inputs.get("lang", "zh"),
            "layout_model": inputs.get("layout_model", True),
            "formula_enable": inputs.get("formula_enable", True),
            "table_enable": inputs.get("table_enable", True),
            "input_model_is_empty": True,
        }

        result = parse_union_pdf(
            pdf_bytes=pdf_bytes,
            pdf_models=[],
            imageWriter=image_writer,
            **parse_options,
        )

        return {"status": "success", "data": result}

    except Exception as e:
        # Top-level boundary: log the traceback and report the failure to the
        # caller as an error response.
        logger.exception("Error processing PDF")
        return {"status": "error", "message": str(e)}