Upload 3 files
Browse files- README.md +108 -3
- app.py +65 -0
- requirements.txt +4 -0
README.md
CHANGED
@@ -1,3 +1,108 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MinerUapi - PDF Processing Serverless API
|
2 |
+
|
3 |
+
基于 magic-pdf 0.9.2 版本的 PDF 处理 Serverless API。
|
4 |
+
|
5 |
+
## 功能特性
|
6 |
+
|
7 |
+
- PDF文本提取和OCR处理
|
8 |
+
- 自动识别文本PDF和OCR PDF
|
9 |
+
- 支持混合模式PDF处理
|
10 |
+
- S3存储集成
|
11 |
+
|
12 |
+
## API 使用说明
|
13 |
+
|
14 |
+
### 请求格式
|
15 |
+
|
16 |
+
json
|
17 |
+
{
|
18 |
+
"pdf_bytes": "base64编码的PDF文件内容",
|
19 |
+
"lang": "zh", // 可选,默认"zh"
|
20 |
+
"layout_model": true, // 可选,默认true
|
21 |
+
"formula_enable": true, // 可选,默认true
|
22 |
+
"table_enable": true, // 可选,默认true
|
23 |
+
"is_debug": false // 可选,默认false
|
24 |
+
}
|
25 |
+
|
26 |
+
### 响应格式
|
27 |
+
|
28 |
+
成功响应:
|
29 |
+
|
30 |
+
json
|
31 |
+
{
|
32 |
+
"status": "success",
|
33 |
+
"data": {
|
34 |
+
"_parse_type": "txt或ocr",
|
35 |
+
"_version_name": "0.9.2",
|
36 |
+
"_lang": "zh",
|
37 |
+
// 其他PDF解析结果
|
38 |
+
}
|
39 |
+
}
|
40 |
+
|
41 |
+
错误响应:
|
42 |
+
|
43 |
+
json
|
44 |
+
{
|
45 |
+
"status": "error",
|
46 |
+
"message": "错误信息"
|
47 |
+
}
|
48 |
+
|
49 |
+
## 环境变量要求
|
50 |
+
|
51 |
+
部署时需要配置以下环境变量:
|
52 |
+
|
53 |
+
- `S3_ACCESS_KEY`: S3 访问密钥 ID(Access Key)
|
54 |
+
- `S3_SECRET_KEY`: S3 私有访问密钥(Secret Key),与 Access Key 配对使用
|
55 |
+
- `S3_ENDPOINT`: S3服务端点
|
56 |
+
- `S3_BUCKET`: S3存储桶名称
|
57 |
+
|
58 |
+
## Python调用示例
|
59 |
+
|
60 |
+
python
|
61 |
+
from huggingface_hub import InferenceClient
|
62 |
+
import base64
|
63 |
+
def process_pdf(pdf_path: str, hf_token: str):
    """Send a local PDF to the MinerUapi endpoint and return its decoded reply.

    Args:
        pdf_path: Path of the PDF file to upload.
        hf_token: Hugging Face access token used to authenticate the call.

    Returns:
        The endpoint's JSON reply parsed into Python objects, so callers can
        index it directly (for example ``result["status"]``).
    """
    # Local import keeps this sample self-contained.
    import json

    # Create the client.
    client = InferenceClient(
        model="kitjesen/MinerUapi",  # replace with your own model ID
        token=hf_token,
    )

    # Read the PDF and base64-encode it for transport inside a JSON body.
    with open(pdf_path, 'rb') as f:
        pdf_bytes = base64.b64encode(f.read()).decode()

    # Send the request.
    response = client.post(json={
        "pdf_bytes": pdf_bytes,
        "lang": "zh",
        "layout_model": True,
        "formula_enable": True,
        "table_enable": True,
    })

    # ``post`` hands back the raw response body; decode it so the caller
    # receives a dict instead of bytes.
    return json.loads(response)
|
81 |
+
# 使用示例
|
82 |
+
try:
    outcome = process_pdf("example.pdf", "your_hf_token")
    if outcome["status"] != "success":
        # The service reported a failure; show its message.
        print("处理失败:", outcome["message"])
    else:
        parsed = outcome["data"]
        print("解析类型:", parsed["_parse_type"])
        print("版本:", parsed["_version_name"])
except Exception as err:
    # Any failure while calling the service or reading the reply ends up here.
    print(f"调用出错: {str(err)}")
|
91 |
+
|
92 |
+
## 注意事项
|
93 |
+
|
94 |
+
1. 文件限制:
|
95 |
+
- PDF大小建议不超过10MB
|
96 |
+
- 支持文本PDF和扫描PDF
|
97 |
+
|
98 |
+
2. 处理时间:
|
99 |
+
- 文本PDF: 通常几秒内完成
|
100 |
+
- OCR PDF: 可能需要较长时间
|
101 |
+
|
102 |
+
3. S3存储:
|
103 |
+
- 确保配置的S3权限正确
|
104 |
+
- 建议定期清理临时文件
|
105 |
+
|
106 |
+
4. 版本说明:
|
107 |
+
- 当前使用 magic-pdf 0.9.2 版本
|
108 |
+
- 建议定期检查版本更新
|
app.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict
|
2 |
+
import os
|
3 |
+
import base64
|
4 |
+
from magic_pdf.user_api import parse_union_pdf
|
5 |
+
from magic_pdf.rw import S3ReaderWriter
|
6 |
+
from loguru import logger
|
7 |
+
|
8 |
+
def get_s3_writer():
    """Create an ``S3ReaderWriter`` configured from environment variables.

    The settings are read from ``S3_ACCESS_KEY``, ``S3_SECRET_KEY``,
    ``S3_ENDPOINT`` and ``S3_BUCKET``.  All four are checked before the
    writer is constructed, so a misconfigured deployment reports every
    missing variable at once and an unrelated ``KeyError`` raised inside the
    constructor is never mistaken for a missing variable.

    Returns:
        The ``S3ReaderWriter`` built from those settings.

    Raises:
        RuntimeError: If one or more of the variables are not set; the
            message lists every missing name.
    """
    # Constructor keyword -> environment variable that supplies its value.
    env_by_kwarg = {
        "ak": "S3_ACCESS_KEY",
        "sk": "S3_SECRET_KEY",
        "endpoint": "S3_ENDPOINT",
        "bucket": "S3_BUCKET",
    }

    missing = [name for name in env_by_kwarg.values() if name not in os.environ]
    if missing:
        raise RuntimeError(
            f"Missing environment variable: {', '.join(missing)}"
        )

    settings = {kwarg: os.environ[name] for kwarg, name in env_by_kwarg.items()}
    return S3ReaderWriter(**settings)
|
19 |
+
|
20 |
+
def inference(inputs: Dict):
    """Serverless API entry point: parse a base64-encoded PDF.

    Args:
        inputs: Request payload.  ``pdf_bytes`` (the base64-encoded PDF) is
            required.  Optional keys and their defaults: ``lang`` ("zh"),
            ``layout_model`` (True), ``formula_enable`` (True),
            ``table_enable`` (True) and ``is_debug`` (False).

    Returns:
        ``{"status": "success", "data": <parse result>}`` when parsing
        succeeds, otherwise ``{"status": "error", "message": <text>}``.
        Failures are reported through the return value rather than raised.
    """
    from collections.abc import Mapping

    # Validate the payload before touching S3 or the parser.
    if not isinstance(inputs, Mapping):
        return {"status": "error", "message": "Request payload must be a JSON object"}
    if "pdf_bytes" not in inputs:
        return {"status": "error", "message": "No PDF data provided"}

    # Base64-decode the PDF content.  ``binascii.Error`` is a ``ValueError``
    # subclass, so malformed text and wrong value types are both covered.
    try:
        pdf_bytes = base64.b64decode(inputs["pdf_bytes"])
    except (TypeError, ValueError) as e:
        return {"status": "error", "message": f"Invalid PDF data: {str(e)}"}
    if not pdf_bytes:
        # Nothing to parse; fail fast instead of creating an S3 client.
        return {"status": "error", "message": "Invalid PDF data: empty payload"}

    try:
        # Initialise the S3 reader/writer handed to the parser.
        image_writer = get_s3_writer()

        # Collect parser options, falling back to the documented defaults.
        kwargs = {
            "lang": inputs.get("lang", "zh"),
            "layout_model": inputs.get("layout_model", True),
            "formula_enable": inputs.get("formula_enable", True),
            "table_enable": inputs.get("table_enable", True),
            # Documented request field that was previously ignored.
            "is_debug": inputs.get("is_debug", False),
            "input_model_is_empty": True,
        }

        # Run the parser; the empty model list is intended to make it use
        # its built-in models (per the original author's note).
        result = parse_union_pdf(
            pdf_bytes=pdf_bytes,
            pdf_models=[],
            imageWriter=image_writer,
            **kwargs
        )
    except Exception as e:
        # Top-level API boundary: log the traceback and report the failure
        # to the client instead of letting the exception escape.
        logger.exception("Error processing PDF")
        return {
            "status": "error",
            "message": str(e)
        }

    return {
        "status": "success",
        "data": result
    }
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
magic-pdf==0.9.2
|
2 |
+
boto3>=1.26.0
|
3 |
+
python-dotenv>=0.19.0
|
4 |
+
loguru>=0.6.0
|