mayank-youdataai committed on
Commit
74f6a97
·
verified ·
1 Parent(s): b0c683f

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +148 -0
  2. requirements.txt +19 -0
app.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tempfile
4
+ from fastapi import FastAPI, UploadFile, File, HTTPException
5
+ from paddleocr import PPStructure
6
+ import logging
7
+ import paddle
8
+
9
# Configure logging: set the root logger to INFO at import time and create the
# module-level logger used by every function below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize FastAPI app; the route decorators further down register their
# handlers on this instance.
app = FastAPI()

# Global variable for OCR engine. It stays None until init_ocr_engine() builds
# the PPStructure instance on first use; later calls reuse that same object.
ocr_engine = None
18
+
19
+ # Function to initialize the PaddleOCR engine based on GPU availability
20
def init_ocr_engine():
    """Return the shared PPStructure engine, building it on the first call.

    The engine is stored in the module-level ``ocr_engine`` variable. When it
    is still ``None`` this function asks ``is_gpu_available()`` whether to run
    on GPU, logs the choice, constructs the engine and caches it; every later
    call returns the cached instance without rebuilding it.

    Returns:
        The module-wide ``PPStructure`` instance.
    """
    global ocr_engine

    # Guard clause: an engine built by an earlier call is reused as-is.
    if ocr_engine is not None:
        return ocr_engine

    gpu_present = is_gpu_available()
    logger.info(
        "NVIDIA GPU detected, running PaddleOCR on GPU."
        if gpu_present
        else "No GPU detected, running PaddleOCR on CPU."
    )

    # Options handed to PPStructure; use_gpu mirrors the detection result.
    engine_options = {
        "table": True,
        "ocr": True,
        "show_log": True,
        "layout_score_threshold": 0.1,
        "structure_version": "PP-StructureV2",
        "use_gpu": gpu_present,
    }
    ocr_engine = PPStructure(**engine_options)
    return ocr_engine
41
+
42
+ # Function to check for GPU availability using Paddle
43
def is_gpu_available():
    """Tell whether PaddlePaddle can use a CUDA device in this process.

    Returns:
        The value of ``paddle.is_compiled_with_cuda()`` when that is falsy;
        otherwise whether ``paddle.device.cuda.device_count()`` is above zero.
    """
    cuda_build = paddle.is_compiled_with_cuda()
    if not cuda_build:
        # Without a CUDA build the device count is never consulted.
        return cuda_build
    return paddle.device.cuda.device_count() > 0
46
+
47
+ # Function to run OCR on a PDF and return the structured result (despite its name, it writes nothing to disk)
48
def perform_ocr_and_save(pdf_path, save_folder='./output'):
    """Run the shared OCR engine on ``pdf_path`` and hand back its output.

    Args:
        pdf_path: Path passed straight to the engine returned by
            ``init_ocr_engine()``.
        save_folder: Accepted for interface compatibility; this function does
            not read it and writes nothing to disk.

    Returns:
        Whatever the engine call returns. A falsy result is logged as an
        error but is still returned unchanged.
    """
    engine = init_ocr_engine()
    ocr_output = engine(pdf_path)

    if ocr_output:
        return ocr_output

    # Falsy output: report it, then pass it through for the caller to judge.
    logger.error(f"OCR failed for {pdf_path}")
    return ocr_output
58
+
59
+ # Function to format results to strings and sort them
60
def format_to_strings_and_sort(results):
    """Flatten per-page OCR output into one list ordered top-to-bottom per page.

    Args:
        results: Iterable of pages, in page order. Each page is an iterable of
            element dicts carrying the keys ``'type'``, ``'bbox'`` and
            ``'res'``. For elements whose type is ``'table'``, ``'res'`` is
            indexed with ``'html'``; for every other type ``'res'`` is iterated
            and each item is indexed with ``'text'`` and ``'confidence'``.

    Returns:
        A new list of dicts sorted by ``(page_num, y_coordinate)``, where
        ``page_num`` is 1-based and ``y_coordinate`` is ``bbox[1]`` of the
        element. Table entries hold ``page_num``, ``type``, ``html``, ``bbox``
        and ``y_coordinate``; all other entries hold ``page_num``, ``type``,
        ``text``, ``confidence``, ``bbox`` and ``y_coordinate``. Entries with
        equal sort keys keep their original relative order.
    """
    logger.info("Formatting and sorting OCR results.")
    formatted_data = []

    for page_num, elements in enumerate(results, start=1):
        for element in elements:
            # Named element_type so the builtin ``type`` is not shadowed.
            element_type = element['type']
            bbox = element['bbox']
            responses = element['res']

            if element_type == 'table':
                # A table yields exactly one entry that carries its HTML.
                formatted_data.append({
                    'page_num': page_num,
                    'type': element_type,
                    'html': responses['html'],
                    'bbox': bbox,
                    'y_coordinate': bbox[1],  # Use bbox y1 for sorting
                })
                continue

            # Any other element yields one entry per recognised text item,
            # all sharing the element's bounding box.
            for response in responses:
                formatted_data.append({
                    'page_num': page_num,
                    'type': element_type,
                    'text': response['text'],
                    'confidence': response['confidence'],
                    'bbox': bbox,
                    'y_coordinate': bbox[1],  # Use y1 coordinate for sorting
                })

    sorted_data = sorted(
        formatted_data,
        key=lambda entry: (entry['page_num'], entry['y_coordinate']),
    )

    logger.info("Sorting completed.")
    return sorted_data
94
+
95
+ # Function to save results to a JSON file
96
def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as JSON indented by four spaces.

    Args:
        data: JSON-serialisable object to dump.
        filename: Destination path; it is opened in ``"w"`` mode, so existing
            content is replaced.
    """
    logger.info(f"Saving sorted results to {filename}.")
    with open(filename, mode="w") as sink:
        json.dump(data, fp=sink, indent=4)
100
+
101
+ # FastAPI endpoint to process uploaded PDF
102
@app.post("/process-ocr/")
async def process_ocr(file: UploadFile = File(...)):
    """Run OCR on an uploaded PDF and return the formatted, sorted elements.

    The upload is written to a temporary ``.pdf`` file, passed to
    ``perform_ocr_and_save``, flattened and ordered by
    ``format_to_strings_and_sort``, dumped to ``result_json.json`` for
    debugging, and returned as the response body. The temporary file is
    removed in every case.

    Args:
        file: The uploaded file; its ``content_type`` must be
            ``application/pdf``.

    Returns:
        The list produced by ``format_to_strings_and_sort``.

    Raises:
        HTTPException: 400 when the upload is not declared as a PDF; 500 when
            OCR yields no result or any unexpected error occurs.
    """
    # Bound before the try block so the cleanup in ``finally`` never touches an
    # unbound name when validation or temp-file creation fails.
    temp_file_path = None

    try:
        # Validate file type
        if file.content_type != "application/pdf":
            logger.warning(f"Invalid file type uploaded: {file.content_type}")
            raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF file.")

        contents = await file.read()

        # Create a temporary file to store the uploaded PDF. delete=False keeps
        # it on disk after the handle closes so the OCR engine can open it by
        # path; the ``finally`` clause below removes it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            # Record the path first so cleanup still runs if the write fails.
            temp_file_path = temp_file.name
            temp_file.write(contents)
        logger.info(f"Temporary file created at: {temp_file_path}")

        # Perform OCR
        result = perform_ocr_and_save(temp_file_path)

        # The helper returns the engine output unchanged and logs a failure
        # whenever it is falsy, so apply the same test here rather than
        # checking only for None.
        if not result:
            raise HTTPException(status_code=500, detail="OCR processing failed. Check the input file.")

        # Sort and format the results
        result_json = format_to_strings_and_sort(result)

        # Optionally, save the result JSON to a file (for debugging)
        save_to_json(result_json, 'result_json.json')

        # Return sorted result as JSON
        return result_json

    except HTTPException:
        # Deliberate HTTP errors (such as the 400 above) must reach the client
        # with their own status code instead of being rewrapped as a 500.
        raise

    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception(f"An error occurred during OCR processing: {e}")
        raise HTTPException(status_code=500, detail="An error occurred during OCR processing.") from e

    finally:
        # Clean up the temporary file, if one was created.
        if temp_file_path is not None and os.path.exists(temp_file_path):
            os.remove(temp_file_path)
            logger.info(f"Temporary file {temp_file_path} deleted.")
141
+
142
+ # Endpoint to check if GPU is available
143
@app.get("/check-gpu/")
def check_gpu():
    """Report, as a small JSON payload, whether a GPU was detected.

    Returns:
        A dict with a boolean ``gpu_available`` flag and a human-readable
        ``message``, based on ``is_gpu_available()``.
    """
    if not is_gpu_available():
        return {"gpu_available": False, "message": "NVIDIA GPU is not available, using CPU instead."}
    return {"gpu_available": True, "message": "NVIDIA GPU is available and will be used."}
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn==0.31.0
3
+ paddleocr==2.8.1 # Updated to the latest available version
4
+ paddlepaddle-gpu==2.5.2 # GPU support, modify based on your CUDA version
5
+ pyMuPDF==1.22.1
6
+ numpy==1.24.4
7
+ tqdm==4.65.0
8
+ python-multipart==0.0.5
9
+ premailer==3.10.0
10
+
11
+ # fastapi
12
+ # uvicorn
13
+ # paddleocr
14
+ # paddlepaddle # For GPU support, you can modify this based on your CUDA version
15
+ # PyMuPDF
16
+ # numpy
17
+ # tqdm
18
+ # python-multipart
19
+ # premailer