binery committed on
Commit 16aad69 · 1 Parent(s): 8ebb2d4

Upload 8 files

Files changed (8)
  1. app.py +309 -0
  2. columns.pt +3 -0
  3. file_utils.py +109 -0
  4. packages.txt +3 -0
  5. predict.py +151 -0
  6. process.py +226 -0
  7. requirements.txt +6 -0
  8. table.pt +3 -0
app.py ADDED
@@ -0,0 +1,309 @@
import io
import re

import cv2
import PIL
import numpy as np
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from dateutil.parser import parse
from pdf2image import convert_from_bytes

from predict import PaddleOCR
from file_utils import filter_color, plot
from process import (
    filter_columns,
    extract_text_of_col,
    prepare_cols,
    process_cols,
    finalize_data,
)

# Characters that PaddleOCR commonly misreads inside numeric cells; they are
# stripped from the Withdrawal/Deposit columns before casting to float.
OCR_NOISE = r"""[iEM?t+;g^m/#'w"%rv,·: *~V-]"""


def remove_dots(value: str) -> str:
    # Strip dots from both ends of the string.
    value = value.strip(".")
    # If more than one dot remains, drop the leftmost one.
    if value.count(".") > 1:
        value = value.replace(".", "", 1)
    return value


def convert_df(df: pd.DataFrame) -> bytes:
    return df.to_csv(index=False).encode("utf-8")


def PIL_to_cv(pil_img):
    return cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)


def cv_to_PIL(cv_img):
    return PIL.Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB))


def visualize_ocr(pil_img, ocr_result):
    # Draw each OCR bounding box and its text on top of the image.
    plt.imshow(pil_img, interpolation="lanczos")
    plt.gcf().set_size_inches(20, 20)
    ax = plt.gca()

    for result in ocr_result:
        bbox = result["bbox"]
        text = result["text"]
        rect = patches.Rectangle(
            bbox[:2],
            bbox[2] - bbox[0],
            bbox[3] - bbox[1],
            linewidth=2,
            edgecolor="red",
            facecolor="none",
            linestyle="-",
        )
        ax.add_patch(rect)
        ax.text(
            bbox[0],
            bbox[1],
            text,
            horizontalalignment="left",
            verticalalignment="bottom",
            color="blue",
            fontsize=7,
        )

    plt.xticks([], [])
    plt.yticks([], [])
    plt.gcf().set_size_inches(10, 10)
    plt.axis("off")
    img_buf = io.BytesIO()
    plt.savefig(img_buf, bbox_inches="tight", dpi=150)
    plt.close()

    return PIL.Image.open(img_buf)


st.title("Extract data from bank statements")

model = PaddleOCR()  # table.pt and columns.pt are loaded once by the class

uploaded = st.file_uploader(
    "Upload a bank statement image",
    type=["png", "jpg", "jpeg", "pdf"],
)
number = st.number_input("Statement year (appended to each date)", value=2023, step=1)
apply_filter = st.checkbox("Filter color")

if st.button("Analyze image"):
    final_csv = pd.DataFrame()
    first_dataframe = True
    if uploaded is None:
        st.write("Please upload an image")
    else:
        tabs = st.tabs(
            ["Pages", "Table Detection", "Table Structure Recognition", "Extracted Table(s)"]
        )
        if uploaded.type == "application/pdf":
            pdf_pages = convert_from_bytes(uploaded.read(), 500)
            for page_enumeration, page in enumerate(pdf_pages, start=1):
                with tabs[0]:
                    st.header(f"Page: {page_enumeration}")
                    st.image(page)

                page_img = np.asarray(page)
                tables = PaddleOCR.table_model(page_img, conf=0.75)
                tables_data_raw = tables[0].boxes.data.cpu().numpy()
                tables = tables[0].boxes.xyxy.cpu().numpy()

                with tabs[1]:
                    st.header(f"Table Detection, page {page_enumeration}")

                    str_cols = st.columns(4)
                    str_cols[0].subheader("Table image")
                    str_cols[1].subheader("Columns")
                    str_cols[2].subheader("Structure result")
                    str_cols[3].subheader("Cells result")

                    results = []
                    for table in tables:
                        try:
                            # Sort the table detections by the x coordinate
                            # and merge duplicated/overlapping boxes.
                            tables_data = np.array(
                                sorted(tables_data_raw, key=lambda x: x[0]),
                                dtype=np.ndarray,
                            )
                            tables_data = filter_columns(tables_data)
                            str_cols[0].image(plot(page_img, tables_data), channels="RGB")

                            # Crop the table out of the page image.
                            sub_img = page_img[
                                int(table[1].item()): int(table[3].item()),
                                int(table[0].item()): int(table[2].item()),
                            ]

                            columns_detect = PaddleOCR.column_model(sub_img, conf=0.75)
                            cols_data = columns_detect[0].boxes.data.cpu().numpy()

                            # Sort the columns according to the x coordinate.
                            cols_data = np.array(
                                sorted(cols_data, key=lambda x: x[0]), dtype=np.ndarray
                            )

                            # Merge the duplicated columns.
                            cols_data = filter_columns(cols_data)
                            str_cols[1].image(plot(sub_img, cols_data), channels="RGB")
                        except Exception as e:
                            print(e)
                            st.warning("No Detection")

                        try:
                            columns = cols_data[:, 0:4]
                            sub_imgs = []
                            # Use the first column's vertical extent for every crop.
                            column = columns[0]
                            maxcol1 = int(column[1])
                            maxcol3 = int(column[3])

                            for column in columns:
                                # One cropped image per detected column.
                                sub_imgs.append(
                                    sub_img[maxcol1:maxcol3, int(column[0]): int(column[2])]
                                )

                            cols = []
                            thr = 0
                            for image in sub_imgs:
                                if apply_filter:
                                    # Keep only the black pixels in the image.
                                    image = filter_color(image)

                                # Extract the text of the column and its
                                # average row-height threshold.
                                res, threshold, ocr_res = extract_text_of_col(image)
                                thr += threshold

                                # Arrange the rows of the column with respect
                                # to the row-height threshold.
                                cols.append(prepare_cols(res, threshold * 0.6))

                            thr = thr / len(sub_imgs)

                            # Place each element into its row of the dataframe.
                            data = process_cols(cols, thr * 0.6)

                            # Merge the related rows together.
                            data: pd.DataFrame = finalize_data(data, page_enumeration)
                            results.append(data)

                            with tabs[2]:
                                st.header("Extracted Table(s)")
                                st.dataframe(data)

                            if first_dataframe:
                                first_dataframe = False
                                final_csv = data
                            else:
                                final_csv = pd.concat([final_csv, data], ignore_index=True)
                        except Exception:
                            st.warning("Text Extraction Failed")
                            continue

            with tabs[3]:
                st.dataframe(final_csv)
                rough_csv = convert_df(final_csv)
                st.download_button(
                    "rough-csv",
                    rough_csv,
                    "file.csv",
                    "text/csv",
                    key="rough-csv",
                )

                final_csv.columns = [
                    "page", "Date", "Transaction_Details", "Three",
                    "Deposit", "Withdrawal", "Balance",
                ]
                final_csv["Date"] = final_csv["Date"].astype(str)
                st.dataframe(final_csv)

                # Drop repeated header rows: "Date" in English or Chinese
                # (日期), including the common OCR misread 口期.
                final_csv = final_csv[~final_csv["Date"].str.contains("Date")]
                final_csv = final_csv[~final_csv["Date"].str.contains("日期")]
                final_csv = final_csv[~final_csv["Date"].str.contains("口期")]

                # Normalize the dates: strip punctuation, append the year,
                # and let dateutil parse the result.
                final_csv["Date"] = final_csv["Date"].apply(
                    lambda x: re.sub(r"[^a-zA-Z0-9 ]", "", x)
                )
                final_csv["Date"] = final_csv["Date"].apply(lambda x: x + str(number))
                final_csv["Date"] = final_csv["Date"].apply(lambda x: parse(x, fuzzy=True))
                final_csv["*Date"] = pd.to_datetime(final_csv["Date"]).dt.strftime("%d-%m-%Y")

                # Clean the numeric columns and cast them to float;
                # withdrawals become negative amounts.
                final_csv["Withdrawal"] = (
                    final_csv["Withdrawal"].astype(str).str.replace(OCR_NOISE, "", regex=True)
                )
                final_csv["Withdrawal"] = final_csv["Withdrawal"].apply(remove_dots)
                final_csv["Withdrawal"] = final_csv["Withdrawal"].astype(float) * -1
                final_csv["Deposit"] = (
                    final_csv["Deposit"].astype(str).str.replace(OCR_NOISE, "", regex=True)
                )
                final_csv["Deposit"] = final_csv["Deposit"].apply(remove_dots)
                final_csv["Deposit"] = final_csv["Deposit"].astype(float)

                final_csv["*Amount"] = (
                    final_csv["Withdrawal"].fillna(0) + final_csv["Deposit"].fillna(0)
                )
                final_csv = final_csv.drop(["Withdrawal", "Deposit"], axis=1)
                final_csv["Payee"] = ""
                final_csv["Description"] = final_csv["Transaction_Details"]
                final_csv.loc[final_csv["Three"].notnull(), "Description"] += (
                    " " + final_csv["Three"]
                )
                final_csv = final_csv.drop(["Transaction_Details", "Three"], axis=1)
                final_csv["Reference"] = ""
                final_csv["Check Number"] = ""

                df = final_csv[
                    ["*Date", "*Amount", "Payee", "Description", "Reference", "Check Number"]
                ]
                df = df[df["*Amount"] != 0]
                csv = convert_df(df)
                st.dataframe(df)
                st.download_button(
                    "Press to Download",
                    csv,
                    "file.csv",
                    "text/csv",
                    key="download-csv",
                )

# Alternative entry point that runs the full predict.py pipeline instead:
# success = st.button("Extract", on_click=model, args=[uploaded, apply_filter])
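For reference, a minimal sketch (with hypothetical sample values) of the normalization the tabs[3] block above applies to each row: strip punctuation from the date, append the statement year, parse it fuzzily, and clean a numeric cell before casting. It assumes only python-dateutil and the OCR_NOISE pattern defined at the top of app.py.

import re
from dateutil.parser import parse

OCR_NOISE = r"""[iEM?t+;g^m/#'w"%rv,·: *~V-]"""
raw_date, raw_amount, year = "15 Mar·", "1,234.56·", 2023  # hypothetical OCR output

date = re.sub(r"[^a-zA-Z0-9 ]", "", raw_date) + str(year)
print(parse(date, fuzzy=True).strftime("%d-%m-%Y"))   # -> 15-03-2023

amount = re.sub(OCR_NOISE, "", raw_amount)            # -> "1234.56"
amount = amount.strip(".")                            # drop edge dots
if amount.count(".") > 1:                             # drop a stray inner dot
    amount = amount.replace(".", "", 1)
print(float(amount))                                  # -> 1234.56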
columns.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a10312d912b64387404799ccc0a677a349e4a7534c9d0311e20febf8fef2c38f
size 22502968
file_utils.py ADDED
@@ -0,0 +1,109 @@
import math
import os

import cv2
import numpy as np
import pandas as pd
import streamlit as st
from pdf2image import convert_from_bytes


def get_img(uploaded_file):
    # Convert the uploaded file's bytes into a cv2 (BGR) image.
    file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
    img = cv2.imdecode(file_bytes, 1)
    return img


def convert_pdf_to_image(filename):
    # Return a list of images, one per PDF page, rendered at 500 DPI.
    pdf_pages = convert_from_bytes(filename, 500)
    return pdf_pages


def filter_color(img):
    # Keep only the (near-)black pixels: threshold the HSV image against a
    # black range, then invert so the text stays dark on a white background.
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # Range of black in HSV.
    lower_val = np.array([0, 0, 0])
    upper_val = np.array([179, 100, 130])

    # Threshold the HSV image to get only the black colors.
    mask = cv2.inRange(hsv, lower_val, upper_val)
    res = cv2.bitwise_not(mask)
    return res


def plot(img, boxes):
    # Draw each detection box and its confidence score on a copy of the image.
    FONT_SCALE = 1e-3
    THICKNESS_SCALE = 1e-3
    TEXT_Y_OFFSET_SCALE = 2.5e-2
    height, width, _ = img.shape

    font_scale = min(width, height) * FONT_SCALE
    thickness = math.ceil(min(width, height) * THICKNESS_SCALE)

    tmp = img.copy()
    for box in boxes:
        top_left = (int(box[0]), int(box[1]))
        bottom_right = (int(box[2]), int(box[3]))
        tmp = cv2.rectangle(tmp, top_left, bottom_right, (0, 0, 255), thickness)

        text = str(round(float(box[4]), 2))
        cv2.putText(
            tmp,
            text,
            (int(box[0]), int(box[1]) + int(height * TEXT_Y_OFFSET_SCALE)),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            (0, 0, 255),
            thickness,
        )
    return tmp


def delete_file(filename):
    if os.path.exists(filename):
        os.remove(filename)


def save_excel_file(
    idx, df: pd.DataFrame, foldername, filename, page_enumeration: int = 0
):
    # Despite the name, this writes one CSV file per extracted table.
    df.to_csv(
        f"{foldername}/{filename}page{page_enumeration}table{idx}.csv",
        index=False,
    )


def concat_csv(folder, filename: str):
    # Concatenate all per-table CSVs from the temp folder, in page order,
    # into a single CSV named `filename`.
    df = pd.DataFrame()
    foldername = folder.name
    files = list(
        sorted(
            os.listdir(foldername),
            # Numeric page order (a string sort would put page10 before page2).
            key=lambda x: int(x.split("page")[1].split("table")[0]),
        )
    )
    columns = []
    for idx, file in enumerate(files):
        tmp = pd.read_csv(f"{foldername}/{file}")
        try:
            if idx == 0:
                # The first row of the first table holds the column names.
                columns = tmp.iloc[0]
            df = pd.concat([df, tmp[1:]])
        except Exception:
            continue

    if not df.empty:
        df.columns = columns
        st.dataframe(df)
        df.to_csv(filename, index=False)
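A small usage sketch for filter_color (the file name is hypothetical): pixels outside the dark HSV range are pushed to white, which helps OCR on statements printed over colored backgrounds.

import cv2
from file_utils import filter_color

img = cv2.imread("statement.png")              # hypothetical sample image (BGR)
cleaned = filter_color(img)                    # single-channel: text stays black
cv2.imwrite("statement_filtered.png", cleaned)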
packages.txt ADDED
@@ -0,0 +1,3 @@
libpoppler-dev
libpoppler-cpp-dev
poppler-utils
predict.py ADDED
@@ -0,0 +1,151 @@
import os
import random
import string
import tempfile

import numpy as np
import pandas as pd
import streamlit as st
from ultralyticsplus import YOLO

from process import (
    filter_columns,
    extract_text_of_col,
    prepare_cols,
    process_cols,
    finalize_data,
)
from file_utils import (
    get_img,
    save_excel_file,
    concat_csv,
    convert_pdf_to_image,
    filter_color,
    plot,
    delete_file,
)


def process_img(
    img,
    page_enumeration: int = 0,
    filter=False,
    foldername: str = "",
    filename: str = "",
):
    tables = PaddleOCR.table_model(img, conf=0.75)
    tables = tables[0].boxes.xyxy.cpu().numpy()
    results = []
    for table in tables:
        try:
            # Crop the table out of the original image.
            sub_img = img[
                int(table[1].item()): int(table[3].item()),
                int(table[0].item()): int(table[2].item()),
            ]
            columns_detect = PaddleOCR.column_model(sub_img, conf=0.75)
            cols_data = columns_detect[0].boxes.data.cpu().numpy()

            # Sort the columns according to the x coordinate.
            cols_data = np.array(
                sorted(cols_data, key=lambda x: x[0]), dtype=np.ndarray
            )

            # Merge the duplicated columns.
            cols_data = filter_columns(cols_data)
            st.image(plot(sub_img, cols_data), channels="RGB")
        except Exception:
            st.warning("No Detection")

        try:
            columns = cols_data[:, 0:4]
            sub_imgs = []
            for column in columns:
                # One cropped image per detected column.
                sub_imgs.append(sub_img[:, int(column[0]): int(column[2])])
            cols = []
            thr = 0
            for image in sub_imgs:
                if filter:
                    # Keep only the black pixels in the image.
                    image = filter_color(image)

                # Extract the text of the column and its average row-height
                # threshold (the raw OCR boxes are unused here).
                res, threshold, _ = extract_text_of_col(image)
                thr += threshold

                # Arrange the rows of the column with respect to the
                # row-height threshold.
                cols.append(prepare_cols(res, threshold * 0.6))

            thr = thr / len(sub_imgs)

            # Place each element into its row of the dataframe.
            data = process_cols(cols, thr * 0.6)

            # Merge the related rows together.
            data: pd.DataFrame = finalize_data(data, page_enumeration)
            results.append(data)
        except Exception:
            st.warning("Text Extraction Failed")
            continue

    list(
        map(
            lambda x: save_excel_file(
                *x,
                foldername,
                filename,
                page_enumeration,
            ),
            enumerate(results),
        )
    )


class PaddleOCR:
    # Detection models: one for whole tables, one for their columns.
    table_model = YOLO("table.pt")
    column_model = YOLO("columns.pt")

    def __call__(self, uploaded, filter=False):
        foldername = tempfile.TemporaryDirectory(dir=os.getcwd())
        filename = uploaded.name.split(".")[0]
        if uploaded.name.split(".")[1].lower() == "pdf":
            pdf_pages = convert_pdf_to_image(uploaded.read())
            for page_enumeration, page in enumerate(pdf_pages, start=1):
                process_img(
                    np.asarray(page),
                    page_enumeration,
                    filter=filter,
                    foldername=foldername.name,
                    filename=filename,
                )
        else:
            img = get_img(uploaded)
            process_img(
                img,
                filter=filter,
                foldername=foldername.name,
                filename=filename,
            )

        # Concatenate all per-table CSV files, if there are many.
        extra = "".join(random.choices(string.ascii_uppercase, k=5))
        filename = f"{filename}_{extra}.csv"
        try:
            concat_csv(foldername, filename)
        except Exception:
            st.warning("No results found")

        foldername.cleanup()

        if os.path.exists(filename):
            with open(filename, "rb") as fp:
                st.download_button(
                    label="Download CSV file",
                    data=fp,
                    file_name=filename,
                    mime="text/csv",
                )
            delete_file(filename)
        else:
            st.warning("No results found")
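A hedged sketch of wiring the PaddleOCR pipeline into a Streamlit page, mirroring the commented-out entry point at the bottom of app.py (the uploader label is an assumption):

import streamlit as st
from predict import PaddleOCR

model = PaddleOCR()  # the YOLO weights load when predict is imported
uploaded = st.file_uploader("Bank statement", type=["png", "jpg", "jpeg", "pdf"])
if uploaded is not None:
    # Calling the model runs detection, OCR, and offers the CSV download.
    st.button("Extract", on_click=model, args=[uploaded, False])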
process.py ADDED
@@ -0,0 +1,226 @@
from paddleocr import PaddleOCR
import numpy as np
import pandas as pd

# A single shared OCR engine; "ch" also covers the Latin alphabet.
ocr = PaddleOCR(use_angle_cls=True, lang="ch")


def filter_columns(columns: np.ndarray):
    # Merge neighbouring boxes ([x1, y1, x2, y2, ...], sorted by x1) whose
    # horizontal overlap exceeds half of their average width.
    for idx, col in enumerate(columns):
        if idx >= len(columns) - 1:
            break
        nxt = columns[idx + 1]
        threshold = ((col[2] - col[0]) + (nxt[2] - nxt[0])) / 2
        if (col[2] - nxt[0]) > threshold * 0.5:
            col[1], col[2], col[3] = min(col[1], nxt[1]), nxt[2], max(col[3], nxt[3])
            columns = np.delete(columns, idx + 1, 0)
    return columns


def process_text(row):
    # Concatenate the text of one cell and return its vertical extent
    # [y_top, y_bottom] together with the joined text.
    coor = np.array([None, None])
    text = ""
    for txt in row:
        coor[0], coor[1] = (
            txt[0][0][1] if coor[0] is None or txt[0][0][1] < coor[0] else coor[0],
            txt[0][2][1] if coor[1] is None or txt[0][2][1] > coor[1] else coor[1],
        )
        text += f"{txt[1][0]} "
    text = text.strip()
    return [coor, text]


def extract_text_of_col(col_img):
    """
    Extract text from a column image and compute the average row height:
    the heights of all detected lines summed, divided by the number of
    lines. Returns the raw OCR result, the average height, and a list of
    {'bbox', 'text'} dicts for visualization.
    """
    result = ocr.ocr(col_img, cls=False)
    ocr_res = []
    for ps, (text, score) in result[0]:
        x1 = min(p[0] for p in ps)
        y1 = min(p[1] for p in ps)
        x2 = max(p[0] for p in ps)
        y2 = max(p[1] for p in ps)
        ocr_res.append({"bbox": [x1, y1, x2, y2], "text": text})

    threshold = 0
    for idx in range(len(result)):
        summ = 0
        length = len(result[idx])
        for line in result[idx]:
            summ += line[0][2][1] - line[0][0][1]
        if length > 0:
            threshold += summ / length
    return result, threshold / len(result), ocr_res


def prepare_cols(result, threshold):
    """
    Columns are separated. Add each extracted line to a row according to
    its vertical intersection with the row, measured against the average
    row height: two lines share a row only when the overlapping part is
    bigger than the threshold (e.g. half the average row height).
    Return the column as a list of arranged rows.
    """
    col = []
    for idx in range(len(result)):
        row = []
        for i, line in enumerate(result[idx]):
            if i == 0:
                row.append(line)
                if i == len(result[idx]) - 1:
                    col.append(process_text(row))
                continue
            if (
                line[0][0][1] >= row[-1][0][0][1] and line[0][2][1] >= row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(line[0][0][1] - row[-1][0][2][1]) > threshold)
            ):
                row.append(line)
            elif (
                line[0][0][1] <= row[-1][0][0][1] and line[0][2][1] <= row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(line[0][2][1] - row[-1][0][0][1]) > threshold)
            ):
                row.append(line)
            elif (
                line[0][0][1] <= row[-1][0][0][1] and line[0][2][1] >= row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(row[-1][0][2][1] - row[-1][0][0][1]) > threshold)
            ):
                row.append(line)
            elif (
                line[0][0][1] >= row[-1][0][0][1] and line[0][2][1] <= row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(line[0][0][1] - line[0][2][1]) > threshold)
            ):
                row.append(line)
            elif (
                line[0][0][1] == row[-1][0][0][1] and line[0][2][1] == row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(line[0][2][1] - row[-1][0][0][1]) > threshold)
            ):
                row.append(line)
            else:
                col.append(process_text(row))
                row = [line]
            if i == len(result[idx]) - 1:
                col.append(process_text(row))
    return col


def prepare_coordinates(cols):
    """
    Find the column with the maximum number of rows and build a mapping
    from each of its row coordinates to an empty row: a numpy array with
    one slot per detected column.
    """
    max_col = max(cols, key=len)
    array = np.empty(len(cols), dtype=object)
    array.fill(np.nan)
    coor_dict = {tuple(k[0]): array for k in max_col}
    return coor_dict


def process_cols(cols, threshold):
    """
    Loop over each element inside each column and find its place in the
    output rows using the coordinate intersection against the average row
    height; the intersection counts only when the overlapping part is
    bigger than the threshold (e.g. half the average row height).
    """
    coor_dict = prepare_coordinates(cols)
    for idx, col in enumerate(cols):
        for element in col:
            for coor, row in coor_dict.items():
                if (coor[0] >= element[0][0] and coor[1] >= element[0][1]) and (
                    (coor[1] > element[0][0])
                    and (coor[0] < element[0][1])
                    and (abs(coor[0] - element[0][1]) > threshold)
                ):
                    new = row.copy()
                    new[idx] = element[1]
                    coor_dict[coor] = new
                elif (coor[0] <= element[0][0] and coor[1] <= element[0][1]) and (
                    (coor[1] > element[0][0])
                    and (coor[0] < element[0][1])
                    and (abs(coor[1] - element[0][0]) > threshold)
                ):
                    new = row.copy()
                    new[idx] = element[1]
                    coor_dict[coor] = new
                elif (coor[0] >= element[0][0] and coor[1] <= element[0][1]) and (
                    (coor[1] > element[0][0])
                    and (coor[0] < element[0][1])
                    and (abs(coor[1] - coor[0]) > threshold)
                ):
                    new = row.copy()
                    new[idx] = element[1]
                    coor_dict[coor] = new
                elif (coor[0] <= element[0][0] and coor[1] >= element[0][1]) and (
                    (coor[1] > element[0][0])
                    and (coor[0] < element[0][1])
                    and (abs(element[0][1] - element[0][0]) > threshold)
                ):
                    new = row.copy()
                    new[idx] = element[1]
                    coor_dict[coor] = new
    data = [row for row in coor_dict.values()]
    return data


def valid_row(row):
    # A row counts as valid when its date cell or any of its last three
    # (amount/balance) cells is filled.
    return (
        (row[0] is not np.nan)
        or (row[-1] is not np.nan)
        or (row[-2] is not np.nan)
        or (row[-3] is not np.nan)
    )


def finalize_data(data: list, page_enumeration: int):
    idx = 0
    while idx <= len(data) - 1:
        row = data[idx]
        if valid_row(row) and row[0] is np.nan:
            # Copy the date from the previous row if it is missing.
            try:
                row[0] = data[idx - 1][0]
                data[idx] = row
            except Exception:
                data.pop(idx)
                idx = (idx - 1) if idx > 0 else idx
            continue
        if not valid_row(row):
            if idx == 0:
                data.pop(idx)
                continue
            for i, col in enumerate(row):
                # Merge the description into the previous row when the
                # current row is not valid on its own.
                if (col is not None) and (col is not np.nan):
                    data[idx - 1][i] = str(data[idx - 1][i]) + f" {col}"
            data.pop(idx)
            idx -= 1
            continue
        idx += 1
    # The first entry is the header row, so it receives the literal column
    # name; every other row gets the page number.
    page_idx = ["page"] + [page_enumeration for i in range(len(data) - 1)]
    data: pd.DataFrame = pd.DataFrame(data)
    data.insert(0, "page", page_idx)
    return data
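A toy check of filter_columns with made-up boxes (importing process initializes the PaddleOCR engine, so the packages in requirements.txt must be installed): two boxes in [x1, y1, x2, y2] form whose horizontal overlap exceeds half of their average width collapse into one.

import numpy as np
from process import filter_columns

boxes = np.array([[0.0, 0.0, 100.0, 50.0],    # left column
                  [20.0, 5.0, 120.0, 55.0]])  # overlaps the first by 80 px
print(filter_columns(boxes))                  # -> a single box [0, 0, 120, 55]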
requirements.txt ADDED
@@ -0,0 +1,6 @@
ultralyticsplus
streamlit
paddlepaddle
paddleocr
python-poppler
pdf2image
table.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e79562d5c516d29b475647d8f620af68ed075d80a3a7cb5de48ac05565501bf8
size 22492408