binery committed on
Commit 16aad69 · 1 Parent(s): 8ebb2d4

Upload 8 files

Files changed (8)
  1. app.py +309 -0
  2. columns.pt +3 -0
  3. file_utils.py +109 -0
  4. packages.txt +3 -0
  5. predict.py +151 -0
  6. process.py +226 -0
  7. requirements.txt +6 -0
  8. table.pt +3 -0
app.py ADDED
@@ -0,0 +1,309 @@
import io
import re

import cv2
import PIL
import numpy as np
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from dateutil.parser import parse
from pdf2image import convert_from_bytes

from predict import PaddleOCR
from file_utils import filter_color, plot
from process import (
    filter_columns,
    extract_text_of_col,
    prepare_cols,
    process_cols,
    finalize_data,
)

# Characters that PaddleOCR commonly misreads inside numeric cells; they are
# stripped from the Withdrawal/Deposit columns before casting to float.
OCR_NOISE = r"""[iEM?t+;g^m/#'w"%rv,·: *~V-]"""


def remove_dots(value: str) -> str:
    # Strip dots from both ends of the string.
    value = value.strip(".")
    # If more than one dot remains, drop the leftmost one.
    if value.count(".") > 1:
        value = value.replace(".", "", 1)
    return value


def convert_df(df: pd.DataFrame) -> bytes:
    return df.to_csv(index=False).encode("utf-8")


def PIL_to_cv(pil_img):
    return cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)


def cv_to_PIL(cv_img):
    return PIL.Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB))


def visualize_ocr(pil_img, ocr_result):
    # Draw each OCR bounding box and its text on top of the image.
    plt.imshow(pil_img, interpolation="lanczos")
    plt.gcf().set_size_inches(20, 20)
    ax = plt.gca()

    for result in ocr_result:
        bbox = result["bbox"]
        text = result["text"]
        rect = patches.Rectangle(
            bbox[:2],
            bbox[2] - bbox[0],
            bbox[3] - bbox[1],
            linewidth=2,
            edgecolor="red",
            facecolor="none",
            linestyle="-",
        )
        ax.add_patch(rect)
        ax.text(
            bbox[0],
            bbox[1],
            text,
            horizontalalignment="left",
            verticalalignment="bottom",
            color="blue",
            fontsize=7,
        )

    plt.xticks([], [])
    plt.yticks([], [])
    plt.gcf().set_size_inches(10, 10)
    plt.axis("off")
    img_buf = io.BytesIO()
    plt.savefig(img_buf, bbox_inches="tight", dpi=150)
    plt.close()

    return PIL.Image.open(img_buf)


st.title("Extract data from bank statements")

model = PaddleOCR()  # table.pt and columns.pt are loaded once by the class

uploaded = st.file_uploader(
    "Upload a bank statement image",
    type=["png", "jpg", "jpeg", "pdf"],
)
number = st.number_input("Statement year (appended to each date)", value=2023, step=1)
apply_filter = st.checkbox("Filter color")

if st.button("Analyze image"):
    final_csv = pd.DataFrame()
    first_dataframe = True
    if uploaded is None:
        st.write("Please upload an image")
    else:
        tabs = st.tabs(
            ["Pages", "Table Detection", "Table Structure Recognition", "Extracted Table(s)"]
        )
        if uploaded.type == "application/pdf":
            pdf_pages = convert_from_bytes(uploaded.read(), 500)
            for page_enumeration, page in enumerate(pdf_pages, start=1):
                with tabs[0]:
                    st.header(f"Page: {page_enumeration}")
                    st.image(page)

                page_img = np.asarray(page)
                tables = PaddleOCR.table_model(page_img, conf=0.75)
                tables_data_raw = tables[0].boxes.data.cpu().numpy()
                tables = tables[0].boxes.xyxy.cpu().numpy()

                with tabs[1]:
                    st.header(f"Table Detection, page {page_enumeration}")

                    str_cols = st.columns(4)
                    str_cols[0].subheader("Table image")
                    str_cols[1].subheader("Columns")
                    str_cols[2].subheader("Structure result")
                    str_cols[3].subheader("Cells result")

                    results = []
                    for table in tables:
                        try:
                            # Sort the table detections by the x coordinate
                            # and merge duplicated/overlapping boxes.
                            tables_data = np.array(
                                sorted(tables_data_raw, key=lambda x: x[0]),
                                dtype=np.ndarray,
                            )
                            tables_data = filter_columns(tables_data)
                            str_cols[0].image(plot(page_img, tables_data), channels="RGB")

                            # Crop the table out of the page image.
                            sub_img = page_img[
                                int(table[1].item()): int(table[3].item()),
                                int(table[0].item()): int(table[2].item()),
                            ]

                            columns_detect = PaddleOCR.column_model(sub_img, conf=0.75)
                            cols_data = columns_detect[0].boxes.data.cpu().numpy()

                            # Sort the columns according to the x coordinate.
                            cols_data = np.array(
                                sorted(cols_data, key=lambda x: x[0]), dtype=np.ndarray
                            )

                            # Merge the duplicated columns.
                            cols_data = filter_columns(cols_data)
                            str_cols[1].image(plot(sub_img, cols_data), channels="RGB")
                        except Exception as e:
                            print(e)
                            st.warning("No Detection")

                        try:
                            columns = cols_data[:, 0:4]
                            sub_imgs = []
                            # Use the first column's vertical extent for every crop.
                            column = columns[0]
                            maxcol1 = int(column[1])
                            maxcol3 = int(column[3])

                            for column in columns:
                                # One cropped image per detected column.
                                sub_imgs.append(
                                    sub_img[maxcol1:maxcol3, int(column[0]): int(column[2])]
                                )

                            cols = []
                            thr = 0
                            for image in sub_imgs:
                                if apply_filter:
                                    # Keep only the black pixels in the image.
                                    image = filter_color(image)

                                # Extract the text of the column and its
                                # average row-height threshold.
                                res, threshold, ocr_res = extract_text_of_col(image)
                                thr += threshold

                                # Arrange the rows of the column with respect
                                # to the row-height threshold.
                                cols.append(prepare_cols(res, threshold * 0.6))

                            thr = thr / len(sub_imgs)

                            # Place each element into its row of the dataframe.
                            data = process_cols(cols, thr * 0.6)

                            # Merge the related rows together.
                            data: pd.DataFrame = finalize_data(data, page_enumeration)
                            results.append(data)

                            with tabs[2]:
                                st.header("Extracted Table(s)")
                                st.dataframe(data)

                            if first_dataframe:
                                first_dataframe = False
                                final_csv = data
                            else:
                                final_csv = pd.concat([final_csv, data], ignore_index=True)
                        except Exception:
                            st.warning("Text Extraction Failed")
                            continue

            with tabs[3]:
                st.dataframe(final_csv)
                rough_csv = convert_df(final_csv)
                st.download_button(
                    "rough-csv",
                    rough_csv,
                    "file.csv",
                    "text/csv",
                    key="rough-csv",
                )

                final_csv.columns = [
                    "page", "Date", "Transaction_Details", "Three",
                    "Deposit", "Withdrawal", "Balance",
                ]
                final_csv["Date"] = final_csv["Date"].astype(str)
                st.dataframe(final_csv)

                # Drop repeated header rows: "Date" in English or Chinese
                # (日期), including the common OCR misread 口期.
                final_csv = final_csv[~final_csv["Date"].str.contains("Date")]
                final_csv = final_csv[~final_csv["Date"].str.contains("日期")]
                final_csv = final_csv[~final_csv["Date"].str.contains("口期")]

                # Normalize the dates: strip punctuation, append the year,
                # and let dateutil parse the result.
                final_csv["Date"] = final_csv["Date"].apply(
                    lambda x: re.sub(r"[^a-zA-Z0-9 ]", "", x)
                )
                final_csv["Date"] = final_csv["Date"].apply(lambda x: x + str(number))
                final_csv["Date"] = final_csv["Date"].apply(lambda x: parse(x, fuzzy=True))
                final_csv["*Date"] = pd.to_datetime(final_csv["Date"]).dt.strftime("%d-%m-%Y")

                # Clean the numeric columns and cast them to float;
                # withdrawals become negative amounts.
                final_csv["Withdrawal"] = (
                    final_csv["Withdrawal"].astype(str).str.replace(OCR_NOISE, "", regex=True)
                )
                final_csv["Withdrawal"] = final_csv["Withdrawal"].apply(remove_dots)
                final_csv["Withdrawal"] = final_csv["Withdrawal"].astype(float) * -1
                final_csv["Deposit"] = (
                    final_csv["Deposit"].astype(str).str.replace(OCR_NOISE, "", regex=True)
                )
                final_csv["Deposit"] = final_csv["Deposit"].apply(remove_dots)
                final_csv["Deposit"] = final_csv["Deposit"].astype(float)

                final_csv["*Amount"] = (
                    final_csv["Withdrawal"].fillna(0) + final_csv["Deposit"].fillna(0)
                )
                final_csv = final_csv.drop(["Withdrawal", "Deposit"], axis=1)
                final_csv["Payee"] = ""
                final_csv["Description"] = final_csv["Transaction_Details"]
                final_csv.loc[final_csv["Three"].notnull(), "Description"] += (
                    " " + final_csv["Three"]
                )
                final_csv = final_csv.drop(["Transaction_Details", "Three"], axis=1)
                final_csv["Reference"] = ""
                final_csv["Check Number"] = ""

                df = final_csv[
                    ["*Date", "*Amount", "Payee", "Description", "Reference", "Check Number"]
                ]
                df = df[df["*Amount"] != 0]
                csv = convert_df(df)
                st.dataframe(df)
                st.download_button(
                    "Press to Download",
                    csv,
                    "file.csv",
                    "text/csv",
                    key="download-csv",
                )

# Alternative entry point that runs the full predict.py pipeline instead:
# success = st.button("Extract", on_click=model, args=[uploaded, apply_filter])
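For reference, a minimal sketch (with hypothetical sample values) of the normalization the tabs[3] block above applies to each row: strip punctuation from the date, append the statement year, parse it fuzzily, and clean a numeric cell before casting. It assumes only python-dateutil and the OCR_NOISE pattern defined at the top of app.py.

import re
from dateutil.parser import parse

OCR_NOISE = r"""[iEM?t+;g^m/#'w"%rv,·: *~V-]"""
raw_date, raw_amount, year = "15 Mar·", "1,234.56·", 2023  # hypothetical OCR output

date = re.sub(r"[^a-zA-Z0-9 ]", "", raw_date) + str(year)
print(parse(date, fuzzy=True).strftime("%d-%m-%Y"))   # -> 15-03-2023

amount = re.sub(OCR_NOISE, "", raw_amount)            # -> "1234.56"
amount = amount.strip(".")                            # drop edge dots
if amount.count(".") > 1:                             # drop a stray inner dot
    amount = amount.replace(".", "", 1)
print(float(amount))                                  # -> 1234.56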
columns.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a10312d912b64387404799ccc0a677a349e4a7534c9d0311e20febf8fef2c38f
size 22502968
file_utils.py ADDED
@@ -0,0 +1,109 @@
import math
import os

import cv2
import numpy as np
import pandas as pd
import streamlit as st
from pdf2image import convert_from_bytes


def get_img(uploaded_file):
    # Convert the uploaded file's bytes into a cv2 (BGR) image.
    file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
    img = cv2.imdecode(file_bytes, 1)
    return img


def convert_pdf_to_image(filename):
    # Return a list of images, one per PDF page, rendered at 500 DPI.
    pdf_pages = convert_from_bytes(filename, 500)
    return pdf_pages


def filter_color(img):
    # Keep only the (near-)black pixels: threshold the HSV image against a
    # black range, then invert so the text stays dark on a white background.
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # Range of black in HSV.
    lower_val = np.array([0, 0, 0])
    upper_val = np.array([179, 100, 130])

    # Threshold the HSV image to get only the black colors.
    mask = cv2.inRange(hsv, lower_val, upper_val)
    res = cv2.bitwise_not(mask)
    return res


def plot(img, boxes):
    # Draw each detection box and its confidence score on a copy of the image.
    FONT_SCALE = 1e-3
    THICKNESS_SCALE = 1e-3
    TEXT_Y_OFFSET_SCALE = 2.5e-2
    height, width, _ = img.shape

    font_scale = min(width, height) * FONT_SCALE
    thickness = math.ceil(min(width, height) * THICKNESS_SCALE)

    tmp = img.copy()
    for box in boxes:
        top_left = (int(box[0]), int(box[1]))
        bottom_right = (int(box[2]), int(box[3]))
        tmp = cv2.rectangle(tmp, top_left, bottom_right, (0, 0, 255), thickness)

        text = str(round(float(box[4]), 2))
        cv2.putText(
            tmp,
            text,
            (int(box[0]), int(box[1]) + int(height * TEXT_Y_OFFSET_SCALE)),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            (0, 0, 255),
            thickness,
        )
    return tmp


def delete_file(filename):
    if os.path.exists(filename):
        os.remove(filename)


def save_excel_file(
    idx, df: pd.DataFrame, foldername, filename, page_enumeration: int = 0
):
    # Despite the name, this writes one CSV file per extracted table.
    df.to_csv(
        f"{foldername}/{filename}page{page_enumeration}table{idx}.csv",
        index=False,
    )


def concat_csv(folder, filename: str):
    # Concatenate all per-table CSVs from the temp folder, in page order,
    # into a single CSV named `filename`.
    df = pd.DataFrame()
    foldername = folder.name
    files = list(
        sorted(
            os.listdir(foldername),
            # Numeric page order (a string sort would put page10 before page2).
            key=lambda x: int(x.split("page")[1].split("table")[0]),
        )
    )
    columns = []
    for idx, file in enumerate(files):
        tmp = pd.read_csv(f"{foldername}/{file}")
        try:
            if idx == 0:
                # The first row of the first table holds the column names.
                columns = tmp.iloc[0]
            df = pd.concat([df, tmp[1:]])
        except Exception:
            continue

    if not df.empty:
        df.columns = columns
        st.dataframe(df)
        df.to_csv(filename, index=False)
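A small usage sketch for filter_color (the file name is hypothetical): pixels outside the dark HSV range are pushed to white, which helps OCR on statements printed over colored backgrounds.

import cv2
from file_utils import filter_color

img = cv2.imread("statement.png")              # hypothetical sample image (BGR)
cleaned = filter_color(img)                    # single-channel: text stays black
cv2.imwrite("statement_filtered.png", cleaned)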
packages.txt ADDED
@@ -0,0 +1,3 @@
libpoppler-dev
libpoppler-cpp-dev
poppler-utils
predict.py ADDED
@@ -0,0 +1,151 @@
import os
import random
import string
import tempfile

import numpy as np
import pandas as pd
import streamlit as st
from ultralyticsplus import YOLO

from process import (
    filter_columns,
    extract_text_of_col,
    prepare_cols,
    process_cols,
    finalize_data,
)
from file_utils import (
    get_img,
    save_excel_file,
    concat_csv,
    convert_pdf_to_image,
    filter_color,
    plot,
    delete_file,
)


def process_img(
    img,
    page_enumeration: int = 0,
    filter=False,
    foldername: str = "",
    filename: str = "",
):
    tables = PaddleOCR.table_model(img, conf=0.75)
    tables = tables[0].boxes.xyxy.cpu().numpy()
    results = []
    for table in tables:
        try:
            # Crop the table out of the original image.
            sub_img = img[
                int(table[1].item()): int(table[3].item()),
                int(table[0].item()): int(table[2].item()),
            ]
            columns_detect = PaddleOCR.column_model(sub_img, conf=0.75)
            cols_data = columns_detect[0].boxes.data.cpu().numpy()

            # Sort the columns according to the x coordinate.
            cols_data = np.array(
                sorted(cols_data, key=lambda x: x[0]), dtype=np.ndarray
            )

            # Merge the duplicated columns.
            cols_data = filter_columns(cols_data)
            st.image(plot(sub_img, cols_data), channels="RGB")
        except Exception:
            st.warning("No Detection")

        try:
            columns = cols_data[:, 0:4]
            sub_imgs = []
            for column in columns:
                # One cropped image per detected column.
                sub_imgs.append(sub_img[:, int(column[0]): int(column[2])])
            cols = []
            thr = 0
            for image in sub_imgs:
                if filter:
                    # Keep only the black pixels in the image.
                    image = filter_color(image)

                # Extract the text of the column and its average row-height
                # threshold (the raw OCR boxes are unused here).
                res, threshold, _ = extract_text_of_col(image)
                thr += threshold

                # Arrange the rows of the column with respect to the
                # row-height threshold.
                cols.append(prepare_cols(res, threshold * 0.6))

            thr = thr / len(sub_imgs)

            # Place each element into its row of the dataframe.
            data = process_cols(cols, thr * 0.6)

            # Merge the related rows together.
            data: pd.DataFrame = finalize_data(data, page_enumeration)
            results.append(data)
        except Exception:
            st.warning("Text Extraction Failed")
            continue

    list(
        map(
            lambda x: save_excel_file(
                *x,
                foldername,
                filename,
                page_enumeration,
            ),
            enumerate(results),
        )
    )


class PaddleOCR:
    # Detection models: one for whole tables, one for their columns.
    table_model = YOLO("table.pt")
    column_model = YOLO("columns.pt")

    def __call__(self, uploaded, filter=False):
        foldername = tempfile.TemporaryDirectory(dir=os.getcwd())
        filename = uploaded.name.split(".")[0]
        if uploaded.name.split(".")[1].lower() == "pdf":
            pdf_pages = convert_pdf_to_image(uploaded.read())
            for page_enumeration, page in enumerate(pdf_pages, start=1):
                process_img(
                    np.asarray(page),
                    page_enumeration,
                    filter=filter,
                    foldername=foldername.name,
                    filename=filename,
                )
        else:
            img = get_img(uploaded)
            process_img(
                img,
                filter=filter,
                foldername=foldername.name,
                filename=filename,
            )

        # Concatenate all per-table CSV files, if there are many.
        extra = "".join(random.choices(string.ascii_uppercase, k=5))
        filename = f"{filename}_{extra}.csv"
        try:
            concat_csv(foldername, filename)
        except Exception:
            st.warning("No results found")

        foldername.cleanup()

        if os.path.exists(filename):
            with open(filename, "rb") as fp:
                st.download_button(
                    label="Download CSV file",
                    data=fp,
                    file_name=filename,
                    mime="text/csv",
                )
            delete_file(filename)
        else:
            st.warning("No results found")
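A hedged sketch of wiring the PaddleOCR pipeline into a Streamlit page, mirroring the commented-out entry point at the bottom of app.py (the uploader label is an assumption):

import streamlit as st
from predict import PaddleOCR

model = PaddleOCR()  # the YOLO weights load when predict is imported
uploaded = st.file_uploader("Bank statement", type=["png", "jpg", "jpeg", "pdf"])
if uploaded is not None:
    # Calling the model runs detection, OCR, and offers the CSV download.
    st.button("Extract", on_click=model, args=[uploaded, False])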
process.py ADDED
@@ -0,0 +1,226 @@
from paddleocr import PaddleOCR
import numpy as np
import pandas as pd

# A single shared OCR engine; "ch" also covers the Latin alphabet.
ocr = PaddleOCR(use_angle_cls=True, lang="ch")


def filter_columns(columns: np.ndarray):
    # Merge neighbouring boxes ([x1, y1, x2, y2, ...], sorted by x1) whose
    # horizontal overlap exceeds half of their average width.
    for idx, col in enumerate(columns):
        if idx >= len(columns) - 1:
            break
        nxt = columns[idx + 1]
        threshold = ((col[2] - col[0]) + (nxt[2] - nxt[0])) / 2
        if (col[2] - nxt[0]) > threshold * 0.5:
            col[1], col[2], col[3] = min(col[1], nxt[1]), nxt[2], max(col[3], nxt[3])
            columns = np.delete(columns, idx + 1, 0)
    return columns


def process_text(row):
    # Concatenate the text of one cell and return its vertical extent
    # [y_top, y_bottom] together with the joined text.
    coor = np.array([None, None])
    text = ""
    for txt in row:
        coor[0], coor[1] = (
            txt[0][0][1] if coor[0] is None or txt[0][0][1] < coor[0] else coor[0],
            txt[0][2][1] if coor[1] is None or txt[0][2][1] > coor[1] else coor[1],
        )
        text += f"{txt[1][0]} "
    text = text.strip()
    return [coor, text]


def extract_text_of_col(col_img):
    """
    Extract text from a column image and compute the average row height:
    the heights of all detected lines summed, divided by the number of
    lines. Returns the raw OCR result, the average height, and a list of
    {'bbox', 'text'} dicts for visualization.
    """
    result = ocr.ocr(col_img, cls=False)
    ocr_res = []
    for ps, (text, score) in result[0]:
        x1 = min(p[0] for p in ps)
        y1 = min(p[1] for p in ps)
        x2 = max(p[0] for p in ps)
        y2 = max(p[1] for p in ps)
        ocr_res.append({"bbox": [x1, y1, x2, y2], "text": text})

    threshold = 0
    for idx in range(len(result)):
        summ = 0
        length = len(result[idx])
        for line in result[idx]:
            summ += line[0][2][1] - line[0][0][1]
        if length > 0:
            threshold += summ / length
    return result, threshold / len(result), ocr_res


def prepare_cols(result, threshold):
    """
    Columns are separated. Add each extracted line to a row according to
    its vertical intersection with the row, measured against the average
    row height: two lines share a row only when the overlapping part is
    bigger than the threshold (e.g. half the average row height).
    Return the column as a list of arranged rows.
    """
    col = []
    for idx in range(len(result)):
        row = []
        for i, line in enumerate(result[idx]):
            if i == 0:
                row.append(line)
                if i == len(result[idx]) - 1:
                    col.append(process_text(row))
                continue
            if (
                line[0][0][1] >= row[-1][0][0][1] and line[0][2][1] >= row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(line[0][0][1] - row[-1][0][2][1]) > threshold)
            ):
                row.append(line)
            elif (
                line[0][0][1] <= row[-1][0][0][1] and line[0][2][1] <= row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(line[0][2][1] - row[-1][0][0][1]) > threshold)
            ):
                row.append(line)
            elif (
                line[0][0][1] <= row[-1][0][0][1] and line[0][2][1] >= row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(row[-1][0][2][1] - row[-1][0][0][1]) > threshold)
            ):
                row.append(line)
            elif (
                line[0][0][1] >= row[-1][0][0][1] and line[0][2][1] <= row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(line[0][0][1] - line[0][2][1]) > threshold)
            ):
                row.append(line)
            elif (
                line[0][0][1] == row[-1][0][0][1] and line[0][2][1] == row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(line[0][2][1] - row[-1][0][0][1]) > threshold)
            ):
                row.append(line)
            else:
                col.append(process_text(row))
                row = [line]
            if i == len(result[idx]) - 1:
                col.append(process_text(row))
    return col


def prepare_coordinates(cols):
    """
    Find the column with the maximum number of rows and build a mapping
    from each of its row coordinates to an empty row: a numpy array with
    one slot per detected column.
    """
    max_col = max(cols, key=len)
    array = np.empty(len(cols), dtype=object)
    array.fill(np.nan)
    coor_dict = {tuple(k[0]): array for k in max_col}
    return coor_dict


def process_cols(cols, threshold):
    """
    Loop over each element inside each column and find its place in the
    output rows using the coordinate intersection against the average row
    height; the intersection counts only when the overlapping part is
    bigger than the threshold (e.g. half the average row height).
    """
    coor_dict = prepare_coordinates(cols)
    for idx, col in enumerate(cols):
        for element in col:
            for coor, row in coor_dict.items():
                if (coor[0] >= element[0][0] and coor[1] >= element[0][1]) and (
                    (coor[1] > element[0][0])
                    and (coor[0] < element[0][1])
                    and (abs(coor[0] - element[0][1]) > threshold)
                ):
                    new = row.copy()
                    new[idx] = element[1]
                    coor_dict[coor] = new
                elif (coor[0] <= element[0][0] and coor[1] <= element[0][1]) and (
                    (coor[1] > element[0][0])
                    and (coor[0] < element[0][1])
                    and (abs(coor[1] - element[0][0]) > threshold)
                ):
                    new = row.copy()
                    new[idx] = element[1]
                    coor_dict[coor] = new
                elif (coor[0] >= element[0][0] and coor[1] <= element[0][1]) and (
                    (coor[1] > element[0][0])
                    and (coor[0] < element[0][1])
                    and (abs(coor[1] - coor[0]) > threshold)
                ):
                    new = row.copy()
                    new[idx] = element[1]
                    coor_dict[coor] = new
                elif (coor[0] <= element[0][0] and coor[1] >= element[0][1]) and (
                    (coor[1] > element[0][0])
                    and (coor[0] < element[0][1])
                    and (abs(element[0][1] - element[0][0]) > threshold)
                ):
                    new = row.copy()
                    new[idx] = element[1]
                    coor_dict[coor] = new
    data = [row for row in coor_dict.values()]
    return data


def valid_row(row):
    # A row counts as valid when its date cell or any of its last three
    # (amount/balance) cells is filled.
    return (
        (row[0] is not np.nan)
        or (row[-1] is not np.nan)
        or (row[-2] is not np.nan)
        or (row[-3] is not np.nan)
    )


def finalize_data(data: list, page_enumeration: int):
    idx = 0
    while idx <= len(data) - 1:
        row = data[idx]
        if valid_row(row) and row[0] is np.nan:
            # Copy the date from the previous row if it is missing.
            try:
                row[0] = data[idx - 1][0]
                data[idx] = row
            except Exception:
                data.pop(idx)
                idx = (idx - 1) if idx > 0 else idx
            continue
        if not valid_row(row):
            if idx == 0:
                data.pop(idx)
                continue
            for i, col in enumerate(row):
                # Merge the description into the previous row when the
                # current row is not valid on its own.
                if (col is not None) and (col is not np.nan):
                    data[idx - 1][i] = str(data[idx - 1][i]) + f" {col}"
            data.pop(idx)
            idx -= 1
            continue
        idx += 1
    # The first entry is the header row, so it receives the literal column
    # name; every other row gets the page number.
    page_idx = ["page"] + [page_enumeration for i in range(len(data) - 1)]
    data: pd.DataFrame = pd.DataFrame(data)
    data.insert(0, "page", page_idx)
    return data
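A toy check of filter_columns with made-up boxes (importing process initializes the PaddleOCR engine, so the packages in requirements.txt must be installed): two boxes in [x1, y1, x2, y2] form whose horizontal overlap exceeds half of their average width collapse into one.

import numpy as np
from process import filter_columns

boxes = np.array([[0.0, 0.0, 100.0, 50.0],    # left column
                  [20.0, 5.0, 120.0, 55.0]])  # overlaps the first by 80 px
print(filter_columns(boxes))                  # -> a single box [0, 0, 120, 55]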
requirements.txt ADDED
@@ -0,0 +1,6 @@
ultralyticsplus
streamlit
paddlepaddle
paddleocr
python-poppler
pdf2image
table.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e79562d5c516d29b475647d8f620af68ed075d80a3a7cb5de48ac05565501bf8
size 22492408