import json
import os

# from sklearn.externals import joblib
import joblib
import numpy as np
import pandas as pd

# from .variables import old_ocr_req_cols
# from .skew_correction import PageSkewWraper

# NOTE(review): neither constant is used in the visible part of this module.
# const_HW is presumably a page height/width ratio (11 / 8.5 ~= 1.2941) and
# const_W a target width in pixels -- confirm against their users.
const_HW = 1.294117647
const_W = 600


def bucket_sort(df, colmn, ymax_col="ymax", ymin_col="ymin"):
    """Assign a 1-based ``line_number`` to consecutive rows that overlap vertically.

    Rows are visited in their current order. The first row of a line fixes
    the line's bottom edge (its ``ymax_col`` value); every following row whose
    ``ymin_col`` lies strictly above that edge joins the same line. The first
    row that does not opens the next line. The bottom edge is not extended by
    later rows of the line.

    Args:
        df: Frame holding at least the columns named in ``colmn``. A
            ``line_number`` column is added to (or reset on) this frame in
            place. Rows are expected to be ordered top to bottom already
            (``do_sorting`` sorts by ``ymin`` before calling).
        colmn: Names of the columns to carry into the result; must include
            ``ymax_col`` and ``ymin_col``. ``"line_number"`` is appended to
            this list in place if it is not already present.
        ymax_col: Name of the column holding each box's bottom edge.
        ymin_col: Name of the column holding each box's top edge.

    Returns:
        A new DataFrame with the columns of ``colmn`` (including
        ``line_number``) and a fresh ``RangeIndex``. Values share one dtype
        because they pass through a single NumPy array.
    """
    # Keep the historical side effect of extending the caller's list, but do
    # not add the name twice when the same list is reused across calls.
    if "line_number" not in colmn:
        colmn.append("line_number")
    df["line_number"] = 0

    # Explicit copy: ``.values`` may hand back a read-only array when pandas
    # copy-on-write is active, and the loop below writes into the array.
    rows = df[colmn].to_numpy(copy=True)
    ymax_idx = colmn.index(ymax_col)
    ymin_idx = colmn.index(ymin_col)
    line_idx = colmn.index("line_number")

    total = len(rows)
    start = 0
    line_counter = 0
    while start < total:
        line_counter += 1
        line_bottom = rows[start, ymax_idx]
        # The opening row always belongs to its own line. Testing it against
        # itself would never advance for a zero-height box (ymax <= ymin) or
        # a NaN edge, which previously caused an endless loop.
        rows[start, line_idx] = line_counter
        end = start + 1
        while end < total and line_bottom > rows[end, ymin_idx]:
            rows[end, line_idx] = line_counter
            end += 1
        start = end

    return pd.DataFrame(rows, columns=colmn)


def do_sorting(df):
    """Order boxes in reading order: by text line, then left to right.

    The frame is sorted by ``ymin``/``xmin``, grouped into lines with
    ``bucket_sort`` and finally sorted by ``line_number``/``xmin``.

    Args:
        df: Frame with ``xmin``, ``ymin``, ``xmax`` and ``ymax`` columns. It
            is modified in place: rows are re-sorted, an ``idx`` column
            holding the index labels is added, and any existing
            ``line_number`` column is dropped.

    Returns:
        A new DataFrame with the original columns plus ``idx`` and
        ``line_number``, sorted by line and ``xmin``, with a fresh
        ``RangeIndex``.
    """
    df.sort_values(["ymin", "xmin"], ascending=True, inplace=True)
    # Remember each row's index label so the line numbers can be merged back.
    df["idx"] = df.index
    if "line_number" in df.columns:
        print("line number removed")
        df.drop("line_number", axis=1, inplace=True)

    req_colns = ["xmin", "ymin", "xmax", "ymax", "idx"]
    # bucket_sort only reads these columns; hand it a copy so the temporary
    # line_number column it adds does not leak into ``df`` before the merge.
    temp = bucket_sort(df[req_colns].copy(), req_colns)

    df = df.merge(temp[["idx", "line_number"]], on="idx")
    df.sort_values(["line_number", "xmin"], ascending=True, inplace=True)
    return df.reset_index(drop=True)