# plano_lit / utils.py
# SakshiRathi77's picture
# Upload 33 files
# 12b0903 verified
# raw
# history blame
# 1.92 kB
import json
import os
# from sklearn.externals import joblib
import joblib
import numpy as np
import pandas as pd
# from .variables import old_ocr_req_cols
# from .skew_correction import PageSkewWraper
# NOTE(review): neither constant is referenced in the visible part of this
# module; the meanings below are inferred from the names and values — confirm
# against the callers before relying on them.
# Presumably a page height-to-width ratio (1.294117647 ~= 22 / 17, i.e. an
# 11 x 8.5 page) — TODO confirm.
const_HW = 1.294117647
# Presumably a reference page width (units not shown here) — TODO confirm.
const_W = 600
def bucket_sort(df, colmn, ymax_col="ymax", ymin_col="ymin"):
    """Group bounding boxes into text lines and number the lines from 1.

    Rows are scanned in their existing order, so the caller is expected to
    have sorted ``df`` by ``ymin_col`` beforehand (``do_sorting`` does this).
    A new line starts with the first unassigned row; every following row
    whose ``ymin`` lies strictly above that first row's ``ymax`` joins the
    same line.  The reference ``ymax`` is fixed by the first box of the line
    and is deliberately not extended by later, taller boxes.

    Args:
        df: DataFrame holding at least the columns named in ``colmn``.
            It is not modified.
        colmn: Names of the columns to carry into the result; must contain
            ``ymax_col`` and ``ymin_col``.  The list is not modified.
        ymax_col: Name of the column with the bottom edge of each box.
        ymin_col: Name of the column with the top edge of each box.

    Returns:
        A new DataFrame with the columns of ``colmn`` followed by
        ``"line_number"``, one row per input row, in the input order.  The
        data passes through a single 2-D NumPy array, so all returned
        columns share that array's common dtype.

    Raises:
        ValueError: If ``ymax_col`` or ``ymin_col`` is not listed in ``colmn``.
        KeyError: If a name in ``colmn`` is not a column of ``df``.
    """
    # Work on private copies: the original implementation appended to the
    # caller's list and added a column to the caller's frame.
    base_cols = [name for name in colmn if name != "line_number"]
    columns = base_cols + ["line_number"]
    work = df[base_cols].copy()
    work["line_number"] = 0
    values = work[columns].values

    ymax_pos = columns.index(ymax_col)
    ymin_pos = columns.index(ymin_col)
    line_pos = columns.index("line_number")

    total = len(values)
    line_no = 0
    start = 0
    while start < total:
        line_no += 1
        line_ymax = values[start][ymax_pos]
        # The first box always opens the line, even when it is degenerate
        # (ymax <= ymin).  Comparing it against itself, as the old code did,
        # never advanced the cursor for such a box and looped forever.
        values[start][line_pos] = line_no
        stop = start + 1
        while stop < total and line_ymax > values[stop][ymin_pos]:
            values[stop][line_pos] = line_no
            stop += 1
        start = stop

    return pd.DataFrame(values, columns=columns)
def do_sorting(df):
    """Return the boxes of ``df`` in reading order (line by line, left to right).

    The frame is sorted by ``ymin`` then ``xmin``, each row is assigned a
    line via ``bucket_sort``, and the result is re-sorted by
    ``line_number`` then ``xmin``.

    Args:
        df: DataFrame with ``xmin``, ``ymin``, ``xmax`` and ``ymax`` columns.
            An existing ``line_number`` column is discarded and recomputed.
            The caller's frame is left untouched.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the input
        columns plus ``idx`` (the row's original index label) and
        ``line_number`` (1-based line assigned by ``bucket_sort``).
    """
    # sort_values without inplace returns a new frame, so none of the
    # changes below leak back into the caller's object.
    ordered = df.sort_values(["ymin", "xmin"], ascending=True)
    ordered["idx"] = ordered.index
    if "line_number" in ordered.columns:
        print("line number removed")
        ordered = ordered.drop("line_number", axis=1)

    coord_cols = ["xmin", "ymin", "xmax", "ymax", "idx"]
    lines = bucket_sort(ordered.copy(), coord_cols)

    # bucket_sort returns one row per input row in the same order, so the
    # line numbers can be attached positionally.  Merging on "idx" instead
    # multiplies rows whenever the original index has repeated labels.
    ordered["line_number"] = lines["line_number"].values

    ordered = ordered.sort_values(["line_number", "xmin"], ascending=True)
    return ordered.reset_index(drop=True)