File size: 1,915 Bytes
12b0903
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import json
import os

# from sklearn.externals import joblib
import joblib
import numpy as np
import pandas as pd

# from .variables import old_ocr_req_cols
# from .skew_correction import  PageSkewWraper

# NOTE(review): neither constant is referenced in the visible part of this
# file; their meaning below is inferred and should be confirmed with callers.
# const_HW numerically equals 11 / 8.5 -- presumably a page height-to-width
# ratio. TODO confirm.
const_HW = 1.294117647
# Presumably a target page/image width in pixels. TODO confirm.
const_W = 600


def bucket_sort(df, colmn, ymax_col="ymax", ymin_col="ymin"):
    """Group consecutive rows into text lines and number the lines from 1.

    Rows are walked in their existing order (the function does not sort).
    The first row of a line fixes that line's bottom edge (its ``ymax_col``
    value); every following row whose ``ymin_col`` value lies strictly above
    that edge joins the same line.  The first row that does not overlap
    starts the next line.

    Args:
        df: DataFrame holding at least the columns named in ``colmn``.
            It is not modified.
        colmn: Names of the columns to carry into the result.  Must contain
            ``ymax_col`` and ``ymin_col``.  The list is not modified.
        ymax_col: Name of the column holding each row's bottom coordinate.
        ymin_col: Name of the column holding each row's top coordinate.

    Returns:
        A new DataFrame with the columns from ``colmn`` followed by
        ``"line_number"``, in the same row order as ``df`` and with a fresh
        default index.  It is rebuilt from a single NumPy array, so every
        column shares that array's common dtype.

    Raises:
        ValueError: If ``ymax_col`` or ``ymin_col`` is not listed in ``colmn``.
    """
    # Build our own column list so the caller's list is never appended to
    # (re-using a mutated list would produce duplicate "line_number" columns).
    base_cols = [name for name in colmn if name != "line_number"]
    columns = base_cols + ["line_number"]
    ymax_pos = columns.index(ymax_col)
    ymin_pos = columns.index(ymin_col)
    line_pos = len(columns) - 1

    # An explicit copy guarantees a writable array regardless of the pandas
    # copy-on-write mode, and keeps ``df`` itself untouched.
    values = df[base_cols].assign(line_number=0).to_numpy(copy=True)
    row_count = len(values)

    line_id = 0
    start = 0
    while start < row_count:
        line_id += 1
        # The row opening a line always belongs to it, even when its box is
        # degenerate (ymax <= ymin or NaN).  This guarantees forward progress;
        # without it such a row would never be consumed and the loop would
        # spin forever.
        values[start, line_pos] = line_id
        line_bottom = values[start, ymax_pos]

        end = start + 1
        while end < row_count and line_bottom > values[end, ymin_pos]:
            values[end, line_pos] = line_id
            end += 1
        start = end

    return pd.DataFrame(values, columns=columns)


def do_sorting(df):
    """Return the rows of ``df`` ordered by line, then left to right.

    The rows are first ordered by ``ymin`` then ``xmin``, grouped into lines
    by ``bucket_sort``, and finally ordered by ``line_number`` then ``xmin``.

    Args:
        df: DataFrame with ``xmin``, ``ymin``, ``xmax`` and ``ymax`` columns.
            It is not modified.  An existing ``line_number`` column is
            discarded (a notice is printed) and recomputed.

    Returns:
        A new DataFrame containing the input columns plus ``idx`` (the
        input's index labels) and ``line_number``, with a fresh 0..n-1 index.
    """
    # sort_values without inplace returns a new frame, so the caller's
    # DataFrame is left untouched by everything below.
    ordered = df.sort_values(["ymin", "xmin"], ascending=True)
    ordered["idx"] = ordered.index
    if "line_number" in ordered.columns:
        print("line number removed")
        ordered = ordered.drop("line_number", axis=1)

    req_colns = ["xmin", "ymin", "xmax", "ymax", "idx"]
    temp = bucket_sort(ordered.copy(), req_colns)

    # bucket_sort returns one row per input row in the same order, so the
    # line numbers can be attached by position.  Joining on "idx" instead
    # would multiply rows whenever the input index has duplicate labels.
    ordered["line_number"] = temp["line_number"].to_numpy()

    ordered = ordered.sort_values(["line_number", "xmin"], ascending=True)
    return ordered.reset_index(drop=True)