File size: 2,642 Bytes
3daa9d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import sys
import traceback
import pickle
import os
import concurrent.futures
from tqdm import tqdm
import time
from font_dataset.font import load_fonts
import cv2

# Multiplier applied to the per-font image count for CJK fonts.
cjk_ratio = 3

# Number of images expected per font in each split.
train_cnt, val_cnt, test_cnt = 100, 5, 30

# The same counts scaled by cjk_ratio and truncated to integers.
train_cnt_cjk, val_cnt_cjk, test_cnt_cjk = (
    int(base_cnt * cjk_ratio) for base_cnt in (train_cnt, val_cnt, test_cnt)
)

# Root directory of the image dataset; created up front when absent.
dataset_path = "./dataset/font_img"
os.makedirs(dataset_path, exist_ok=True)

# Timestamped log file name.
# NOTE(review): not referenced by the code visible in this file.
unqualified_log_file_name = f"unqualified_font_{time.time()}.txt"

# Font paths to skip in addition to those rejected by exclusion_rule.
# Starts empty.
runtime_exclusion_list = []

# load_fonts() yields the font collection and a callable that is invoked with
# a font and whose truthy result marks that font as excluded.
fonts, exclusion_rule = load_fonts()


def generate_dataset(dataset_type: str, cnt: int):
    """Verify the cached image/label pairs of one dataset split.

    For every font in ``fonts`` the files ``font_{i}_img_{j}.jpg`` and
    ``font_{i}_img_{j}.bin`` are looked up under
    ``<dataset_path>/<dataset_type>``. Nothing is generated here:

    * a pair with at least one file absent is reported as "Missing" and
      left untouched;
    * a pair whose image cannot be decoded or whose label cannot be
      unpickled is reported as "Broken" and both files are deleted.

    Fonts rejected by ``exclusion_rule`` or listed in
    ``runtime_exclusion_list`` are reported as excluded and skipped.

    Args:
        dataset_type: Name of the split; used as the sub-directory name and
            in the progress messages.
        cnt: Number of images expected per font. Fonts whose ``language``
            is ``"CJK"`` are expected to have ``int(cnt * cjk_ratio)``.
    """
    dataset_base_dir = os.path.join(dataset_path, dataset_type)
    os.makedirs(dataset_base_dir, exist_ok=True)
    font_total = len(fonts)

    def _remove_if_present(path):
        """Delete *path*, tolerating the case where it is already gone."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def _is_intact(image_file_path, label_file_path):
        """Return True when the image decodes and the label unpickles."""
        try:
            # cv2.imread reports failure by returning None, not by raising,
            # so the result has to be inspected explicitly.
            if cv2.imread(image_file_path) is None:
                return False
            # NOTE: unpickling executes code embedded in the file; only run
            # this check against datasets produced locally.
            with open(label_file_path, "rb") as f:
                pickle.load(f)
        except Exception:
            # A corrupt pickle can raise many unrelated exception types
            # (EOFError, UnpicklingError, AttributeError, ...); any of them
            # means the pair is unusable.
            return False
        return True

    def _check_single(args):
        """Check one (font index, image index, font) work item."""
        i, j, font = args
        print(
            f"Checking {dataset_type} font: {font.path} {i} / {font_total}, image {j}",
            end="\r",
        )

        if exclusion_rule(font) or font.path in runtime_exclusion_list:
            print(f"Excluded font: {font.path}")
            return

        stem = f"font_{i}_img_{j}"
        image_file_path = os.path.join(dataset_base_dir, f"{stem}.jpg")
        label_file_path = os.path.join(dataset_base_dir, f"{stem}.bin")

        # detect cache: an incomplete pair has nothing to validate, so report
        # it and stop instead of falling through to the broken-file check
        # (which previously crashed on the absent file).
        if not (
            os.path.exists(image_file_path) and os.path.exists(label_file_path)
        ):
            print(
                f"Missing {dataset_type} font: {font.path} {i} / {font_total}, image {j}"
            )
            return

        # detect broken
        if not _is_intact(image_file_path, label_file_path):
            print(
                f"Broken {dataset_type} font: {font.path} {i} / {font_total}, image {j}"
            )
            _remove_if_present(image_file_path)
            _remove_if_present(label_file_path)

    # One work item per expected image of every font.
    work_list = []
    for i, font in enumerate(fonts):
        # int() keeps this in line with the module-level *_cnt_cjk values and
        # keeps range() valid should cjk_ratio ever be fractional.
        true_cnt = int(cnt * cjk_ratio) if font.language == "CJK" else cnt
        work_list.extend((i, j, font) for j in range(true_cnt))

    for work_item in tqdm(work_list):
        _check_single(work_item)


# Check every split in turn with its per-font image count.
for _split_name, _per_font_cnt in (
    ("train", train_cnt),
    ("val", val_cnt),
    ("test", test_cnt),
):
    generate_dataset(_split_name, _per_font_cnt)