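# Genetic programming baseline built on gplearn's SymbolicRegressor. Exposes the
# train / test / repeat_train interface plus helpers for extracting the evolved
# formula as a gplearn program or as a nested prefix list.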
import time

import numpy as np
from gplearn.genetic import SymbolicRegressor
from sklearn.utils.validation import column_or_1d

import Settings as settings
from DataUtils import make_y_multi_safe

pop_size = 5000
generations = 20
p_crossover = 0.7
warm_start = False


class Genetic_Model:
    def __init__(self):
        self.name = "Genetic Model"
        self.short_name = "GP"
        self.function_set = settings.function_set.copy()
        # gplearn has no identity function, so drop "id" if it appears in the settings.
        if "id" in self.function_set:
            self.function_set.remove("id")

        self.est_gp = self._new_estimator()

    # All (re)initialization paths share the same hyperparameters, so the
    # estimator is built in one place instead of repeating the constructor call.
    def _new_estimator(self):
        return SymbolicRegressor(population_size=pop_size,
                                 generations=generations, stopping_criteria=0.01,
                                 p_crossover=p_crossover, p_subtree_mutation=0.1,
                                 p_hoist_mutation=0.05, p_point_mutation=0.1,
                                 warm_start=warm_start,
                                 max_samples=0.9, verbose=False,
                                 parsimony_coefficient=0.01,
                                 function_set=self.function_set)

    def reset(self):
        del self.est_gp
        self.est_gp = self._new_estimator()

    # Kept separate from reset() for interface compatibility; gplearn offers no
    # lighter-weight reinitialization, so both methods rebuild the estimator.
    def soft_reset(self):
        del self.est_gp
        self.est_gp = self._new_estimator()

    def predict(self, X):
        return self.est_gp.predict(X)

    def get_formula(self):
        return self.est_gp._program

    def get_simple_formula(self, digits=None):
        # digits is accepted for interface compatibility but not used:
        # the gplearn program is returned unrounded.
        return self.get_formula()

    def get_big_formula(self):
        # Convert gplearn's prefix string, e.g. "add(mul(X0, X1), sin(X0))",
        # into a nested Python list such as ['+', ['*', 'x1', 'x2'], ['sin', 'x1']].
        # Only the operators produced by this model's function set are handled.
        formula_string = str(self.get_formula())
        nested_list_string = formula_string.replace("sqrt(", "[\'sqrt\', ")
        nested_list_string = nested_list_string.replace("add(", "[\'+\', ")
        nested_list_string = nested_list_string.replace("mul(", "[\'*\', ")
        nested_list_string = nested_list_string.replace("sub(", "[\'-\', ")
        nested_list_string = nested_list_string.replace("sin(", "[\'sin\', ")
        nested_list_string = nested_list_string.replace(")", "]")
        # Mark variable tokens with 'Y' so the digit scanner below can find them.
        nested_list_string = nested_list_string.replace("X", "Y")

        retval = ""
        currently_digits = False
        current_number = ""
        for current_char in nested_list_string:
            if current_char == 'Y':
                # Start of a variable reference: emit the quoted name prefix.
                retval += "\'x"
                currently_digits = True
                current_number = ""
            elif currently_digits:
                if current_char.isdigit():
                    current_number += current_char
                else:
                    # End of the variable index: shift from 0-based X0 to 1-based x1.
                    currently_digits = False
                    retval += "{}".format(int(current_number) + 1)
                    retval += "\'{}".format(current_char)
            else:
                retval += current_char

        # Flush a trailing variable reference (e.g. when the program is just "X0").
        if currently_digits and current_number:
            retval += "{}\'".format(int(current_number) + 1)

        if "Y" in retval:
            print("ERROR: formula still contains a Y...")
            print("   formula string: {}\n   nested list string: {}".format(formula_string, nested_list_string))

        return eval(retval)

    def train(self, X, Y):
        X = np.reshape(X, [X.shape[0], -1])
        Y = np.reshape(Y, [-1, 1])
        Y = column_or_1d(Y)
        self.est_gp.fit(X, Y)
        return None

    # NOTE: despite its name, this trains only once on a random subsample;
    # num_repeats and num_steps_to_train are kept for interface compatibility.
    def repeat_train(self, x, y, test_x=None, test_y=None,
                     num_repeats=settings.num_train_repeat_processes,
                     num_steps_to_train=settings.num_train_steps_in_repeat_mode,
                     verbose=True):
        train_set_size = int(len(x) * settings.quick_train_fraction + 0.1)
        x = np.array(x)
        y = np.reshape(np.array(y), [-1, ])
        # Random train/validation split without replacement.
        sample = np.random.choice(range(x.shape[0]), size=train_set_size, replace=False)
        out_sample = [yyy for yyy in range(x.shape[0]) if yyy not in sample]

        train_x = x[sample][:]
        train_y = y[sample][:]
        valid_x = x[out_sample][:]
        valid_y = y[out_sample][:]

        old_time = time.time()

        if verbose:
            print("Beginning {} repeat sessions of {} iterations each.".format(num_repeats,
                                                                               num_steps_to_train))
            print()
            start_time = time.time()
            old_time = start_time

        self.soft_reset()
        self.train(train_x, train_y)

        current_time = time.time()
        valid_err = self.test(valid_x, valid_y)
        if verbose:
            print("Attained validation error: {:.5f}".format(valid_err))

        best_formula = self.get_simple_formula()
        if test_x is not None:
            safe_test_y = make_y_multi_safe(test_y)
            best_err = self.test(test_x, safe_test_y)
        else:
            best_err = valid_err

        if verbose:
            iters_per_minute = 60.0 / (current_time - old_time)
            print("Took {:.2f} minutes.".format((current_time - old_time) / 60))
            # Only one pass has run, so estimate the remainder from that single timing.
            print("Est. {:.2f} minutes remaining.".format((num_repeats - 1) / iters_per_minute))
            print()

        return best_formula, 0, best_err

    # Mean squared error between predictions and targets.
    def test(self, x, y):
        x = np.reshape(x, [x.shape[0], -1])
        y_hat = np.reshape(self.est_gp.predict(x), [1, -1])[0]
        y_gold = np.reshape(y, [1, -1])[0]
        return float(np.mean((y_hat - y_gold) ** 2))
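

# A minimal usage sketch (an assumption, not part of the original module): it
# presumes Settings.function_set contains gplearn-compatible operator names such
# as "add", "sub", "mul", "sin", "sqrt". It fits the model to a small synthetic
# target y = x1 * x2 + sin(x1), then prints the evolved program and its MSE on
# held-out points.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    X_demo = rng.uniform(-1, 1, size=(200, 2))
    y_demo = X_demo[:, 0] * X_demo[:, 1] + np.sin(X_demo[:, 0])

    model = Genetic_Model()
    model.train(X_demo[:150], y_demo[:150])

    print("Evolved program:", model.get_simple_formula())
    print("Held-out MSE: {:.5f}".format(model.test(X_demo[150:], y_demo[150:])))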