Commit 2fc2c1f by Priyanka-Kumavat-At-TE: Upload 19 files
Parent(s): 3eb0b43
- supv/__init__.py +0 -0
- supv/bacl.py +493 -0
- supv/basic_nn.py +293 -0
- supv/fftn.py +240 -0
- supv/gbt.py +482 -0
- supv/gcn.py +444 -0
- supv/knn.py +106 -0
- supv/lrd.py +112 -0
- supv/lstm.py +414 -0
- supv/mcalib.py +384 -0
- supv/mcclf.py +207 -0
- supv/nlm.py +434 -0
- supv/optunar.py +127 -0
- supv/pasearch.py +243 -0
- supv/regress.py +253 -0
- supv/rf.py +134 -0
- supv/svm.py +141 -0
- supv/svml.py +428 -0
- supv/tnn.py +789 -0
supv/__init__.py
ADDED
File without changes
supv/bacl.py
ADDED
@@ -0,0 +1,493 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import matplotlib
import random
import jprops
from io import StringIO
from sklearn.model_selection import cross_val_score
import joblib
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *

#base classifier class
class BaseClassifier(object):

    def __init__(self, configFile, defValues, mname):
        self.config = Configuration(configFile, defValues)
        self.subSampleRate = None
        self.featData = None
        self.clsData = None
        self.classifier = None
        self.trained = False
        self.verbose = self.config.getBooleanConfig("common.verbose")[0]
        logFilePath = self.config.getStringConfig("common.logging.file")[0]
        logLevName = self.config.getStringConfig("common.logging.level")[0]
        self.logger = createLogger(mname, logFilePath, logLevName)
        self.logger.info("********* starting session")

    def initConfig(self, configFile, defValues):
        """
        initialize config
        """
        self.config = Configuration(configFile, defValues)

    def getConfig(self):
        """
        get config object
        """
        return self.config

    def setConfigParam(self, name, value):
        """
        set config param
        """
        self.config.setParam(name, value)

    def getMode(self):
        """
        get mode
        """
        return self.config.getStringConfig("common.mode")[0]

    def getSearchParamStrategy(self):
        """
        get search parameter strategy
        """
        return self.config.getStringConfig("train.search.param.strategy")[0]

    def train(self):
        """
        train model
        """
        #build model
        self.buildModel()

        # training data
        if self.featData is None:
            (featData, clsData) = self.prepTrainingData()
            (self.featData, self.clsData) = (featData, clsData)
        else:
            (featData, clsData) = (self.featData, self.clsData)
        if self.subSampleRate is not None:
            (featData, clsData) = subSample(featData, clsData, self.subSampleRate, False)
            self.logger.info("subsample size " + str(featData.shape[0]))

        # parameters
        modelSave = self.config.getBooleanConfig("train.model.save")[0]

        #train
        self.logger.info("...training model")
        self.classifier.fit(featData, clsData)
        score = self.classifier.score(featData, clsData)
        successCriterion = self.config.getStringConfig("train.success.criterion")[0]
        result = None
        if successCriterion == "accuracy":
            self.logger.info("accuracy with training data {:06.3f}".format(score))
            result = score
        elif successCriterion == "error":
            error = 1.0 - score
            self.logger.info("error with training data {:06.3f}".format(error))
            result = error
        else:
            raise ValueError("invalid success criterion")

        if modelSave:
            self.logger.info("...saving model")
            modelFilePath = self.getModelFilePath()
            joblib.dump(self.classifier, modelFilePath)
        self.trained = True
        return result

    def trainValidate(self):
        """
        train with k fold validation
        """
        #build model
        self.buildModel()

        # training data
        (featData, clsData) = self.prepTrainingData()

        #parameter
        validation = self.config.getStringConfig("train.validation")[0]
        numFolds = self.config.getIntConfig("train.num.folds")[0]
        successCriterion = self.config.getStringConfig("train.success.criterion")[0]
        scoreMethod = self.config.getStringConfig("train.score.method")[0]

        #train with validation
        self.logger.info("...training and kfold cross validating model")
        scores = cross_val_score(self.classifier, featData, clsData, cv=numFolds, scoring=scoreMethod)
        avScore = np.mean(scores)
        result = self.reportResult(avScore, successCriterion, scoreMethod)
        return result

    def trainValidateSearch(self):
        """
        train with k fold validation and search parameter space for optimum
        """
        self.logger.info("...starting train validate with parameter search")
        searchStrategyName = self.getSearchParamStrategy()
        if searchStrategyName is not None:
            if searchStrategyName == "grid":
                searchStrategy = GuidedParameterSearch(self.verbose)
            elif searchStrategyName == "random":
                searchStrategy = RandomParameterSearch(self.verbose)
                maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
                searchStrategy.setMaxIter(maxIter)
            elif searchStrategyName == "simuan":
                searchStrategy = SimulatedAnnealingParameterSearch(self.verbose)
                maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
                searchStrategy.setMaxIter(maxIter)
                temp = self.config.getFloatConfig("train.search.sa.temp")[0]
                searchStrategy.setTemp(temp)
                tempRedRate = self.config.getFloatConfig("train.search.sa.temp.red.rate")[0]
                searchStrategy.setTempReductionRate(tempRedRate)
            else:
                raise ValueError("invalid parameter search strategy")
        else:
            raise ValueError("missing search strategy")

        # add search params
        searchParams = self.config.getStringConfig("train.search.params")[0].split(",")
        searchParamNames = []
        extSearchParamNames = []
        if searchParams is not None:
            for searchParam in searchParams:
                paramItems = searchParam.split(":")
                extSearchParamNames.append(paramItems[0])

                #strip the "search" component from the param name
                paramNameItems = paramItems[0].split(".")
                del paramNameItems[1]
                paramItems[0] = ".".join(paramNameItems)

                searchStrategy.addParam(paramItems)
                searchParamNames.append(paramItems[0])
        else:
            raise ValueError("missing search parameter list")

        # add search param data list for each param
        for (searchParamName, extSearchParamName) in zip(searchParamNames, extSearchParamNames):
            searchParamData = self.config.getStringConfig(extSearchParamName)[0].split(",")
            searchStrategy.addParamVaues(searchParamName, searchParamData)

        # train and validate for various param value combinations
        searchStrategy.prepare()
        paramValues = searchStrategy.nextParamValues()
        searchResults = []
        while paramValues is not None:
            self.logger.info("...next parameter set")
            paramStr = ""
            for paramValue in paramValues:
                self.setConfigParam(paramValue[0], str(paramValue[1]))
                paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
            result = self.trainValidate()
            searchStrategy.setCost(result)
            searchResults.append((paramStr, result))
            paramValues = searchStrategy.nextParamValues()

        # output
        self.logger.info("all parameter search results")
        for searchResult in searchResults:
            self.logger.info("{}\t{:06.3f}".format(searchResult[0], searchResult[1]))

        self.logger.info("best parameter search result")
        bestSolution = searchStrategy.getBestSolution()
        paramStr = ""
        for paramValue in bestSolution[0]:
            paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
        self.logger.info("{}\t{:06.3f}".format(paramStr, bestSolution[1]))
        return bestSolution

    def validate(self):
        """
        validate model with labeled test data
        """
        # create model
        useSavedModel = self.config.getBooleanConfig("validate.use.saved.model")[0]
        if useSavedModel:
            # load saved model
            self.logger.info("...loading model")
            modelFilePath = self.getModelFilePath()
            self.classifier = joblib.load(modelFilePath)
        else:
            # train model
            if not self.trained:
                self.train()

        # prepare test data
        (featData, clsDataActual) = self.prepValidationData()

        #predict
        self.logger.info("...predicting")
        clsDataPred = self.classifier.predict(featData)

        self.logger.info("...validating")
        #print(clsDataPred)
        scoreMethod = self.config.getStringConfig("validate.score.method")[0]
        if scoreMethod == "accuracy":
            accuracy = sk.metrics.accuracy_score(clsDataActual, clsDataPred)
            self.logger.info("accuracy:")
            self.logger.info(accuracy)
        elif scoreMethod == "confusionMatrix":
            confMatrx = sk.metrics.confusion_matrix(clsDataActual, clsDataPred)
            self.logger.info("confusion matrix:")
            self.logger.info(confMatrx)


    def predictx(self):
        """
        predict with file based data
        """
        # create model
        self.prepModel()

        # prepare test data
        featData = self.prepPredictData()

        #predict
        self.logger.info("...predicting")
        clsData = self.classifier.predict(featData)
        self.logger.info(clsData)

    def predict(self, recs=None):
        """
        predict with in memory data
        """
        # create model
        self.prepModel()

        #input record
        if recs:
            #passed record
            featData = self.prepStringPredictData(recs)
            if (featData.ndim == 1):
                featData = featData.reshape(1, -1)
        else:
            #file
            featData = self.prepPredictData()

        #predict
        self.logger.info("...predicting")
        clsData = self.classifier.predict(featData)
        return clsData

    def predictProb(self, recs):
        """
        predict probability with in memory data
        """
        raise ValueError("can not predict class probability")

    def prepModel(self):
        """
        prepare model
        """
        useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
        if (useSavedModel and not self.classifier):
            # load saved model
            self.logger.info("...loading saved model")
            modelFilePath = self.getModelFilePath()
            self.classifier = joblib.load(modelFilePath)
        else:
            # train model
            if not self.trained:
                self.train()

    def prepTrainingData(self):
        """
        loads and prepares training data
        """
        # parameters
        dataFile = self.config.getStringConfig("train.data.file")[0]
        fieldIndices = self.config.getStringConfig("train.data.fields")[0]
        if fieldIndices is not None:
            fieldIndices = strToIntArray(fieldIndices, ",")
        featFieldIndices = self.config.getStringConfig("train.data.feature.fields")[0]
        if featFieldIndices is not None:
            featFieldIndices = strToIntArray(featFieldIndices, ",")
        classFieldIndex = self.config.getIntConfig("train.data.class.field")[0]

        #training data
        (data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
        if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
            scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
            featData = scaleData(featData, scalingMethod)

        clsData = extrColumns(data, classFieldIndex)
        clsData = np.array([int(a) for a in clsData])
        return (featData, clsData)

    def prepValidationData(self):
        """
        loads and prepares validation data
        """
        # parameters
        dataFile = self.config.getStringConfig("validate.data.file")[0]
        fieldIndices = self.config.getStringConfig("validate.data.fields")[0]
        if fieldIndices is not None:
            fieldIndices = strToIntArray(fieldIndices, ",")
        featFieldIndices = self.config.getStringConfig("validate.data.feature.fields")[0]
        if featFieldIndices is not None:
            featFieldIndices = strToIntArray(featFieldIndices, ",")
        classFieldIndex = self.config.getIntConfig("validate.data.class.field")[0]

        #validation data
        (data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
        if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
            scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
            featData = scaleData(featData, scalingMethod)
        clsData = extrColumns(data, classFieldIndex)
        clsData = [int(a) for a in clsData]
        return (featData, clsData)

    def prepPredictData(self):
        """
        loads and prepares prediction data
        """
        # parameters
        dataFile = self.config.getStringConfig("predict.data.file")[0]
        if dataFile is None:
            raise ValueError("missing prediction data file")
        fieldIndices = self.config.getStringConfig("predict.data.fields")[0]
        if fieldIndices is not None:
            fieldIndices = strToIntArray(fieldIndices, ",")
        featFieldIndices = self.config.getStringConfig("predict.data.feature.fields")[0]
        if featFieldIndices is not None:
            featFieldIndices = strToIntArray(featFieldIndices, ",")

        #prediction data
        (data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
        if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
            scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
            featData = scaleData(featData, scalingMethod)

        return featData

    def prepStringPredictData(self, recs):
        """
        prepare string predict data
        """
        frecs = StringIO(recs)
        featData = np.loadtxt(frecs, delimiter=',')
        return featData

    def getModelFilePath(self):
        """
        get model file path
        """
        modelDirectory = self.config.getStringConfig("common.model.directory")[0]
        modelFile = self.config.getStringConfig("common.model.file")[0]
        if modelFile is None:
            raise ValueError("missing model file name")
        modelFilePath = modelDirectory + "/" + modelFile
        return modelFilePath

    def reportResult(self, score, successCriterion, scoreMethod):
        """
        report result
        """
        if successCriterion == "accuracy":
            self.logger.info("average " + scoreMethod + " with k fold cross validation {:06.3f}".format(score))
            result = score
        elif successCriterion == "error":
            error = 1.0 - score
            self.logger.info("average error with k fold cross validation {:06.3f}".format(error))
            result = error
        else:
            raise ValueError("invalid success criterion")
        return result

    def autoTrain(self):
        """
        auto train
        """
        maxTestErr = self.config.getFloatConfig("train.auto.max.test.error")[0]
        maxErr = self.config.getFloatConfig("train.auto.max.error")[0]
        maxErrDiff = self.config.getFloatConfig("train.auto.max.error.diff")[0]

        self.config.setParam("train.model.save", "False")

        #train, validate and search for optimum parameters
        result = self.trainValidateSearch()
        testError = result[1]

        #subsample training size to match train size for k fold validation
        numFolds = self.config.getIntConfig("train.num.folds")[0]
        self.subSampleRate = float(numFolds - 1) / numFolds

        #train only with optimum parameter values
        for paramValue in result[0]:
            pName = paramValue[0]
            pValue = paramValue[1]
            self.logger.info(pName + " " + str(pValue))
            self.setConfigParam(pName, str(pValue))
        trainError = self.train()

        if testError < maxTestErr:
            # criteria based on test error only
            self.logger.info("Successfully trained. Low test error level")
            status = 1
        else:
            # criteria based on bias error and generalization error
            avError = (trainError + testError) / 2
            diffError = testError - trainError
            self.logger.info("Auto training completed: training error {:06.3f} test error: {:06.3f}".format(trainError, testError))
            self.logger.info("Average of test and training error: {:06.3f} test and training error diff: {:06.3f}".format(avError, diffError))
            if diffError > maxErrDiff:
                # high generalization error
                if avError > maxErr:
                    # high bias error
                    self.logger.info("High generalization error and high error. Need larger training data set and increased model complexity")
                    status = 4
                else:
                    # low bias error
                    self.logger.info("High generalization error. Need larger training data set")
                    status = 3
            else:
                # low generalization error
                if avError > maxErr:
                    # high bias error
                    self.logger.info("Converged, but with high error rate. Need to increase model complexity")
                    status = 2
                else:
                    # low bias error
                    self.logger.info("Successfully trained. Low generalization error and low bias error level")
                    status = 1

        if status == 1:
            #train final model, use all data and save model
            self.logger.info("...training the final model")
            self.config.setParam("train.model.save", "True")
            self.subSampleRate = None
            trainError = self.train()
            self.logger.info("training error in final model {:06.3f}".format(trainError))

        return status
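For orientation, here is a minimal usage sketch of the config driven workflow that BaseClassifier supports. The subclass name and properties file name below are illustrative assumptions, not taken from this commit; a concrete subclass (such as those in the other files here) must supply buildModel().

# hypothetical usage sketch: the subclass import and the config file name
# are assumptions; a concrete BaseClassifier subclass provides buildModel()
from svm import SupportVectorMachine    # assumed subclass in supv/svm.py

clf = SupportVectorMachine("classifier.properties")    # hypothetical config file
mode = clf.getMode()                                   # reads common.mode from the config
if mode == "train":
    clf.train()
elif mode == "trainValidate":
    clf.trainValidate()
elif mode == "autoTrain":
    status = clf.autoTrain()
elif mode == "predict":
    print(clf.predict("1.2,3.4,5.6"))                  # in memory CSV record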
supv/basic_nn.py
ADDED
@@ -0,0 +1,293 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model
import matplotlib


if len(sys.argv) != 7:
    print("usage: <num_hidden_units> <data_set_size> <noise_in_data> <iteration_count> <learning_rate> <training_mode>")
    sys.exit()

# number of hidden units
nn_hdim = int(sys.argv[1])

# data set size
dsize = int(sys.argv[2])

# noise in training data
noise_level = float(sys.argv[3])

# iteration count
it_count = int(sys.argv[4])

# learning rate
epsilon = float(sys.argv[5])

#training mode
training_mode = sys.argv[6]

# validation
use_validation_data = True

# Generate a dataset
#noise_level = 0.20
#noise_level = 0.01
vlo = 100
vup = vlo + dsize // 5
vsize = vup - vlo
print("validation data size %d" % (vsize))
np.random.seed(0)
XC, yc = sklearn.datasets.make_moons(dsize, noise=noise_level)

print("complete data set generated")
def print_array(X, y):
    print(X)
    print(y)


# Generate a validation dataset
#np.random.seed(0)
#XV, yv = sklearn.datasets.make_moons(40, noise=0.20)
#print("validation data set generated")

XV = XC[vlo:vup:1]
yv = yc[vlo:vup:1]
print("validation data generated")
#print_array(XV, yv)

X = np.delete(XC, np.s_[vlo:vup:1], 0)
y = np.delete(yc, np.s_[vlo:vup:1], 0)
print("training data generated")
#print_array(X, y)
print(X)
print(y)


# Parameters
num_examples = len(X)  # training set size
nn_input_dim = 2  # input layer dimensionality
nn_output_dim = 2  # output layer dimensionality

#training data indices
tr_data_indices = np.arange(num_examples)
#print(tr_data_indices)

# Gradient descent parameters (I picked these by hand)
#epsilon = 0.01 # learning rate for gradient descent
reg_lambda = 0.01  # regularization strength


# Helper function to evaluate the total loss on the dataset
def calculate_loss(X, y, model):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    size = len(X)

    # Forward propagation to calculate our predictions
    z1 = X.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    exp_scores = np.exp(z2)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    # Calculating the loss
    correct_logprobs = -np.log(probs[range(size), y])
    data_loss = np.sum(correct_logprobs)

    # Add regularization term to loss (optional)
    data_loss += reg_lambda/2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
    return 1./size * data_loss


# Helper function to predict an output (0 or 1)
def predict(model, x):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']

    # Forward propagation
    z1 = x.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    exp_scores = np.exp(z2)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return np.argmax(probs, axis=1)

# This function learns parameters for the neural network in batch mode and returns the model.
# - nn_hdim: Number of nodes in the hidden layer
# - num_passes: Number of passes through the training data for gradient descent
# - validation_interval: Compute and print the loss every this many passes
def build_model_batch(nn_hdim, num_passes=10000, validation_interval=50):
    # Initialize the parameters to random values. We need to learn these.
    np.random.seed(0)
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))

    # This is what we return at the end
    model = {}

    # Gradient descent. For each batch...
    loss = -1.0
    for i in range(0, num_passes):
        #print("pass %d" % (i))

        # Forward propagation
        z1 = X.dot(W1) + b1
        a1 = np.tanh(z1)
        z2 = a1.dot(W2) + b2
        exp_scores = np.exp(z2)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        # Back propagation
        delta3 = probs
        delta3[range(num_examples), y] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0)

        # Add regularization terms (b1 and b2 don't have regularization terms)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1

        # Gradient descent parameter update
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2

        # Assign new parameters to the model
        model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

        # This is expensive because it uses the whole dataset, so we don't want to do it too often.
        if i % validation_interval == 0:
            if use_validation_data:
                cur_loss = calculate_loss(XV, yv, model)
            else:
                cur_loss = calculate_loss(X, y, model)

            print("Loss after iteration %i: %.8f" % (i, cur_loss))
            loss = cur_loss


    return model


# This function learns parameters for the neural network incrementally and returns the model.
# - nn_hdim: Number of nodes in the hidden layer
# - num_passes: Number of passes through the training data for gradient descent
# - validation_interval: Compute and print the loss every this many passes
def build_model_incr(nn_hdim, num_passes=10000, validation_interval=50):
    # Initialize the parameters to random values. We need to learn these.
    np.random.seed(0)
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))

    # This is what we return at the end
    model = {}

    # gradient descent. For each pass...
    loss = -1.0
    for i in range(0, num_passes):
        #print("pass %d" % (i))

        #shuffle training data indices
        np.random.shuffle(tr_data_indices)

        # all training data
        for j in tr_data_indices:
            Xi = X[j].reshape(1, 2)
            yi = y[j].reshape(1)

            # Forward propagation
            z1 = Xi.dot(W1) + b1
            a1 = np.tanh(z1)
            z2 = a1.dot(W2) + b2
            exp_scores = np.exp(z2)
            probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

            # Back propagation
            delta3 = probs
            delta3[0, yi] -= 1
            dW2 = (a1.T).dot(delta3)
            db2 = np.sum(delta3, axis=0, keepdims=True)
            delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
            dW1 = np.dot(Xi.T, delta2)
            db1 = np.sum(delta2, axis=0)

            # Add regularization terms (b1 and b2 don't have regularization terms)
            dW2 += reg_lambda * W2
            dW1 += reg_lambda * W1

            # Gradient descent parameter update
            W1 += -epsilon * dW1
            b1 += -epsilon * db1
            W2 += -epsilon * dW2
            b2 += -epsilon * db2

        # Assign new parameters to the model
        model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

        # This is expensive because it uses the whole dataset, so we don't want to do it too often.
        if i % validation_interval == 0:
            if use_validation_data:
                cur_loss = calculate_loss(XV, yv, model)
            else:
                cur_loss = calculate_loss(X, y, model)

            print("Loss after iteration %i: %.8f" % (i, cur_loss))
            loss = cur_loss

    return model


# Build a model with the specified number of hidden units
if (training_mode == "batch"):
    model = build_model_batch(nn_hdim, num_passes=it_count, validation_interval=1)
elif (training_mode == "incr"):
    model = build_model_incr(nn_hdim, num_passes=it_count, validation_interval=1)
else:
    print("invalid learning mode")
    sys.exit()

print("hidden layer")
for row in model['W1']:
    print(row)

print("hidden layer bias")
for row in model['b1']:
    print(row)

print("output layer")
for row in model['W2']:
    print(row)

print("output layer bias")
for row in model['b2']:
    print(row)
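Since basic_nn.py is a standalone command line script, a typical invocation (argument values are illustrative; the 0.20 noise level and 0.01 learning rate echo the commented-out defaults in the script) would be:

python3 basic_nn.py 3 200 0.20 1000 0.01 batch

This trains a network with 3 hidden units on 200 moon-shaped samples for 1000 gradient descent passes at learning rate 0.01; the last argument selects batch or incr training mode.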
supv/fftn.py
ADDED
@@ -0,0 +1,240 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.autograd import Variable
from torch.utils.data import Dataset, TensorDataset
from torch.utils.data import DataLoader
import sklearn as sk
import matplotlib
import random
import jprops
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from tnn import *


class FeedForwardTwinNetwork(FeedForwardNetwork):
    """
    siamese twin feed forward network
    """
    def __init__(self, configFile):
        defValues = dict()
        defValues["train.twin.crossenc"] = (False, None)
        super(FeedForwardTwinNetwork, self).__init__(configFile, defValues)

    def buildModel(self):
        """
        Loads configuration and builds the various pieces necessary for the model
        """
        super().buildModel()

        #slice validation data into the three twin network inputs
        feCount = self.config.getIntConfig("train.input.size")[0]
        self.vaFe1 = self.validFeatData[:, :feCount]
        self.vaFe2 = self.validFeatData[:, feCount:2*feCount]
        self.vaFe3 = self.validFeatData[:, 2*feCount:]

    def forward(self, x1, x2, x3):
        """
        Pass each of the three inputs through the shared layers
        """
        y1 = self.layers(x1)
        y2 = self.layers(x2)
        y3 = self.layers(x3)
        y = (y1, y2, y3)
        return y

    @staticmethod
    def batchTrain(model):
        """
        train with batch data
        """
        feCount = model.config.getIntConfig("train.input.size")[0]
        fe1 = model.featData[:, :feCount]
        fe2 = model.featData[:, feCount:2*feCount]
        fe3 = model.featData[:, 2*feCount:]

        print(fe1.shape)
        print(fe2.shape)
        print(fe3.shape)
        trainData = TensorDataset(fe1, fe2, fe3)
        trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)
        epochIntv = model.config.getIntConfig("train.epoch.intv")[0]

        # train mode
        model.train()

        if model.trackErr:
            trErr = list()
            vaErr = list()
        #epoch
        for t in range(model.numIter):
            #batch
            b = 0
            epochLoss = 0.0
            for x1Batch, x2Batch, x3Batch in trainDataLoader:

                # Forward pass: Compute predicted y by passing x to the model
                yPred = model(x1Batch, x2Batch, x3Batch)

                # Compute and print loss
                loss = model.lossFn(yPred[0], yPred[1], yPred[2])
                if model.verbose and t % epochIntv == 0 and model.batchIntv > 0 and b % model.batchIntv == 0:
                    print("epoch {} batch {} loss {:.6f}".format(t, b, loss.item()))

                if model.trackErr and model.batchIntv == 0:
                    epochLoss += loss.item()

                #error tracking at batch level
                if model.trackErr and model.batchIntv > 0 and b % model.batchIntv == 0:
                    trErr.append(loss.item())
                    vloss = FeedForwardTwinNetwork.evaluateModel(model)
                    vaErr.append(vloss)

                # Zero gradients, perform a backward pass, and update the weights.
                model.optimizer.zero_grad()
                loss.backward()
                model.optimizer.step()
                b += 1

            #error tracking at epoch level
            if model.trackErr and model.batchIntv == 0:
                epochLoss /= b
                if model.verbose:
                    print("epoch {} loss {:.6f}".format(t, epochLoss))
                trErr.append(epochLoss)
                vloss = FeedForwardTwinNetwork.evaluateModel(model)
                vaErr.append(vloss)

        #validate
        """
        model.eval()
        yPred = model(model.vaFeOne, model.vaFeTwo)
        yPred = yPred.data.cpu().numpy()
        yActual = model.validOutData.data.cpu().numpy()
        if model.verbose:
            vsize = yPred.shape[0]
            print("\npredicted \t\t actual")
            for i in range(vsize):
                print(str(yPred[i]) + "\t" + str(yActual[i]))

        score = perfMetric(model.accMetric, yActual, yPred)
        print(yActual)
        print(yPred)
        print(formatFloat(3, score, "perf score"))
        """

        #save
        modelSave = model.config.getBooleanConfig("train.model.save")[0]
        if modelSave:
            FeedForwardNetwork.saveCheckpt(model)

        if model.trackErr:
            FeedForwardNetwork.errorPlot(model, trErr, vaErr)

        return 1.0


    @staticmethod
    def evaluateModel(model):
        """
        evaluate model

        Parameters
        model : torch model
        """
        model.eval()
        with torch.no_grad():
            yPred = model(model.vaFe1, model.vaFe2, model.vaFe3)
            score = model.lossFn(yPred[0], yPred[1], yPred[2]).item()
        model.train()
        return score

    @staticmethod
    def testModel(model):
        """
        test model

        Parameters
        model : torch model
        """
        useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
        if useSavedModel:
            FeedForwardNetwork.restoreCheckpt(model)
        else:
            FeedForwardTwinNetwork.batchTrain(model)

        dataSource = model.config.getStringConfig("predict.data.file")[0]
        featData = FeedForwardNetwork.prepData(model, dataSource, False)
        featData = torch.from_numpy(featData)
        feCount = model.config.getIntConfig("train.input.size")[0]
        fe1 = featData[:, :feCount]
        fe2 = featData[:, feCount:2*feCount]
        fe3 = featData[:, 2*feCount:]


        model.eval()
        with torch.no_grad():
            yp = model(fe1, fe2, fe3)
            cos = torch.nn.CosineSimilarity()
            s1 = cos(yp[0], yp[1]).data.cpu().numpy()
            s2 = cos(yp[0], yp[2]).data.cpu().numpy()
            #print(s1.shape)

            n = yp[0].shape[0]
            if model.verbose:
                print(n)
                for i in range(15):
                    if i % 3 == 0:
                        print("next")
                    print(yp[0][i])
                    print(yp[1][i])
                    print(yp[2][i])
                    print("similarity {:.3f} {:.3f}".format(s1[i], s2[i]))

            tc = 0
            cc = 0
            outputSize = model.config.getIntConfig("train.output.size")[0]
            for i in range(0, n, outputSize):
                #for each sample, outputSize no of rows
                msi = None
                imsi = None
                for j in range(outputSize):
                    #first one positive, followed by all negative
                    si = (s1[i+j] + s2[i+j]) / 2
                    if msi is None or si > msi:
                        msi = si
                        imsi = j
                tc += 1
                if imsi == 0:
                    cc += 1
            score = cc / tc
            print("score: {:.3f}".format(score))
        model.train()
        return score
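The scoring loop in testModel() ranks each group of train.output.size rows by average cosine similarity to the anchor, expecting the positive pair first in each group. Below is a self-contained sketch of that ranking idea; the embeddings are synthetic stand-ins for the network outputs, not taken from this commit.

# standalone sketch of the cosine similarity ranking used in testModel();
# all tensors here are synthetic stand-ins for the twin network embeddings
import torch

torch.manual_seed(0)
anchor = torch.randn(6, 8)                      # anchor embeddings
positive = anchor + 0.05 * torch.randn(6, 8)    # near the anchor
negative = torch.randn(6, 8)                    # unrelated

cos = torch.nn.CosineSimilarity(dim=1)
sPos = cos(anchor, positive)                    # expected to be high
sNeg = cos(anchor, negative)                    # expected to be lower
frac = (sPos > sNeg).float().mean().item()      # fraction ranked correctly
print("fraction ranked correctly: {:.3f}".format(frac))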
supv/gbt.py
ADDED
@@ -0,0 +1,482 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import matplotlib
import random
import jprops
from sklearn.ensemble import GradientBoostingClassifier
import joblib
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from random import randint
from io import StringIO
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *
from bacl import *

# gradient boosting classification
class GradientBoostedTrees(object):
    def __init__(self, configFile):
        defValues = {}
        defValues["common.mode"] = ("training", None)
        defValues["common.model.directory"] = ("model", None)
        defValues["common.model.file"] = (None, None)
        defValues["common.preprocessing"] = (None, None)
        defValues["common.verbose"] = (False, None)
        defValues["train.data.file"] = (None, "missing training data file")
        defValues["train.data.fields"] = (None, "missing training data field ordinals")
        defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
        defValues["train.data.class.field"] = (None, "missing class field ordinal")
        defValues["train.validation"] = ("kfold", None)
        defValues["train.num.folds"] = (5, None)
        defValues["train.min.samples.split"] = ("4", None)
        defValues["train.min.samples.leaf.gb"] = ("2", None)
        defValues["train.max.depth.gb"] = (3, None)
        defValues["train.max.leaf.nodes.gb"] = (None, None)
        defValues["train.max.features.gb"] = (None, None)
        defValues["train.learning.rate"] = (0.1, None)
        defValues["train.num.estimators.gb"] = (100, None)
        defValues["train.subsample"] = (1.0, None)
        defValues["train.loss"] = ("deviance", None)
        defValues["train.random.state"] = (None, None)
        defValues["train.verbose"] = (0, None)
        defValues["train.warm.start"] = (False, None)
        defValues["train.presort"] = ("auto", None)
        defValues["train.criterion"] = ("friedman_mse", None)
        defValues["train.success.criterion"] = ("error", None)
        defValues["train.model.save"] = (False, None)
        defValues["train.score.method"] = ("accuracy", None)
        defValues["train.search.param.strategy"] = (None, None)
        defValues["train.search.params"] = (None, None)
        defValues["predict.data.file"] = (None, None)
        defValues["predict.data.fields"] = (None, "missing data field ordinals")
        defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
        defValues["predict.use.saved.model"] = (False, None)
        defValues["validate.data.file"] = (None, "missing validation data file")
        defValues["validate.data.fields"] = (None, "missing validation data field ordinals")
        defValues["validate.data.feature.fields"] = (None, "missing validation data feature field ordinals")
        defValues["validate.data.class.field"] = (None, "missing class field ordinal")
        defValues["validate.use.saved.model"] = (False, None)
        defValues["validate.score.method"] = ("accuracy", None)

        self.config = Configuration(configFile, defValues)
        self.subSampleRate = None
        self.featData = None
        self.clsData = None
        self.gbcClassifier = None
        self.verbose = self.config.getBooleanConfig("common.verbose")[0]
        logFilePath = self.config.getStringConfig("common.logging.file")[0]
        logLevName = self.config.getStringConfig("common.logging.level")[0]
        self.logger = createLogger(__name__, logFilePath, logLevName)
        self.logger.info("********* starting session")

    # initialize config
    def initConfig(self, configFile, defValues):
        self.config = Configuration(configFile, defValues)

    # get config object
    def getConfig(self):
        return self.config

    #set config param
    def setConfigParam(self, name, value):
        self.config.setParam(name, value)

    #get mode
    def getMode(self):
        return self.config.getStringConfig("common.mode")[0]

    #get search parameter strategy
    def getSearchParamStrategy(self):
        return self.config.getStringConfig("train.search.param.strategy")[0]

    def setModel(self, model):
        self.gbcClassifier = model

    # train model
    def train(self):
        #build model
        self.buildModel()

        # training data
        if self.featData is None:
            (featData, clsData) = self.prepTrainingData()
            (self.featData, self.clsData) = (featData, clsData)
        else:
            (featData, clsData) = (self.featData, self.clsData)
        if self.subSampleRate is not None:
            (featData, clsData) = subSample(featData, clsData, self.subSampleRate, False)
            self.logger.info("subsample size " + str(featData.shape[0]))

        # parameters
        modelSave = self.config.getBooleanConfig("train.model.save")[0]

        #train
        self.logger.info("...training model")
        self.gbcClassifier.fit(featData, clsData)
        score = self.gbcClassifier.score(featData, clsData)
        successCriterion = self.config.getStringConfig("train.success.criterion")[0]
        result = None
        if successCriterion == "accuracy":
            self.logger.info("accuracy with training data {:06.3f}".format(score))
            result = score
        elif successCriterion == "error":
            error = 1.0 - score
            self.logger.info("error with training data {:06.3f}".format(error))
            result = error
        else:
            raise ValueError("invalid success criterion")

        if modelSave:
            self.logger.info("...saving model")
            modelFilePath = self.getModelFilePath()
            joblib.dump(self.gbcClassifier, modelFilePath)
        return result

    #train with k fold validation
    def trainValidate(self):
        #build model
        self.buildModel()

        # training data
        (featData, clsData) = self.prepTrainingData()

        #parameter
        validation = self.config.getStringConfig("train.validation")[0]
        numFolds = self.config.getIntConfig("train.num.folds")[0]
        successCriterion = self.config.getStringConfig("train.success.criterion")[0]
        scoreMethod = self.config.getStringConfig("train.score.method")[0]

        #train with validation
        self.logger.info("...training and kfold cross validating model")
        scores = cross_val_score(self.gbcClassifier, featData, clsData, cv=numFolds, scoring=scoreMethod)
        avScore = np.mean(scores)
        result = self.reportResult(avScore, successCriterion, scoreMethod)
        return result

    #train with k fold validation and search parameter space for optimum
    def trainValidateSearch(self):
        self.logger.info("...starting train validate with parameter search")
        searchStrategyName = self.getSearchParamStrategy()
        if searchStrategyName is not None:
            if searchStrategyName == "grid":
                searchStrategy = GuidedParameterSearch(self.verbose)
            elif searchStrategyName == "random":
                searchStrategy = RandomParameterSearch(self.verbose)
                maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
                searchStrategy.setMaxIter(maxIter)
            elif searchStrategyName == "simuan":
                searchStrategy = SimulatedAnnealingParameterSearch(self.verbose)
                maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
                searchStrategy.setMaxIter(maxIter)
                temp = self.config.getFloatConfig("train.search.sa.temp")[0]
                searchStrategy.setTemp(temp)
                tempRedRate = self.config.getFloatConfig("train.search.sa.temp.red.rate")[0]
                searchStrategy.setTempReductionRate(tempRedRate)
            else:
                raise ValueError("invalid parameter search strategy")
        else:
            raise ValueError("missing search strategy")

        # add search params
        searchParams = self.config.getStringConfig("train.search.params")[0].split(",")
        searchParamNames = []
        extSearchParamNames = []
        if searchParams is not None:
            for searchParam in searchParams:
                paramItems = searchParam.split(":")
                extSearchParamNames.append(paramItems[0])

                #strip the "search" component from the param name
                paramNameItems = paramItems[0].split(".")
                del paramNameItems[1]
                paramItems[0] = ".".join(paramNameItems)

                searchStrategy.addParam(paramItems)
                searchParamNames.append(paramItems[0])
        else:
            raise ValueError("missing search parameter list")

        # add search param data list for each param
        for (searchParamName, extSearchParamName) in zip(searchParamNames, extSearchParamNames):
            searchParamData = self.config.getStringConfig(extSearchParamName)[0].split(",")
            searchStrategy.addParamVaues(searchParamName, searchParamData)

        # train and validate for various param value combinations
        searchStrategy.prepare()
        paramValues = searchStrategy.nextParamValues()
        searchResults = []
        while paramValues is not None:
            self.logger.info("...next parameter set")
            paramStr = ""
            for paramValue in paramValues:
                self.setConfigParam(paramValue[0], str(paramValue[1]))
                paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
            result = self.trainValidate()
            searchStrategy.setCost(result)
            searchResults.append((paramStr, result))
            paramValues = searchStrategy.nextParamValues()

        # output
        self.logger.info("all parameter search results")
        for searchResult in searchResults:
            self.logger.info("{}\t{:06.3f}".format(searchResult[0], searchResult[1]))

        self.logger.info("best parameter search result")
        bestSolution = searchStrategy.getBestSolution()
        paramStr = ""
        for paramValue in bestSolution[0]:
            paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
        self.logger.info("{}\t{:06.3f}".format(paramStr, bestSolution[1]))
        return bestSolution

    #validate model
    def validate(self):
        # create model
        useSavedModel = self.config.getBooleanConfig("validate.use.saved.model")[0]
        if useSavedModel:
            # load saved model
            self.logger.info("...loading model")
            modelFilePath = self.getModelFilePath()
            self.gbcClassifier = joblib.load(modelFilePath)
        else:
            # train model
            self.train()

        # prepare test data
        (featData, clsDataActual) = self.prepValidationData()

        #predict
        self.logger.info("...predicting")
        clsDataPred = self.gbcClassifier.predict(featData)

        self.logger.info("...validating")
        #self.logger.info(clsDataPred)
        scoreMethod = self.config.getStringConfig("validate.score.method")[0]
        if scoreMethod == "accuracy":
            accuracy = accuracy_score(clsDataActual, clsDataPred)
            self.logger.info("accuracy:")
            self.logger.info(accuracy)
        elif scoreMethod == "confusionMatrix":
            confMatrx = confusion_matrix(clsDataActual, clsDataPred)
            self.logger.info("confusion matrix:")
            self.logger.info(confMatrx)


    #predict with file based data
    def predictx(self):
        # create model
        useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
        if useSavedModel:
            # load saved model
            self.logger.info("...loading model")
            modelFilePath = self.getModelFilePath()
            self.gbcClassifier = joblib.load(modelFilePath)
        else:
            # train model
            self.train()

        # prepare test data
        featData = self.prepPredictData()

        #predict
        self.logger.info("...predicting")
        clsData = self.gbcClassifier.predict(featData)
        self.logger.info(clsData)

    #predict with in memory data
    def predict(self, recs=None):
        # create model
        self.prepModel()

        #input record
        if recs:
            #passed record
            featData = self.prepStringPredictData(recs)
            if (featData.ndim == 1):
                featData = featData.reshape(1, -1)
        else:
            #file
            featData = self.prepPredictData()

        #predict
        self.logger.info("...predicting")
        clsData = self.gbcClassifier.predict(featData)
        return clsData

    #predict probability with in memory data
    def predictProb(self, recs):
        # create model
        self.prepModel()

        #input record
        if type(recs) is str:
            featData = self.prepStringPredictData(recs)
        else:
            featData = recs
        #self.logger.info(featData.shape)
        if (featData.ndim == 1):
            featData = featData.reshape(1, -1)

        #predict
        self.logger.info("...predicting class probability")
        clsData = self.gbcClassifier.predict_proba(featData)
        return clsData

    #prepare model
    def prepModel(self):
        useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
        if (useSavedModel and not self.gbcClassifier):
            # load saved model
            self.logger.info("...loading saved model")
|
356 |
+
modelFilePath = self.getModelFilePath()
|
357 |
+
self.gbcClassifier = joblib.load(modelFilePath)
|
358 |
+
else:
|
359 |
+
# train model
|
360 |
+
self.train()
|
361 |
+
return self.gbcClassifier
|
362 |
+
|
363 |
+
#prepare string predict data
|
364 |
+
def prepStringPredictData(self, recs):
|
365 |
+
frecs = StringIO(recs)
|
366 |
+
featData = np.loadtxt(frecs, delimiter=',')
|
367 |
+
#self.logger.info(featData)
|
368 |
+
return featData
|
369 |
+
|
370 |
+
#loads and prepares training data
|
371 |
+
def prepTrainingData(self):
|
372 |
+
# parameters
|
373 |
+
dataFile = self.config.getStringConfig("train.data.file")[0]
|
374 |
+
fieldIndices = self.config.getStringConfig("train.data.fields")[0]
|
375 |
+
if not fieldIndices is None:
|
376 |
+
fieldIndices = strToIntArray(fieldIndices, ",")
|
377 |
+
featFieldIndices = self.config.getStringConfig("train.data.feature.fields")[0]
|
378 |
+
if not featFieldIndices is None:
|
379 |
+
featFieldIndices = strToIntArray(featFieldIndices, ",")
|
380 |
+
classFieldIndex = self.config.getIntConfig("train.data.class.field")[0]
|
381 |
+
|
382 |
+
#training data
|
383 |
+
(data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
|
384 |
+
clsData = extrColumns(data, classFieldIndex)
|
385 |
+
clsData = np.array([int(a) for a in clsData])
|
386 |
+
return (featData, clsData)
|
387 |
+
|
388 |
+
#loads and prepares training data
|
389 |
+
def prepValidationData(self):
|
390 |
+
# parameters
|
391 |
+
dataFile = self.config.getStringConfig("validate.data.file")[0]
|
392 |
+
fieldIndices = self.config.getStringConfig("validate.data.fields")[0]
|
393 |
+
if not fieldIndices is None:
|
394 |
+
fieldIndices = strToIntArray(fieldIndices, ",")
|
395 |
+
featFieldIndices = self.config.getStringConfig("validate.data.feature.fields")[0]
|
396 |
+
if not featFieldIndices is None:
|
397 |
+
featFieldIndices = strToIntArray(featFieldIndices, ",")
|
398 |
+
classFieldIndex = self.config.getIntConfig("validate.data.class.field")[0]
|
399 |
+
|
400 |
+
#training data
|
401 |
+
(data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
|
402 |
+
clsData = extrColumns(data, classFieldIndex)
|
403 |
+
clsData = [int(a) for a in clsData]
|
404 |
+
return (featData, clsData)
|
405 |
+
|
406 |
+
#loads and prepares training data
|
407 |
+
def prepPredictData(self):
|
408 |
+
# parameters
|
409 |
+
dataFile = self.config.getStringConfig("predict.data.file")[0]
|
410 |
+
if dataFile is None:
|
411 |
+
raise ValueError("missing prediction data file")
|
412 |
+
fieldIndices = self.config.getStringConfig("predict.data.fields")[0]
|
413 |
+
if not fieldIndices is None:
|
414 |
+
fieldIndices = strToIntArray(fieldIndices, ",")
|
415 |
+
featFieldIndices = self.config.getStringConfig("predict.data.feature.fields")[0]
|
416 |
+
if not featFieldIndices is None:
|
417 |
+
featFieldIndices = strToIntArray(featFieldIndices, ",")
|
418 |
+
|
419 |
+
#training data
|
420 |
+
(data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
|
421 |
+
|
422 |
+
return featData
|
423 |
+
|
424 |
+
# get model file path
|
425 |
+
def getModelFilePath(self):
|
426 |
+
modelDirectory = self.config.getStringConfig("common.model.directory")[0]
|
427 |
+
modelFile = self.config.getStringConfig("common.model.file")[0]
|
428 |
+
if modelFile is None:
|
429 |
+
raise ValueError("missing model file name")
|
430 |
+
modelFilePath = modelDirectory + "/" + modelFile
|
431 |
+
return modelFilePath
|
432 |
+
|
433 |
+
# report result
|
434 |
+
def reportResult(self, score, successCriterion, scoreMethod):
|
435 |
+
if successCriterion == "accuracy":
|
436 |
+
self.logger.info("average " + scoreMethod + " with k fold cross validation {:06.3f}".format(score))
|
437 |
+
result = score
|
438 |
+
elif successCriterion == "error":
|
439 |
+
error = 1.0 - score
|
440 |
+
self.logger.info("average error with k fold cross validation {:06.3f}".format(error))
|
441 |
+
result = error
|
442 |
+
else:
|
443 |
+
raise ValueError("invalid success criterion")
|
444 |
+
return result
|
445 |
+
|
446 |
+
# builds model object
|
447 |
+
def buildModel(self):
|
448 |
+
self.logger.info("...building gradient boosted tree model")
|
449 |
+
# parameters
|
450 |
+
minSamplesSplit = self.config.getStringConfig("train.min.samples.split")[0]
|
451 |
+
minSamplesSplit = typedValue(minSamplesSplit)
|
452 |
+
minSamplesLeaf = self.config.getStringConfig("train.min.samples.leaf.gb")[0]
|
453 |
+
minSamplesLeaf = typedValue(minSamplesLeaf)
|
454 |
+
#minWeightFractionLeaf = self.config.getFloatConfig("train.min.weight.fraction.leaf.gb")[0]
|
455 |
+
(maxDepth, maxLeafNodes) = self.config.eitherOrIntConfig("train.max.depth.gb", "train.max.leaf.nodes.gb")
|
456 |
+
maxFeatures = self.config.getStringConfig("train.max.features.gb")[0]
|
457 |
+
maxFeatures = typedValue(maxFeatures)
|
458 |
+
learningRate = self.config.getFloatConfig("train.learning.rate")[0]
|
459 |
+
numEstimators = self.config.getIntConfig("train.num.estimators.gb")[0]
|
460 |
+
subsampleFraction = self.config.getFloatConfig("train.subsample")[0]
|
461 |
+
lossFun = self.config.getStringConfig("train.loss")[0]
|
462 |
+
randomState = self.config.getIntConfig("train.random.state")[0]
|
463 |
+
verboseOutput = self.config.getIntConfig("train.verbose")[0]
|
464 |
+
warmStart = self.config.getBooleanConfig("train.warm.start")[0]
|
465 |
+
presort = self.config.getStringConfig("train.presort")
|
466 |
+
if (presort[1]):
|
467 |
+
presortChoice = presort[0]
|
468 |
+
else:
|
469 |
+
presortChoice = presort[0].lower() == "true"
|
470 |
+
splitCriterion = self.config.getStringConfig("train.criterion")[0]
|
471 |
+
|
472 |
+
#classifier
|
473 |
+
self.gbcClassifier = GradientBoostingClassifier(loss=lossFun, learning_rate=learningRate, n_estimators=numEstimators,
|
474 |
+
subsample=subsampleFraction, min_samples_split=minSamplesSplit,
|
475 |
+
min_samples_leaf=minSamplesLeaf, min_weight_fraction_leaf=0.0, max_depth=maxDepth,
|
476 |
+
init=None, random_state=randomState, max_features=maxFeatures, verbose=verboseOutput,
|
477 |
+
max_leaf_nodes=maxLeafNodes, warm_start=warmStart, presort=presortChoice)
|
478 |
+
|
479 |
+
|
480 |
+
|
481 |
+
|
482 |
+
|
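A quick illustration of the naming convention that trainValidateSearch() relies on may help here: each entry in train.search.params is an extended config name plus a type, and the second name component is dropped to recover the real config key that gets overridden per iteration. This is a minimal sketch; the parameter name below is a hypothetical example, not one taken from this commit.

# minimal sketch of the name rewrite in trainValidateSearch();
# the parameter name is hypothetical, for illustration only
searchParam = "train.search.num.estimators.gb:int"
paramItems = searchParam.split(":")
extName = paramItems[0]              # extended name, used to look up the value list
nameItems = extName.split(".")
del nameItems[1]                     # drop the "search" component
configKey = ".".join(nameItems)      # actual config key: train.num.estimators.gb
print(extName, "->", configKey)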
supv/gcn.py
ADDED
@@ -0,0 +1,444 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import matplotlib
import random
from random import randint
from itertools import compress
import numpy as np
import torch
from torch import nn
from torch.nn import Linear
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
from torch_geometric.nn import GCNConv
from torch_geometric.nn import MessagePassing
from torch_geometric.data import Data
import sklearn as sk
import jprops
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from tnn import FeedForwardNetwork

"""
Graph convolution network
"""

class GraphConvoNetwork(nn.Module):
	def __init__(self, configFile):
		"""
		initializer

		Parameters
			configFile : config file path
		"""
		defValues = dict()
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.preprocessing"] = (None, None)
		defValues["common.scaling.method"] = ("zscale", None)
		defValues["common.scaling.minrows"] = (50, None)
		defValues["common.scaling.param.file"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["common.device"] = ("cpu", None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.num.nodes.total"] = (None, None)
		defValues["train.data.num.nodes.training"] = (None, None)
		defValues["train.data.splits"] = ([.75,.15,.10], None)
		defValues["train.layer.data"] = (None, "missing layer data")
		defValues["train.input.size"] = (None, "missing input size")
		defValues["train.output.size"] = (None, "missing output size")
		defValues["train.loss.reduction"] = ("mean", None)
		defValues["train.num.iterations"] = (500, None)
		defValues["train.lossFn"] = ("mse", None)
		defValues["train.optimizer"] = ("sgd", None)
		defValues["train.opt.learning.rate"] = (.0001, None)
		defValues["train.opt.weight.decay"] = (0, None)
		defValues["train.opt.momentum"] = (0, None)
		defValues["train.opt.eps"] = (1e-08, None)
		defValues["train.opt.dampening"] = (0, None)
		defValues["train.opt.momentum.nesterov"] = (False, None)
		defValues["train.opt.betas"] = ([0.9, 0.999], None)
		defValues["train.opt.alpha"] = (0.99, None)
		defValues["train.save.model"] = (False, None)
		defValues["train.track.error"] = (False, None)
		defValues["train.epoch.intv"] = (5, None)
		defValues["train.print.weights"] = (False, None)
		defValues["valid.accuracy.metric"] = (None, None)
		defValues["predict.create.mask"] = (False, None)
		defValues["predict.use.saved.model"] = (True, None)

		self.config = Configuration(configFile, defValues)
		super(GraphConvoNetwork, self).__init__()


	def getConfig(self):
		"""
		return config
		"""
		return self.config

	def buildModel(self):
		"""
		Loads configuration and builds the various pieces necessary for the model
		"""
		torch.manual_seed(9999)

		self.verbose = self.config.getBooleanConfig("common.verbose")[0]
		numinp = self.config.getIntConfig("train.input.size")[0]
		self.outputSize = self.config.getIntConfig("train.output.size")[0]
		self.numIter = self.config.getIntConfig("train.num.iterations")[0]
		optimizer = self.config.getStringConfig("train.optimizer")[0]
		self.lossFnStr = self.config.getStringConfig("train.lossFn")[0]
		self.accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]
		self.trackErr = self.config.getBooleanConfig("train.track.error")[0]
		self.restored = False
		self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None

		#build network
		layers = list()
		ninp = numinp
		trData = self.config.getStringConfig("train.layer.data")[0].split(",")
		for ld in trData:
			lde = ld.split(":")
			ne = len(lde)
			assert ne == 5 or ne == 6, "expecting 5 or 6 items for layer data"

			gconv = False
			if ne == 6:
				if lde[0] == "gconv":
					gconv = True
				lde = lde[1:]

			#num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction
			nunit = int(lde[0])
			actStr = lde[1]
			act = FeedForwardNetwork.createActivation(actStr) if actStr != "none" else None
			bnorm = lde[2] == "true"
			afterAct = lde[3] == "true"
			dpr = float(lde[4])

			if gconv:
				layers.append(GCNConv(ninp, nunit))
			else:
				layers.append(Linear(ninp, nunit))
			if bnorm:
				#with batch norm
				if afterAct:
					safeAppend(layers, act)
					layers.append(torch.nn.BatchNorm1d(nunit))
				else:
					layers.append(torch.nn.BatchNorm1d(nunit))
					safeAppend(layers, act)
			else:
				#without batch norm
				safeAppend(layers, act)

			if dpr > 0:
				layers.append(torch.nn.Dropout(dpr))
			ninp = nunit

		self.layers = torch.nn.ModuleList(layers)
		self.device = FeedForwardNetwork.getDevice(self)
		self.to(self.device)
		self.loadData()

		self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)
		self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer)
		self.trained = False

	def loadData(self):
		"""
		load node and edge data
		"""
		dataFilePath = self.config.getStringConfig("train.data.file")[0]
		numNodes = self.config.getIntConfig("train.data.num.nodes.total")[0]
		numLabeled = self.config.getIntConfig("train.data.num.nodes.training")[0]
		splits = self.config.getFloatListConfig("train.data.splits")[0]
		crPredMask = self.config.getBooleanConfig("predict.create.mask")[0]

		dx = list()
		dy = list()
		edges = list()
		mask = None
		for rec in fileRecGen(dataFilePath, ","):
			if len(rec) > 2:
				x = rec[1 :-1]
				x = toFloatList(x)
				y = int(rec[-1])
				dx.append(x)
				dy.append(y)
			elif len(rec) == 2:
				e = toIntList(rec)
				edges.append(e)
			elif len(rec) == 1:
				items = rec[0].split()
				assertEqual(items[0], "mask", "invalid mask data")
				numNodes = int(items[1])
				print(numNodes)
				mask = list()
				for r in range(2, len(items), 1):
					ri = items[r].split(":")
					#print(ri)
					ms = list(range(int(ri[0]), int(ri[1]), 1))
					mask.extend(ms)
		#scale node features
		if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
			scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
			dx = scaleData(dx, scalingMethod)

		dx = torch.tensor(dx, dtype=torch.float)
		dy = torch.tensor(dy, dtype=torch.long)
		edges = torch.tensor(edges, dtype=torch.long)
		edges = edges.t().contiguous()
		dx = dx.to(self.device)
		dy = dy.to(self.device)
		edges = edges.to(self.device)
		self.data = Data(x=dx, edge_index=edges, y=dy)

		#mask
		if mask is None:
			#training data in the beginning
			trStart = 0
			vaStart = int(splits[0] * numLabeled)
			teStart = vaStart + int(splits[1] * numLabeled)

			trMask = [False] * numNodes
			trMask[0:vaStart] = [True] * vaStart
			vaMask = [False] * numNodes
			vaMask[vaStart:teStart] = [True] * (teStart - vaStart)
			teMask = [False] * numNodes
			teMask[teStart:] = [True] * (numNodes - teStart)
		else:
			#training data anywhere
			if crPredMask:
				prMask = [True] * numNodes
				for i in mask:
					prMask[i] = False
				self.prMask = torch.tensor(prMask, dtype=torch.bool)

			nshuffle = int(len(mask) / 2)
			shuffle(mask, nshuffle)
			#print(mask)
			lmask = len(mask)
			trme = int(splits[0] * lmask)
			vame = int((splits[0] + splits[1]) * lmask)
			teme = lmask
			trMask = [False] * numNodes
			for i in mask[:trme]:
				trMask[i] = True
			vaMask = [False] * numNodes
			for i in mask[trme:vame]:
				vaMask[i] = True
			teMask = [False] * numNodes
			for i in mask[vame:]:
				teMask[i] = True
			#print(vaMask)

		trMask = torch.tensor(trMask, dtype=torch.bool)
		trMask = trMask.to(self.device)
		self.data.train_mask = trMask
		vaMask = torch.tensor(vaMask, dtype=torch.bool)
		vaMask = vaMask.to(self.device)
		self.data.val_mask = vaMask
		teMask = torch.tensor(teMask, dtype=torch.bool)
		teMask = teMask.to(self.device)
		self.data.test_mask = teMask


	def descData(self):
		"""
		describe data
		"""
		print(f'Number of nodes: {self.data.num_nodes}')
		print(f'Number of edges: {self.data.num_edges}')
		print(f'Number of node features: {self.data.num_node_features}')
		print(f'Number of training nodes: {self.data.train_mask.sum()}')
		print(f'Training node label rate: {int(self.data.train_mask.sum()) / self.data.num_nodes:.2f}')
		print(f'Number of validation nodes: {self.data.val_mask.sum()}')
		print(f'Number of test nodes: {self.data.test_mask.sum()}')
		print(f'Is undirected: {self.data.is_undirected()}')

		print("Data attributes")
		print(self.data.keys)

		print("Data types")
		print(type(self.data.x))
		print(type(self.data.y))
		print(type(self.data.edge_index))
		print(type(self.data.train_mask))

		print("Sample data")
		print("x", self.data.x[:4])
		print("y", self.data.y[:4])
		print("edge", self.data.edge_index[:4])
		print("train mask", self.data.train_mask[:4])
		print("test mask", self.data.test_mask[:4])

		print("Any isolated node? " , self.data.has_isolated_nodes())
		print("Any self loop? ", self.data.has_self_loops())
		print("Is graph directed? ", self.data.is_directed())

	def forward(self):
		"""
		forward prop
		"""
		x, edges = self.data.x, self.data.edge_index
		for l in self.layers:
			if isinstance(l, MessagePassing):
				x = l(x, edges)
			else:
				x = l(x)
		return x

	@staticmethod
	def trainModel(model):
		"""
		train with batch data

		Parameters
			model : torch model
		"""
		epochIntv = model.config.getIntConfig("train.epoch.intv")[0]

		model.train()
		if model.trackErr:
			trErr = list()
			vaErr = list()

		for epoch in range(model.numIter):
			out = model()
			loss = model.lossFn(out[model.data.train_mask], model.data.y[model.data.train_mask])

			#error tracking at batch level
			if model.trackErr:
				trErr.append(loss.item())
				vErr = GraphConvoNetwork.evaluateModel(model)
				vaErr.append(vErr)
				if model.verbose and epoch % epochIntv == 0:
					print("epoch {} loss {:.6f} val error {:.6f}".format(epoch, loss.item(), vErr))

			model.optimizer.zero_grad()
			loss.backward()
			model.optimizer.step()

		#acc = GraphConvoNetwork.evaluateModel(model, True)
		#print(acc)
		modelSave = model.config.getBooleanConfig("train.save.model")[0]
		if modelSave:
			FeedForwardNetwork.saveCheckpt(model)

		if model.trackErr:
			FeedForwardNetwork.errorPlot(model, trErr, vaErr)

		model.trained = True

	@staticmethod
	def evaluateModel(model, verbose=False):
		"""
		evaluate model

		Parameters
			model : torch model
			verbose : if True additional output
		"""
		model.eval()
		with torch.no_grad():
			out = model()
			if verbose:
				print(out)
			yPred = out[model.data.val_mask].data.cpu().numpy()
			yActual = model.data.y[model.data.val_mask].data.cpu().numpy()
			if verbose:
				for pa in zip(yPred, yActual):
					print(pa)
			#correct = yPred == yActual
			#score = int(correct.sum()) / int(model.data.val_mask.sum())

			score = perfMetric(model.lossFnStr, yActual, yPred, model.clabels)

		model.train()
		return score

	@staticmethod
	def validateModel(model, retPred=False):
		"""
		model validation

		Parameters
			model : torch model
			retPred : if True return prediction
		"""
		model.eval()
		with torch.no_grad():
			out = model()
			yPred = out.argmax(dim=1)
			yPred = yPred[model.data.test_mask].data.cpu().numpy()
			yActual = model.data.y[model.data.test_mask].data.cpu().numpy()
			#correct = yPred == yActual
			#score = int(correct.sum()) / int(model.data.val_mask.sum())
			score = perfMetric(model.accMetric, yActual, yPred)
			print(formatFloat(3, score, "test perf score"))
			return score

	@staticmethod
	def modelPrediction(model, inclData=True):
		"""
		make prediction

		Parameters
			model : torch model
			inclData : True to include input data
		"""
		cmask = model.config.getBooleanConfig("predict.create.mask")[0]
		if not cmask:
			print("create prediction mask property needs to be set to True")
			return None

		useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
		if useSavedModel:
			FeedForwardNetwork.restoreCheckpt(model)
		else:
			if not model.trained:
				GraphConvoNetwork.trainModel(model)

		model.eval()
		with torch.no_grad():
			out = model()
			yPred = out.argmax(dim=1)
			yPred = yPred[model.prMask].data.cpu().numpy()

			if inclData:
				dataFilePath = model.config.getStringConfig("train.data.file")[0]
				filt = lambda r : len(r) > 2
				ndata = list(fileFiltRecGen(dataFilePath, filt))
				prMask = model.prMask.data.cpu().numpy()
				assertEqual(len(ndata), prMask.shape[0], "data and mask lengths are not equal")
				precs = list(compress(ndata, prMask))
				precs = list(map(lambda r : r[:-1], precs))
				assertEqual(len(precs), yPred.shape[0], "data and mask lengths are not equal")
				res = zip(precs, yPred)
			else:
				res = yPred
			return res
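A hedged usage sketch of the class above may help; the config file name is an assumption, and the workflow simply strings together the methods defined in this file.

# hypothetical driver; "gcn.properties" is an assumed config file name
gcn = GraphConvoNetwork("gcn.properties")
gcn.buildModel()                        # build layers, load node/edge data, set masks
gcn.descData()                          # optional summary of the loaded graph
GraphConvoNetwork.trainModel(gcn)       # train, optionally tracking train/val error
GraphConvoNetwork.validateModel(gcn)    # score on the test mask
res = GraphConvoNetwork.modelPrediction(gcn)   # needs predict.create.mask set to True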
supv/knn.py
ADDED
@@ -0,0 +1,106 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import matplotlib
import random
import jprops
from sklearn.neighbors import KNeighborsClassifier
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from bacl import *


# nearest neighbor classification
class NearestNeighbor(BaseClassifier):
	def __init__(self, configFile):
		defValues = {}
		defValues["common.mode"] = ("training", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.preprocessing"] = (None, None)
		defValues["common.scaling.method"] = ("zscale", None)
		defValues["common.verbose"] = (False, None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.class.field"] = (None, "missing class field ordinal")
		defValues["train.num.neighbors"] = (5, None)
		defValues["train.neighbor.weight"] = ("uniform", None)
		defValues["train.neighbor.search.algo"] = ("auto", None)
		defValues["train.neighbor.search.leaf.size"] = (10, None)
		defValues["train.neighbor.dist.metric"] = ("minkowski", None)
		defValues["train.neighbor.dist.metric.pow"] = (2.0, None)
		defValues["train.success.criterion"] = ("error", None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.data.fields"] = (None, "missing data field ordinals")
		defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
		defValues["predict.use.saved.model"] = (False, None)

		super(NearestNeighbor, self).__init__(configFile, defValues, __name__)

	def buildModel(self):
		"""
		builds model object
		"""
		self.logger.info("...building knn classifier model")
		numNeighbors = self.config.getIntConfig("train.num.neighbors")[0]
		neighborWeight = self.config.getStringConfig("train.neighbor.weight")[0]
		searchAlgo = self.config.getStringConfig("train.neighbor.search.algo")[0]
		leafSize = self.config.getIntConfig("train.neighbor.search.leaf.size")[0]
		distMetric = self.config.getStringConfig("train.neighbor.dist.metric")[0]
		metricPow = self.config.getFloatConfig("train.neighbor.dist.metric.pow")[0]

		model = KNeighborsClassifier(n_neighbors=numNeighbors, weights=neighborWeight, algorithm=searchAlgo,
		leaf_size=leafSize, p=metricPow, metric=distMetric)
		self.classifier = model
		return self.classifier

	def predictProb(self, recs=None):
		"""
		predict probability
		"""
		# create model
		self.prepModel()

		#input record
		if recs is None:
			featData = self.prepPredictData()
		else:
			if type(recs) is str:
				featData = self.prepStringPredictData(recs)
			else:
				featData = recs
			if (featData.ndim == 1):
				featData = featData.reshape(1, -1)

		#predict
		self.logger.info("...predicting class probability")
		clsData = self.classifier.predict_proba(featData)
		return clsData

supv/lrd.py
ADDED
@@ -0,0 +1,112 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import sklearn.linear_model
import matplotlib
import random
import jprops
from sklearn.linear_model import LogisticRegression
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *
from bacl import *

# logistic regression classification
class LogisticRegressionDiscriminant(BaseClassifier):

	def __init__(self, configFile):
		defValues = {}
		defValues["common.mode"] = ("train", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.scale.file.path"] = (None, "missing scale file path")
		defValues["common.preprocessing"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.class.field"] = (None, "missing class field ordinal")
		defValues["train.validation"] = ("kfold", None)
		defValues["train.num.folds"] = (5, None)
		defValues["train.penalty"] = ("l2", None)
		defValues["train.dual"] = (False, None)
		defValues["train.tolerance"] = (0.0001, None)
		defValues["train.regularization"] = (1.0, None)
		defValues["train.fit.intercept"] = (True, None)
		defValues["train.intercept.scaling"] = (1.0, None)
		defValues["train.class.weight"] = (None, None)
		defValues["train.random.state"] = (None, None)
		defValues["train.solver"] = ("liblinear", None)
		defValues["train.max.iter"] = (100, None)
		defValues["train.multi.class"] = ("ovr", None)
		defValues["train.verbose"] = (0, None)
		defValues["train.warm.start"] = (False, None)
		defValues["train.num.jobs"] = (None, None)
		defValues["train.l1.ratio"] = (None, None)
		defValues["train.success.criterion"] = ("error", None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["train.search.param.strategy"] = (None, None)
		defValues["train.search.params"] = (None, None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.data.fields"] = (None, "missing data field ordinals")
		defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
		defValues["predict.use.saved.model"] = (False, None)
		defValues["validate.data.file"] = (None, "missing validation data file")
		defValues["validate.data.fields"] = (None, "missing validation data field ordinals")
		defValues["validate.data.feature.fields"] = (None, "missing validation data feature field ordinals")
		defValues["validate.data.class.field"] = (None, "missing class field ordinal")
		defValues["validate.use.saved.model"] = (False, None)
		defValues["validate.score.method"] = ("accuracy", None)

		super(LogisticRegressionDiscriminant, self).__init__(configFile, defValues, __name__)

	# builds model object
	def buildModel(self):
		print("...building logistic regression model")
		penalty = self.config.getStringConfig("train.penalty")[0]
		dual = self.config.getBooleanConfig("train.dual")[0]
		tol = self.config.getFloatConfig("train.tolerance")[0]
		c = self.config.getFloatConfig("train.regularization")[0]
		fitIntercept = self.config.getBooleanConfig("train.fit.intercept")[0]
		interceptScaling = self.config.getFloatConfig("train.intercept.scaling")[0]
		classWeight = self.config.getStringConfig("train.class.weight")[0]
		randomState = self.config.getIntConfig("train.random.state")[0]
		solver = self.config.getStringConfig("train.solver")[0]
		maxIter = self.config.getIntConfig("train.max.iter")[0]
		multiClass = self.config.getStringConfig("train.multi.class")[0]
		verbos = self.config.getIntConfig("train.verbose")[0]
		warmStart = self.config.getBooleanConfig("train.warm.start")[0]
		nJobs = self.config.getIntConfig("train.num.jobs")[0]
		l1Ratio = self.config.getFloatConfig("train.l1.ratio")[0]

		self.classifier = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=c, fit_intercept=fitIntercept,\
		intercept_scaling=interceptScaling, class_weight=classWeight, random_state=randomState, solver=solver,\
		max_iter=maxIter, multi_class=multiClass, verbose=verbos, warm_start=warmStart, n_jobs=nJobs, l1_ratio=l1Ratio)

		return self.classifier

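A hedged usage sketch, analogous to the other classifiers; the config file name is an assumption and trainValidate() is again assumed to come from BaseClassifier.

# hypothetical driver; "lrd.properties" is an assumed config file name
lrd = LogisticRegressionDiscriminant("lrd.properties")
lrd.buildModel()               # plain sklearn LogisticRegression from the train.* settings
result = lrd.trainValidate()   # k fold cross validation (base class method, assumed)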
supv/lstm.py
ADDED
@@ -0,0 +1,414 @@
1 |
+
#!/usr/local/bin/python3
|
2 |
+
|
3 |
+
# avenir-python: Machine Learning
|
4 |
+
# Author: Pranab Ghosh
|
5 |
+
#
|
6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7 |
+
# may not use this file except in compliance with the License. You may
|
8 |
+
# obtain a copy of the License at
|
9 |
+
#
|
10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11 |
+
#
|
12 |
+
# Unless required by applicable law or agreed to in writing, software
|
13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
15 |
+
# implied. See the License for the specific language governing
|
16 |
+
# permissions and limitations under the License.
|
17 |
+
|
18 |
+
# Package imports
|
19 |
+
import os
|
20 |
+
import sys
|
21 |
+
import matplotlib.pyplot as plt
|
22 |
+
import numpy as np
|
23 |
+
import torch
|
24 |
+
from torch import nn
|
25 |
+
from torch.autograd import Variable
|
26 |
+
from torch.utils.data import DataLoader
|
27 |
+
from torchvision import transforms
|
28 |
+
import sklearn as sk
|
29 |
+
import matplotlib
|
30 |
+
import random
|
31 |
+
import jprops
|
32 |
+
from random import randint
|
33 |
+
sys.path.append(os.path.abspath("../lib"))
|
34 |
+
from util import *
|
35 |
+
from mlutil import *
|
36 |
+
from tnn import FeedForwardNetwork
|
37 |
+
|
38 |
+
"""
|
39 |
+
LSTM with one or more hidden layers with multi domensional data
|
40 |
+
"""
|
41 |
+
|
42 |
+
class LstmNetwork(nn.Module):
|
43 |
+
def __init__(self, configFile):
|
44 |
+
"""
|
45 |
+
In the constructor we instantiate two nn.Linear modules and assign them as
|
46 |
+
member variables.
|
47 |
+
|
48 |
+
Parameters
|
49 |
+
configFile : config file path
|
50 |
+
"""
|
51 |
+
defValues = dict()
|
52 |
+
defValues["common.mode"] = ("training", None)
|
53 |
+
defValues["common.model.directory"] = ("model", None)
|
54 |
+
defValues["common.model.file"] = (None, None)
|
55 |
+
defValues["common.preprocessing"] = (None, None)
|
56 |
+
defValues["common.scaling.method"] = ("zscale", None)
|
57 |
+
defValues["common.scaling.minrows"] = (50, None)
|
58 |
+
defValues["common.verbose"] = (False, None)
|
59 |
+
defValues["common.device"] = ("cpu", None)
|
60 |
+
defValues["train.data.file"] = (None, "missing training data file path")
|
61 |
+
defValues["train.data.type"] = ("numeric", None)
|
62 |
+
defValues["train.data.feat.cols"] = (None, "missing feature columns")
|
63 |
+
defValues["train.data.target.col"] = (None, "missing target column")
|
64 |
+
defValues["train.data.delim"] = (",", None)
|
65 |
+
defValues["train.input.size"] = (None, "missing input size")
|
66 |
+
defValues["train.hidden.size"] = (None, "missing hidden size")
|
67 |
+
defValues["train.output.size"] = (None, "missing output size")
|
68 |
+
defValues["train.num.layers"] = (1, None)
|
69 |
+
defValues["train.seq.len"] = (1, None)
|
70 |
+
defValues["train.batch.size"] = (32, None)
|
71 |
+
defValues["train.batch.first"] = (False, None)
|
72 |
+
defValues["train.drop.prob"] = (0, None)
|
73 |
+
defValues["train.optimizer"] = ("adam", None)
|
74 |
+
defValues["train.opt.learning.rate"] = (.0001, None)
|
75 |
+
defValues["train.opt.weight.decay"] = (0, None)
|
76 |
+
defValues["train.opt.momentum"] = (0, None)
|
77 |
+
defValues["train.opt.eps"] = (1e-08, None)
|
78 |
+
defValues["train.opt.dampening"] = (0, None)
|
79 |
+
defValues["train.opt.momentum.nesterov"] = (False, None)
|
80 |
+
defValues["train.opt.betas"] = ([0.9, 0.999], None)
|
81 |
+
defValues["train.opt.alpha"] = (0.99, None)
|
82 |
+
defValues["train.out.sequence"] = (True, None)
|
83 |
+
defValues["train.out.activation"] = ("sigmoid", None)
|
84 |
+
defValues["train.loss.fn"] = ("mse", None)
|
85 |
+
defValues["train.loss.reduction"] = ("mean", None)
|
86 |
+
defValues["train.grad.clip"] = (5, None)
|
87 |
+
defValues["train.num.iterations"] = (500, None)
|
88 |
+
defValues["train.save.model"] = (False, None)
|
89 |
+
defValues["valid.data.file"] = (None, "missing validation data file path")
|
90 |
+
defValues["valid.accuracy.metric"] = (None, None)
|
91 |
+
defValues["predict.data.file"] = (None, None)
|
92 |
+
defValues["predict.use.saved.model"] = (True, None)
|
93 |
+
defValues["predict.output"] = ("binary", None)
|
94 |
+
defValues["predict.feat.pad.size"] = (60, None)
|
95 |
+
|
96 |
+
self.config = Configuration(configFile, defValues)
|
97 |
+
|
98 |
+
super(LstmNetwork, self).__init__()
|
99 |
+
|
100 |
+
def getConfig(self):
|
101 |
+
return self.config
|
102 |
+
|
103 |
+
def buildModel(self):
|
104 |
+
"""
|
105 |
+
Loads configuration and builds the various piecess necessary for the model
|
106 |
+
"""
|
107 |
+
torch.manual_seed(9999)
|
108 |
+
self.verbose = self.config.getStringConfig("common.verbose")[0]
|
109 |
+
self.inputSize = self.config.getIntConfig("train.input.size")[0]
|
110 |
+
self.outputSize = self.config.getIntConfig("train.output.size")[0]
|
111 |
+
self.nLayers = self.config.getIntConfig("train.num.layers")[0]
|
112 |
+
self.hiddenSize = self.config.getIntConfig("train.hidden.size")[0]
|
113 |
+
self.seqLen = self.config.getIntConfig("train.seq.len")[0]
|
114 |
+
self.batchSize = self.config.getIntConfig("train.batch.size")[0]
|
115 |
+
self.batchFirst = self.config.getBooleanConfig("train.batch.first")[0]
|
116 |
+
dropProb = self.config.getFloatConfig("train.drop.prob")[0]
|
117 |
+
self.outSeq = self.config.getBooleanConfig("train.out.sequence")[0]
|
118 |
+
self.device = FeedForwardNetwork.getDevice(self)
|
119 |
+
|
120 |
+
#model
|
121 |
+
self.lstm = nn.LSTM(self.inputSize, self.hiddenSize, self.nLayers, dropout=dropProb, batch_first=self.batchFirst)
|
122 |
+
self.linear = nn.Linear(self.hiddenSize, self.outputSize)
|
123 |
+
outAct = self.config.getStringConfig("train.out.activation")[0]
|
124 |
+
self.outAct = FeedForwardNetwork.createActivation(outAct)
|
125 |
+
|
126 |
+
#load training data
|
127 |
+
dataFilePath = self.config.getStringConfig("train.data.file")[0]
|
128 |
+
self.fCols = self.config.getIntListConfig("train.data.feat.cols")[0]
|
129 |
+
assert len(self.fCols) == 2, "specify only start and end columns of features"
|
130 |
+
self.tCol = self.config.getIntConfig("train.data.target.col")[0]
|
131 |
+
self.delim = self.config.getStringConfig("train.data.delim")[0]
|
132 |
+
|
133 |
+
self.fData, self.tData = self.loadData(dataFilePath, self.delim, self.fCols[0],self.fCols[1], self.tCol)
|
134 |
+
self.fData = torch.from_numpy(self.fData)
|
135 |
+
self.fData = self.fData.to(self.device)
|
136 |
+
self.tData = torch.from_numpy(self.tData)
|
137 |
+
self.tData = self.tData.to(self.device)
|
138 |
+
|
139 |
+
#load validation data
|
140 |
+
vaDataFilePath = self.config.getStringConfig("valid.data.file")[0]
|
141 |
+
self.vfData, self.vtData = self.loadData(vaDataFilePath, self.delim, self.fCols[0], self.fCols[1], self.tCol)
|
142 |
+
self.vfData = torch.from_numpy(self.vfData)
|
143 |
+
self.vfData = self.vfData.to(self.device)
|
144 |
+
self.vtData = torch.from_numpy(self.vtData)
|
145 |
+
self.vtData = self.vtData.to(self.device)
|
146 |
+
|
147 |
+
self.batchSize = self.config.getIntConfig("train.batch.size")[0]
|
148 |
+
self.dataSize = self.fData.shape[0]
|
149 |
+
self.numBatch = int(self.dataSize / self.batchSize)
|
150 |
+
self.restored = False
|
151 |
+
|
152 |
+
self.to(self.device)
|
153 |
+
|
154 |
+
def loadData(self, filePath, delim, scolStart, scolEnd, targetCol):
|
155 |
+
"""
|
156 |
+
loads data for file with one sequence per line and data can be a vector
|
157 |
+
|
158 |
+
Parameters
|
159 |
+
filePath : file path
|
160 |
+
delim : field delemeter
|
161 |
+
scolStart : seq column start index
|
162 |
+
scolEnd : seq column end index
|
163 |
+
targetCol : target field col index
|
164 |
+
"""
|
165 |
+
if targetCol >= 0:
|
166 |
+
#include target column
|
167 |
+
cols = list(range(scolStart, scolEnd + 1, 1))
|
168 |
+
cols.append(targetCol)
|
169 |
+
data = np.loadtxt(filePath, delimiter=delim, usecols=cols)
|
170 |
+
#one output for whole sequence
|
171 |
+
sData = data[:, :-1]
|
172 |
+
if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
|
173 |
+
sData = self.scaleSeqData(sData)
|
174 |
+
tData = data[:, -1]
|
175 |
+
|
176 |
+
#target int (index into class labels) for classification
|
177 |
+
sData = sData.astype(np.float32)
|
178 |
+
tData = tData.astype(np.float32) if self.outputSize == 1 else tData.astype(np.long)
|
179 |
+
exData = (sData, tData)
|
180 |
+
else:
|
181 |
+
#exclude target column
|
182 |
+
cols = list(range(scolStart, scolEnd + 1, 1))
|
183 |
+
data = np.loadtxt(filePath, delimiter=delim, usecols=cols)
|
184 |
+
|
185 |
+
#one output for whole sequence
|
186 |
+
sData = data
|
187 |
+
if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
|
188 |
+
sData = self.scaleSeqData(sData)
|
189 |
+
|
190 |
+
#target int (index into class labels) for classification
|
191 |
+
sData = sData.astype(np.float32)
|
192 |
+
exData = sData
|
193 |
+
|
194 |
+
return exData
|
195 |
+
|
196 |
+
def scaleSeqData(self, sData):
|
197 |
+
"""
|
198 |
+
scales data transforming non squence format
|
199 |
+
|
200 |
+
Parameters
|
201 |
+
sData : sequence data
|
202 |
+
"""
|
203 |
+
scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
|
204 |
+
sData = fromMultDimSeqToTabular(sData, self.inputSize, self.seqLen)
|
205 |
+
sData = scaleData(sData, scalingMethod)
|
206 |
+
sData = fromTabularToMultDimSeq(sData, self.inputSize, self.seqLen)
|
207 |
+
return sData
|
208 |
+
|
209 |
+
def formattedBatchGenarator(self):
|
210 |
+
"""
|
211 |
+
transforms traing data from (dataSize, seqLength x inputSize) to (batch, seqLength, inputSize) tensor
|
212 |
+
or (seqLength, batch, inputSize) tensor
|
213 |
+
"""
|
214 |
+
|
215 |
+
for _ in range(self.numBatch):
|
216 |
+
bfData = torch.zeros([self.batchSize, self.seqLen, self.inputSize], dtype=torch.float32) if self.batchFirst\
|
217 |
+
else torch.zeros([self.seqLen, self.batchSize, self.inputSize], dtype=torch.float32)
|
218 |
+
tdType = torch.float32 if self.outputSize == 1 else torch.long
|
219 |
+
btData = torch.zeros([self.batchSize], dtype=tdType)
|
220 |
+
|
221 |
+
i = 0
|
222 |
+
for bdi in range(self.batchSize):
|
223 |
+
di = sampleUniform(0, self.dataSize-1)
|
224 |
+
row = self.fData[di]
|
225 |
+
for ci, cv in enumerate(row):
|
226 |
+
si = int(ci / self.inputSize)
|
227 |
+
ii = ci % self.inputSize
|
228 |
+
if self.batchFirst:
|
229 |
+
bfData[bdi][si][ii] = cv
|
230 |
+
else:
|
231 |
+
#print(si, bdi, ii)
|
232 |
+
bfData[si][bdi][ii] = cv
|
233 |
+
btData[i] = self.tData[di]
|
234 |
+
i += 1
|
235 |
+
|
236 |
+
#for seq output correct first 2 dimensions
|
237 |
+
if self.outSeq and not self.batchFirst:
|
238 |
+
btData = torch.transpose(btData,0,1)
|
239 |
+
|
240 |
+
yield (bfData, btData)
|
241 |
+
|
242 |
+
def formatData(self, fData, tData=None):
|
243 |
+
"""
|
244 |
+
transforms validation or prediction data data from (dataSize, seqLength x inputSize) to
|
245 |
+
(batch, seqLength, inputSize) tensor or (seqLength, batch, inputSize) tensor
|
246 |
+
|
247 |
+
Parameters
|
248 |
+
fData : feature data
|
249 |
+
tData : target data
|
250 |
+
"""
|
251 |
+
dSize = fData.shape[0]
|
252 |
+
bfData = torch.zeros([dSize, self.seqLen, self.inputSize], dtype=torch.float32) if self.batchFirst\
|
253 |
+
else torch.zeros([self.seqLen, dSize, self.inputSize], dtype=torch.float32)
|
254 |
+
|
255 |
+
for ri in range(dSize):
|
256 |
+
row = fData[ri]
|
257 |
+
for ci, cv in enumerate(row):
|
258 |
+
si = int(ci / self.inputSize)
|
259 |
+
ii = ci % self.inputSize
|
				if self.batchFirst:
					bfData[ri][si][ii] = cv
				else:
					bfData[si][ri][ii] = cv
		if tData is not None:
			btData = torch.transpose(tData,0,1) if self.outSeq and not self.batchFirst else tData
			formData = (bfData, btData)
		else:
			formData = bfData
		return formData

	def forward(self, x, h):
		"""
		Forward pass

		Parameters
		x : input data
		h : hidden state
		"""
		out, hout = self.lstm(x,h)
		if self.outSeq:
			# seq to seq prediction
			out = out.view(-1, self.hiddenSize)
			out = self.linear(out)
			if self.outAct is not None:
				out = self.outAct(out)
			out = out.view(self.batchSize * self.seqLen, -1)
		else:
			#seq to one prediction
			out = out[self.seqLen - 1].view(-1, self.hiddenSize)
			out = self.linear(out)
			if self.outAct is not None:
				out = self.outAct(out)
			#out = out.view(self.batchSize, -1)

		return out, hout

	def initHidden(self, batch):
		"""
		Initialize hidden and cell states with zeros

		Parameters
		batch : batch size
		"""
		hidden = (torch.zeros(self.nLayers,batch,self.hiddenSize),
		torch.zeros(self.nLayers,batch,self.hiddenSize))
		return hidden

	def trainLstm(self):
		"""
		train lstm
		"""
		print("..starting training")
		self.train()

		#device = self.config.getStringConfig("common.device")[0]
		#self.to(device)
		optimizerName = self.config.getStringConfig("train.optimizer")[0]
		self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizerName)
		lossFn = self.config.getStringConfig("train.loss.fn")[0]
		criterion = FeedForwardNetwork.createLossFunction(self, lossFn)
		clip = self.config.getFloatConfig("train.grad.clip")[0]
		numIter = self.config.getIntConfig("train.num.iterations")[0]
		accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]

		for it in range(numIter):
			b = 0
			for inputs, labels in self.formattedBatchGenarator():
				#forward pass
				hid = self.initHidden(self.batchSize)
				hid = (hid[0].to(self.device), hid[1].to(self.device))
				inputs, labels = inputs.to(self.device), labels.to(self.device)
				output, hid = self(inputs, hid)

				#loss
				if self.outSeq:
					labels = labels.view(self.batchSize * self.seqLen, -1)
				loss = criterion(output, labels)

				if self.verbose and it % 50 == 0 and b % 10 == 0:
					print("epoch {} batch {} loss {:.6f}".format(it, b, loss.item()))

				# zero gradients, perform a backward pass, clip gradients and update the weights
				self.optimizer.zero_grad()
				loss.backward()
				nn.utils.clip_grad_norm_(self.parameters(), clip)
				self.optimizer.step()
				b += 1

		#validate
		print("..validating model")
		self.eval()
		with torch.no_grad():
			fData, tData = self.formatData(self.vfData, self.vtData)
			fData = fData.to(self.device)
			vsize = tData.shape[0]
			hid = self.initHidden(vsize)
			hid = (hid[0].to(self.device), hid[1].to(self.device))
			yPred, _ = self(fData, hid)
			yPred = yPred.data.cpu().numpy()
			yActual = tData.data.cpu().numpy()

		if self.verbose:
			print("\npredicted \t\t actual")
			for i in range(vsize):
				print(str(yPred[i]) + "\t" + str(yActual[i]))

		score = perfMetric(accMetric, yActual, yPred)
		print(formatFloat(3, score, "perf score"))

		#save
		modelSave = self.config.getBooleanConfig("train.model.save")[0]
		if modelSave:
			FeedForwardNetwork.saveCheckpt(self)

	def predictLstm(self):
		"""
		predict
		"""
		print("..predicting using model")
		useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
		if useSavedModel:
			FeedForwardNetwork.restoreCheckpt(self)
		else:
			self.trainLstm()

		prDataFilePath = self.config.getStringConfig("predict.data.file")[0]
		pfData = self.loadData(prDataFilePath, self.delim, self.fCols[0], self.fCols[1], -1)
		pfData = torch.from_numpy(pfData)
		dsize = pfData.shape[0]

		#predict
		#device = self.config.getStringConfig("common.device")[0]
		self.eval()
		with torch.no_grad():
			fData = self.formatData(pfData)
			fData = fData.to(self.device)
			hid = self.initHidden(dsize)
			hid = (hid[0].to(self.device), hid[1].to(self.device))
			yPred, _ = self(fData, hid)
			yPred = yPred.data.cpu().numpy()

		if self.outputSize == 2:
			#classification
			yPred = FeedForwardNetwork.processClassifOutput(yPred, self.config)

		# print prediction
		FeedForwardNetwork.printPrediction(yPred, self.config, prDataFilePath)
supv/mcalib.py
ADDED
@@ -0,0 +1,384 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.neighbors import KDTree
import matplotlib
import random
import jprops
from random import randint
import statistics
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from tnn import *
from stats import *

"""
neural model calibration
"""
class ModelCalibration(object):
	def __init__(self):
		pass

	@staticmethod
	def findModelCalibration(model):
		"""
		model calibration with binned confidence and accuracy
		"""
		FeedForwardNetwork.prepValidate(model)
		FeedForwardNetwork.validateModel(model)

		yPred = model.yPred.flatten()
		yActual = model.validOutData.flatten()
		nsamp = len(yActual)

		#print(yPred.shape)
		#print(yActual.shape)

		nBins = model.config.getIntConfig("calibrate.num.bins")[0]
		prThreshhold = model.config.getFloatConfig("calibrate.pred.prob.thresh")[0]

		minConf = yPred.min()
		maxConf = yPred.max()
		bsize = (maxConf - minConf) / nBins
		#print("minConf {:.3f} maxConf {:.3f} bsize {:.3f}".format(minConf, maxConf, bsize))
		blist = list(map(lambda i : None, range(nBins)))

		#binning
		for yp, ya in zip(yPred, yActual):
			indx = int((yp - minConf) / bsize)
			if indx == nBins:
				indx = nBins - 1
			#print("yp {:.3f} indx {}".format(yp, indx))
			pair = (yp, ya)
			plist = blist[indx]
			if plist is None:
				plist = list()
				blist[indx] = plist
			plist.append(pair)

		x = list()
		y = list()
		yideal = list()
		ece = 0
		mce = 0

		# per bin confidence and accuracy
		b = 0
		for plist in blist:
			if plist is not None:
				#confidence
				ypl = list(map(lambda p : p[0], plist))
				ypm = statistics.mean(ypl)
				x.append(ypm)

				#accuracy
				ypcount = 0
				for p in plist:
					yp = 1 if p[0] > prThreshhold else 0
					if (yp == 1 and p[1] == 1):
						ypcount += 1

				acc = ypcount / len(plist)
				y.append(acc)
				yideal.append(ypm)

				ce = abs(ypm - acc)
				ece += len(plist) * ce
				if ce > mce:
					mce = ce
			else:
				ypm = minConf + (b + 0.5) * bsize
				x.append(ypm)
				yideal.append(ypm)
				y.append(0)
			b += 1

		#calibration plot
		drawPairPlot(x, y, yideal, "confidence", "accuracy", "actual", "ideal")

		print("confidence\taccuracy")
		for z in zip(x,y):
			print("{:.3f}\t{:.3f}".format(z[0], z[1]))

		#expected calibration error ECE = sum(|bin| x |conf - acc|) / nsamp and
		#maximum calibration error MCE = max of |conf - acc| over the bins
		ece /= nsamp
		print("expected calibration error\t{:.3f}".format(ece))
		print("maximum calibration error\t{:.3f}".format(mce))

	@staticmethod
	def findModelCalibrationLocal(model):
		"""
		model calibration based on k nearest neighbors
		"""
		FeedForwardNetwork.prepValidate(model)
		FeedForwardNetwork.validateModel(model)

		yPred = model.yPred.flatten()
		yActual = model.validOutData.flatten()
		nsamp = len(yActual)

		neighborCnt = model.config.getIntConfig("calibrate.num.nearest.neighbors")[0]
		prThreshhold = model.config.getFloatConfig("calibrate.pred.prob.thresh")[0]
		fData = model.validFeatData.numpy()
		tree = KDTree(fData, leaf_size=4)

		dist, ind = tree.query(fData, k=neighborCnt)
		calibs = list()
		#all data
		for si, ni in enumerate(ind):
			conf = 0
			ypcount = 0
			#all neighbors
			for i in ni:
				conf += yPred[i]
				yp = 1 if yPred[i] > prThreshhold else 0
				if (yp == 1 and yActual[i] == 1):
					ypcount += 1
			conf /= neighborCnt
			acc = ypcount / neighborCnt
			calib = (si, conf, acc)
			calibs.append(calib)

		#descending sort by difference between confidence and accuracy
		calibs = sorted(calibs, key=lambda c : abs(c[1] - c[2]), reverse=True)
		print("local calibration")
		print("conf\taccu\trecord")
		#records with the largest confidence accuracy gap
		for i in range(19):
			si, conf, acc = calibs[i]
			rec = toStrFromList(fData[si], 3)
			print("{:.3f}\t{:.3f}\t{}".format(conf, acc, rec))

	@staticmethod
	def findModelSharpness(model):
		"""
		model sharpness
		"""
		FeedForwardNetwork.prepValidate(model)
		FeedForwardNetwork.validateModel(model)

		yPred = model.yPred.flatten()
		yActual = model.validOutData.flatten()
		nsamp = len(yActual)

		#print(yPred.shape)
		#print(yActual.shape)

		nBins = model.config.getIntConfig("calibrate.num.bins")[0]
		prThreshhold = model.config.getFloatConfig("calibrate.pred.prob.thresh")[0]

		minConf = yPred.min()
		maxConf = yPred.max()
		bsize = (maxConf - minConf) / nBins
		#print("minConf {:.3f} maxConf {:.3f} bsize {:.3f}".format(minConf, maxConf, bsize))
		blist = list(map(lambda i : None, range(nBins)))

		#binning
		for yp, ya in zip(yPred, yActual):
			indx = int((yp - minConf) / bsize)
			if indx == nBins:
				indx = nBins - 1
			#print("yp {:.3f} indx {}".format(yp, indx))
			pair = (yp, ya)
			plist = blist[indx]
			if plist is None:
				plist = list()
				blist[indx] = plist
			plist.append(pair)

		y = list()
		ypgcount = 0
		# per bin confidence and accuracy
		for plist in blist:
			if plist is None:
				#empty bin contributes zero accuracy
				y.append(0)
				continue
			#ypl = list(map(lambda p : p[0], plist))
			#ypm = statistics.mean(ypl)
			#x.append(ypm)

			ypcount = 0
			for p in plist:
				yp = 1 if p[0] > prThreshhold else 0
				if (yp == 1 and p[1] == 1):
					ypcount += 1
					ypgcount += 1

			acc = ypcount / len(plist)
			y.append(acc)

		print("{} {}".format(ypgcount, nsamp))
		accg = ypgcount / nsamp
		accgl = [accg] * nBins
		x = list(range(nBins))
		drawPairPlot(x, y, accgl, "discretized confidence", "accuracy", "local", "global")

		#contrast: mean absolute deviation of per bin accuracy from global accuracy
		contrast = list(map(lambda acc : abs(acc - accg), y))
		contrast = statistics.mean(contrast)
		print("contrast {:.3f}".format(contrast))

"""
neural model robustness
"""
class ModelRobustness(object):
	def __init__(self):
		pass

	def localPerformance(self, model, fpath, nsamp, neighborCnt):
		"""
		local performance sampling
		"""

		#load data
		fData, oData = FeedForwardNetwork.prepData(model, fpath)
		#print(type(fData))
		#print(type(oData))
		#print(fData.shape)
		dsize = fData.shape[0]
		ncol = fData.shape[1]

		#kd tree for nearest neighbor query
		tree = KDTree(fData, leaf_size=4)

		scores = list()
		indices = list()
		for _ in range(nsamp):
			indx = randomInt(0, dsize - 1)
			indices.append(indx)
			frow = fData[indx]
			frow = np.reshape(frow, (1, ncol))
			dist, ind = tree.query(frow, k=neighborCnt)

			ind = ind[0]
			vfData = fData[ind]
			voData = oData[ind]

			#print(type(vfData))
			#print(vfData.shape)
			#print(type(voData))
			#print(voData.shape)

			model.setValidationData((vfData, voData), False)
			score = FeedForwardNetwork.validateModel(model)
			scores.append(score)

		#performance distribution
		m, s = basicStat(scores)
		print("model performance: mean {:.3f}\tstd dev {:.3f}".format(m,s))
		drawHist(scores, "model accuracy", "accuracy", "frequency")

		#worst performance
		lscores = sorted(zip(indices, scores), key=lambda s : s[1])
		print(lscores[:5])

		lines = getFileLines(fpath, None)
		print("worst performing feature regions")
		for i,s in lscores[:5]:
			print("score {:.3f}\t{}".format(s, lines[i]))


"""
conformal prediction for regression
"""
class ConformalRegressionPrediction(object):
	def __init__(self):
		self.calibration = dict()

	def calibrate(self, ypair, confBound):
		"""
		calibration for conformal prediction
		"""
		cscores = list()
		ymax = None
		ymin = None
		for yp, ya in ypair:
			cscore = abs(yp - ya)
			cscores.append(cscore)
			if ymax is None:
				ymax = ya
				ymin = ya
			else:
				ymax = ya if ya > ymax else ymax
				ymin = ya if ya < ymin else ymin

		#conformal score at the given confidence bound quantile
		cscores.sort()
		drawHist(cscores, "conformal score distribution", "conformal score", "frequency", 20)
		cbi = int(confBound * len(cscores))
		scoreConfBound = cscores[cbi]
		self.calibration["scoreConfBound"] = scoreConfBound
		self.calibration["ymin"] = ymin
		self.calibration["ymax"] = ymax
		print(self.calibration)

	def saveCalib(self, fPath):
		"""
		saves conformal score calibration
		"""
		saveObject(self.calibration, fPath)

	def restoreCalib(self, fPath):
		"""
		restores conformal score calibration
		"""
		self.calibration = restoreObject(fPath)
		print(self.calibration)

	def getPredRange(self, yp, nstep=100):
		"""
		get prediction range and related data
		"""
		ymin = self.calibration["ymin"]
		ymax = self.calibration["ymax"]
		step = (ymax - ymin) / nstep
		scoreConfBound = self.calibration["scoreConfBound"]

		rmin = None
		rmax = None
		rcount = 0
		#print(ymin, ymax, step)
		for ya in np.arange(ymin, ymax, step):
			cscore = abs(yp - ya)
			if cscore < scoreConfBound:
				if rmin is None:
					#lower bound
					rmin = ya
					rmax = ya
				else:
					#keep updating upper bound
					rmax = ya if ya > rmax else rmax
				rcount += 1
			else:
				if rmax is not None and rcount > 0:
					#past upper bound
					break

		res = dict()
		res["predRangeMin"] = rmin
		res["predRangeMax"] = rmax
		accepted = yp >= rmin and yp <= rmax
		res["status"] = "accepted" if accepted else "rejected"
		conf = 1.0 - (rmax - rmin) / (ymax - ymin)
		res["confidence"] = conf

		return res
supv/mcclf.py
ADDED
@@ -0,0 +1,207 @@
#!/usr/local/bin/python3

# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import math
import matplotlib.pyplot as plt
import numpy as np
import random
import jprops
from random import randint
from matumizi.util import *
from matumizi.mlutil import *

"""
Markov chain classifier
"""
class MarkovChainClassifier():
	def __init__(self, configFile):
		"""
		constructor

		Parameters
		configFile: config file path
		"""
		defValues = {}
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["common.states"] = (None, "missing state list")
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.class.labels"] = (["F", "T"], None)
		defValues["train.data.key.len"] = (1, None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.use.saved.model"] = (True, None)
		defValues["predict.log.odds.threshold"] = (0, None)
		defValues["validate.data.file"] = (None, "missing validation data file")
		defValues["validate.use.saved.model"] = (False, None)
		defValues["valid.accuracy.metric"] = ("acc", None)
		self.config = Configuration(configFile, defValues)

		self.stTranPr = dict()
		self.clabels = self.config.getStringListConfig("train.data.class.labels")[0]
		self.states = self.config.getStringListConfig("common.states")[0]
		self.nstates = len(self.states)
		for cl in self.clabels:
			stp = np.ones((self.nstates,self.nstates))
			self.stTranPr[cl] = stp

	def train(self):
		"""
		train model
		"""
		#state transition matrix, initialized with 1s for smoothing
		tdfPath = self.config.getStringConfig("train.data.file")[0]
		klen = self.config.getIntConfig("train.data.key.len")[0]
		for rec in fileRecGen(tdfPath):
			cl = rec[klen]
			rlen = len(rec)
			for i in range(klen+1, rlen-1, 1):
				fst = self.states.index(rec[i])
				tst = self.states.index(rec[i+1])
				self.stTranPr[cl][fst][tst] += 1

		#normalize each row to probability
		for cl in self.clabels:
			stp = self.stTranPr[cl]
			for i in range(self.nstates):
				s = stp[i].sum()
				r = stp[i] / s
				stp[i] = r

		#save
		if self.config.getBooleanConfig("train.model.save")[0]:
			mdPath = self.config.getStringConfig("common.model.directory")[0]
			assert os.path.exists(mdPath), "model save directory does not exist"
			mfPath = self.config.getStringConfig("common.model.file")[0]
			mfPath = os.path.join(mdPath, mfPath)

			with open(mfPath, "w") as fh:
				for cl in self.clabels:
					fh.write("label:" + cl +"\n")
					stp = self.stTranPr[cl]
					for r in stp:
						rs = ",".join(toStrList(r, 6)) + "\n"
						fh.write(rs)

	def validate(self):
		"""
		validate using model
		"""
		useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
		if useSavedModel:
			self.__restoreModel()
		else:
			self.train()

		vdfPath = self.config.getStringConfig("validate.data.file")[0]
		accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]

		yac, ypr = self.__getPrediction(vdfPath, True)
		if type(self.clabels[0]) == str:
			yac = self.__toIntClabel(yac)
			ypr = self.__toIntClabel(ypr)
		score = perfMetric(accMetric, yac, ypr)
		print(formatFloat(3, score, "perf score"))


	def predict(self):
		"""
		predict using model
		"""
		useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
		if useSavedModel:
			self.__restoreModel()
		else:
			self.train()

		#predict
		pdfPath = self.config.getStringConfig("predict.data.file")[0]
		_ , ypr = self.__getPrediction(pdfPath)
		return ypr

	def __restoreModel(self):
		"""
		restore model
		"""
		mdPath = self.config.getStringConfig("common.model.directory")[0]
		assert os.path.exists(mdPath), "model save directory does not exist"
		mfPath = self.config.getStringConfig("common.model.file")[0]
		mfPath = os.path.join(mdPath, mfPath)
		stp = None
		cl = None
		for rec in fileRecGen(mfPath):
			if len(rec) == 1:
				if stp is not None:
					stp = np.array(stp)
					self.stTranPr[cl] = stp
				cl = rec[0].split(":")[1]
				stp = list()
			else:
				frec = asFloatList(rec)
				stp.append(frec)

		stp = np.array(stp)
		self.stTranPr[cl] = stp

	def __getPrediction(self, fpath, validate=False):
		"""
		get predictions

		Parameters
		fpath : data file path
		validate: True if validation
		"""
		nc = self.clabels[0]
		pc = self.clabels[1]
		thold = self.config.getFloatConfig("predict.log.odds.threshold")[0]
		klen = self.config.getIntConfig("train.data.key.len")[0]
		offset = klen+1 if validate else klen
		ypr = list()
		yac = list()
		for rec in fileRecGen(fpath):
			lodds = 0
			rlen = len(rec)
			#accumulate log odds of transition probabilities for the 2 classes
			for i in range(offset, rlen-1, 1):
				fst = self.states.index(rec[i])
				tst = self.states.index(rec[i+1])
				odds = self.stTranPr[pc][fst][tst] / self.stTranPr[nc][fst][tst]
				lodds += math.log(odds)
			prc = pc if lodds > thold else nc
			ypr.append(prc)
			if validate:
				yac.append(rec[klen])
			else:
				recp = prc + "\t" + ",".join(rec)
				print(recp)

		re = (yac, ypr)
		return re

	def __toIntClabel(self, labels):
		"""
		convert string class label to int

		Parameters
		labels : class label values
		"""
		return list(map(lambda l : self.clabels.index(l), labels))
supv/nlm.py
ADDED
@@ -0,0 +1,434 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

import os
import sys
import math
from random import randint
import random
import time
from datetime import datetime
import re, string, unicodedata
import spacy
import torch
from collections import defaultdict
import pickle
import numpy as np
from sentence_transformers import CrossEncoder

sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *

"""
neural language model
"""

class NeuralLangModel(object):
	def __init__(self):
		"""
		initialize
		"""
		self.dexts = None

	def loadDocs(self, fpaths):
		"""
		loads documents from one file, a list of files or all files under a directory
		"""
		fPaths = fpaths.split(",")
		if len(fPaths) == 1:
			if os.path.isfile(fPaths[0]):
				#one file
				print("got one file from path")
				dnames = fPaths
				docStr = getOneFileContent(fPaths[0])
				dtexts = [docStr]
			else:
				#all files under directory
				print("got all files under directory from path")
				dtexts, dnames = getFileContent(fPaths[0])
				print("found following files")
				for dt, dn in zip(dtexts, dnames):
					print(dn + "\t" + dt[:40])
		else:
			#list of files
			print("got list of files from path")
			dnames = fPaths
			dtexts = list(map(getOneFileContent, fPaths))

		ndocs = (dtexts, dnames)
		return ndocs

#Encoded doc
class EncodedDoc:
	def __init__(self, dtext, dname, drank=None):
		"""
		initialize
		"""
		self.dtext = dtext
		self.dname = dname
		self.drank = drank
		self.denc = None
		self.score = None

	def encode(self, nlp):
		"""
		encode
		"""
		self.denc = nlp(self.dtext)

#similarity at token and sentence level for BERT encoding
class SemanticSearch:
	def __init__(self, docs=None):
		"""
		initialize
		"""
		print("loading BERT transformer model")
		self.nlp = spacy.load("en_trf_bertbaseuncased_lg")
		self.docs = docs if docs is not None else list()

	def docAv(self, qu, doc):
		"""
		whole doc similarity (ds)
		"""
		return qu.similarity(doc)

	def tokSimAv(self, qu, doc):
		"""
		token pair wise average (tsa)
		"""
		qts = self.simAll(qu, doc)
		asi = np.mean(qts)
		return asi

	def tokSimMed(self, qu, doc):
		"""
		token pair wise median (tsme)
		"""
		qts = self.simAll(qu, doc)
		asi = np.median(qts)
		return asi

	def tokSimMax(self, qu, doc):
		"""
		token pair wise max (tsma)
		"""
		qte = self.__getTensor(qu)
		dte = self.__getTensor(doc)
		return self.simMax(qte, dte)

	def tokSimAvMax(self, qu, doc):
		"""
		token max then average (tsavm)
		"""
		qte = self.__getTensor(qu)
		dte = self.__getTensor(doc)
		return self.simAvMax(qte, dte)

	def tokSimMaxAv(self, qu, doc):
		"""
		token average and then max (tsmav)
		"""
		qte = self.__getTensor(qu)
		dte = self.__getTensor(doc)
		return self.simMaxAv(qte, dte)

	def sentSimAv(self, qu, doc):
		"""
		sentence wise average (ssa)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		sims = self.simAll(qse, dse)
		return np.mean(sims)

	def sentSimMed(self, qu, doc):
		"""
		sentence wise median (ssme)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		sims = self.simAll(qse, dse)
		return np.median(sims)

	def sentSimMax(self, qu, doc):
		"""
		sentence wise max (ssma)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		sims = self.simAll(qse, dse)
		return np.max(sims)


	def sentSimAvMax(self, qu, doc):
		"""
		sentence max then average (ssavm)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		return self.simAvMax(qse, dse)

	def sentSimMaxAv(self, qu, doc):
		"""
		sentence average and then max (ssmav)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		return self.simMaxAv(qse, dse)

	def simMax(self, qte, dte):
		"""
		max similarity between 2 elements
		"""
		msi = 0
		for qt in qte:
			for dt in dte:
				si = cosineSimilarity(qt, dt)
				if not math.isnan(si) and si > msi:
					msi = si
		return msi

	def simAvMax(self, qte, dte):
		"""
		max then average
		"""
		qts = list()
		for qt in qte:
			msi = 0
			for dt in dte:
				si = cosineSimilarity(qt, dt)
				if not math.isnan(si) and si > msi:
					msi = si
			qts.append(msi)

		amsi = np.mean(np.array(qts))
		return amsi

	def simMaxAv(self, lqe, lde):
		"""
		average and then max
		"""
		masi = 0
		for qe in lqe:
			qes = list()
			for de in lde:
				si = cosineSimilarity(qe, de)
				if not math.isnan(si):
					qes.append(si)
			av = np.mean(np.array(qes))
			if av > masi:
				masi = av
		return masi

	def simAll(self, lqe, lde):
		"""
		all pair wise similarities
		"""
		qes = list()
		for qe in lqe:
			for de in lde:
				si = cosineSimilarity(qe, de)
				if not math.isnan(si):
					qes.append(si)
		return np.array(qes)

	def __sentEnc(self, qu, doc):
		"""
		sentence encoding for query and doc
		"""
		qstr = qu._.trf_word_pieces_
		qte = zip(qstr, qu._.trf_last_hidden_state)
		qse = list()
		for t, v in qte:
			if t == "[CLS]":
				qse.append(v)


		dstr = doc._.trf_word_pieces_
		dte = zip(dstr, doc._.trf_last_hidden_state)
		dse = list()
		for t, v in dte:
			if t == "[CLS]":
				dse.append(v)

		enp = (np.array(qse), np.array(dse))
		return enp

	def __getTensor(self, toks):
		"""
		tensors from tokens
		"""
		return list(map(lambda t: t.tensor, toks))

	def addDocs(self, docs):
		"""
		add named doc content
		"""
		self.docs.extend(docs)

	def loadDocs(self, fpaths):
		"""
		loads documents from one file, a list of files or all files under a directory
		"""
		fPaths = fpaths.split(",")
		if len(fPaths) == 1:
			if os.path.isfile(fPaths[0]):
				#one file
				print("one file")
				dnames = fPaths
				docStr = getOneFileContent(fPaths[0])
				dtexts = [docStr]
			else:
				#all files under directory
				print("all files under directory")
				dtexts, dnames = getFileContent(fPaths[0])
				print("found following files")
				for dt, dn in zip(dtexts, dnames):
					print(dn + "\t" + dt[:40])
		else:
			#list of files
			print("list of files")
			dnames = fPaths
			dtexts = list(map(getOneFileContent, fPaths))

		docs = list(map(lambda dd : EncodedDoc(dd[0], dd[1]), zip(dtexts, dnames)))
		self.docs.extend(docs)

	def search(self, qstr, algo, gdranks=None):
		"""
		searches all added documents, scoring each with the given similarity algo
		"""
		qv = self.nlp(qstr)
		res = list()
		for d in self.docs:
			dn = d.dname
			if d.denc is None:
				d.encode(self.nlp)
			dv = d.denc
			if algo == "ds":
				si = self.docAv(qv, dv)
			elif algo == "tsa":
				si = self.tokSimAv(qv, dv)
			elif algo == "tsme":
				si = self.tokSimMed(qv, dv)
			elif algo == "tsma":
				si = self.tokSimMax(qv, dv)
			elif algo == "tsavm":
				si = self.tokSimAvMax(qv, dv)
			elif algo == "tsmav":
				si = self.tokSimMaxAv(qv, dv)
			elif algo == "ssa":
				si = self.sentSimAv(qv, dv)
			elif algo == "ssme":
				si = self.sentSimMed(qv, dv)
			elif algo == "ssma":
				si = self.sentSimMax(qv, dv)
			elif algo == "ssavm":
				si = self.sentSimAvMax(qv, dv)
			elif algo == "ssmav":
				si = self.sentSimMaxAv(qv, dv)
			else:
				si = -1.0
				print("invalid similarity algo")

			#print("{} score {:.6f}".format(dn, si))
			d.score = si
			r = (dn, si)
			res.append(r)

		#search score for each document
		res.sort(key=lambda r : r[1], reverse=True)
		print("\nsorted search result")
		print("query: {} matching algo: {}".format(qstr, algo))
		for r in res:
			print("{} score {:.3f}".format(r[0], r[1]))

		#rank order if gold truth rank provided
		if gdranks is not None:
			i = 0
			count = 0
			for d in gdranks:
				while i < len(gdranks):
					if d == res[i][0]:
						count += 1
						i += 1
						break
					i += 1
			ro = count / len(gdranks)
			print("rank order {:.3f}".format(ro))

#similarity at passage or paragraph level using sbert cross encoder
class SemanticSimilaityCrossEnc(NeuralLangModel):

	def __init__(self, docs=None):
		self.dparas = None
		self.scores = None
		print("loading cross encoder")
		self.model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2")
		print("done loading cross encoder")
		super(SemanticSimilaityCrossEnc, self).__init__()

	def paraSimilarity(self, dtext, fpaths, minParNl=1):
		"""
		returns paragraph pair similarity across 2 documents
		"""
		dtexts, dnames = self.loadDocs(fpaths)
		if dtext is None:
			assertEqual(len(dtexts), 2, "exactly 2 files needed")
			self.dtexts = dtexts
		else:
			assertEqual(len(dtexts), 1, "exactly 1 file needed")
			self.dtexts = list()
			self.dtexts.append(dtext)
			self.dtexts.append(dtexts[0])


		self.dparas = list()
		for text in self.dtexts:
			regx = "\n+" if minParNl == 1 else "\n{2,}"
			paras = re.split(regx, text.replace("\r\n", "\n"))
			print("no of paras {}".format(len(paras)))
			self.dparas.append(paras)

		tinp = list()
		for para1 in self.dparas[0]:
			inp = list(map(lambda para2: [para1, para2], self.dparas[1]))
			tinp.extend(inp)

		print("input shape " + str(np.array(tinp).shape))
		scores = self.model.predict(tinp)
		print("score shape " + str(np.array(scores).shape))
		#assertEqual(len(scores), len(self.dparas[0]) * len(self.dparas[1]), "no of scores don't match no of paragraph pairs")
		print(scores)

		i = 0
		print("text paragraph pair wise similarity")
		for para1 in self.dparas[0]:
			for para2 in self.dparas[1]:
				print("first: {}\t second: {}\t score: {:.6f}".format(para1[:20], para2[:20], scores[i]))
				i += 1

		self.scores = scores

	def avMaxScore(self):
		"""
		average of max scores (not implemented yet)
		"""
		pass

def ner(text, nlp):
	#named entity recognition with spaCy
	#nlp = spacy.load("en_core_web_md")
	doc = nlp(text)
	for ent in doc.ents:
		print(ent.text, ent.start_char, ent.end_char, ent.label_)
supv/optunar.py
ADDED
@@ -0,0 +1,127 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import torch
from torch.utils.data import DataLoader
import random
import jprops
from random import randint
import optuna
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from tnn import *

"""
neural network hyperparameter tuning with optuna
"""

def createTunerConfig(configFile):
	"""
	create tuner config object
	"""
	defValues = dict()
	defValues["train.num.layers"] = ([2,4], None)
	defValues["train.num.units"] = (None, "missing range of number of units")
	defValues["train.activation"] = ("relu", None)
	defValues["train.batch.normalize"] = (["true", "false"], None)
	defValues["train.dropout.prob"] = ([-0.1, 0.5], None)
	defValues["train.out.num.units"] = (None, "missing number of output units")
	defValues["train.out.activation"] = (None, "missing output activation")
	defValues["train.batch.size"] = ([16, 128], None)
	defValues["train.opt.learning.rate"] = ([.0001, .005], None)

	config = Configuration(configFile, defValues)
	return config

def showStudyResults(study):
	"""
	shows study results
	"""
	print("Number of finished trials: ", len(study.trials))
	print("Best trial:")
	trial = study.best_trial
	print("Value: ", trial.value)
	print("Params: ")
	for key, value in trial.params.items():
		print(" {}: {}".format(key, value))


def objective(trial, networkType, modelConfigFile, tunerConfigFile):
	"""
	optuna based hyperparameter tuning for neural network
	"""
	tConfig = createTunerConfig(tunerConfigFile)

	#tuning parameters
	nlayers = tConfig.getIntListConfig("train.num.layers")[0]
	nunits = tConfig.getIntListConfig("train.num.units")[0]
	act = tConfig.getStringConfig("train.activation")[0]
	dropOutRange = tConfig.getFloatListConfig("train.dropout.prob")[0]
	outNunits = tConfig.getIntConfig("train.out.num.units")[0]
	outAct = tConfig.getStringConfig("train.out.activation")[0]
	batchSizes = tConfig.getIntListConfig("train.batch.size")[0]
	learningRates = tConfig.getFloatListConfig("train.opt.learning.rate")[0]

	numLayers = trial.suggest_int("numLayers", nlayers[0], nlayers[1])

	#batch normalize on for all layers or none
	batchNormOptions = ["true", "false"]
	batchNorm = trial.suggest_categorical("batchNorm", batchNormOptions)

	layerConfig = ""
	maxUnits = nunits[1]
	sep = ":"
	for i in range(numLayers):
		if i < numLayers - 1:
			nunit = trial.suggest_int("numUnits_l{}".format(i), nunits[0], maxUnits)
			dropOut = trial.suggest_float("dropOut_l{}".format(i), dropOutRange[0], dropOutRange[1])
			lconfig = [str(nunit), act, batchNorm, "true", "{:.3f}".format(dropOut)]
			lconfig = sep.join(lconfig) + ","
			maxUnits = nunit
		else:
			lconfig = [str(outNunits), outAct, "false", "false", "{:.3f}".format(-0.1)]
			lconfig = sep.join(lconfig)
		layerConfig = layerConfig + lconfig

	batchSize = trial.suggest_int("batchSize", batchSizes[0], batchSizes[1])
	learningRate = trial.suggest_float("learningRate", learningRates[0], learningRates[1])

	#train model
	nnModel = FeedForwardNetwork(modelConfigFile)
	nnModel.setConfigParam("train.layer.data", layerConfig)
	nnModel.setConfigParam("train.batch.size", batchSize)
	nnModel.setConfigParam("train.opt.learning.rate", learningRate)
	nnModel.buildModel()
	score = FeedForwardNetwork.batchTrain(nnModel)
	return score

if __name__ == "__main__":
	assert len(sys.argv) == 5, "requires 4 command line args"

	networkType = sys.argv[1]
	modelConfigFile = sys.argv[2]
	tunerConfigFile = sys.argv[3]
	numTrial = int(sys.argv[4])

	study = optuna.create_study()
	study.optimize(lambda trial: objective(trial, networkType, modelConfigFile, tunerConfigFile), n_trials=numTrial)

	showStudyResults(study)
supv/pasearch.py
ADDED
@@ -0,0 +1,243 @@
#!/Users/pranab/Tools/anaconda/bin/python

# Package imports
import os
import sys
import numpy as np
import sklearn as sk
import random
import jprops
import abc
import math
sys.path.append(os.path.abspath("../lib"))
from util import *

#base parameter search
class BaseParameterSearch(object):
	__metaclass__ = abc.ABCMeta

	def __init__(self, verbose):
		self.verbose = verbose
		self.parameters = []
		self.paramData = {}
		self.currentParams = []
		self.curIter = 0
		self.bestSolution = None

	# add param name and type
	def addParam(self, param):
		self.parameters.append(param)

	# add param data
	def addParamVaues(self, paramName, paramData):
		self.paramData[paramName] = paramData

	# max iterations
	def setMaxIter(self, maxIter):
		self.maxIter = maxIter

	@abc.abstractmethod
	def prepare(self):
		pass

	@abc.abstractmethod
	def nextParamValues(self):
		pass

	@abc.abstractmethod
	def setCost(self, cost):
		pass

	# get best solution
	def getBestSolution(self):
		return self.bestSolution

#enumerate through provided list of param values
class GuidedParameterSearch:
	def __init__(self, verbose=False):
		self.verbose = verbose
		self.parameters = []
		self.paramData = {}
		self.paramIndexes = []
		self.numParamValues = []
		self.currentParams = []
		self.bestSolution = None

	# max iterations
	def setMaxIter(self, maxIter):
		self.maxIter = maxIter

	# add param name and type
	def addParam(self, param):
		self.parameters.append(param)

	# add param data
	def addParamVaues(self, paramName, paramData):
		self.paramData[paramName] = paramData

	# prepare
	def prepare(self):
		self.numParams = len(self.parameters)
		for i in range(self.numParams):
			self.paramIndexes.append(0)

			#number of values for each parameter
			paramName = self.parameters[i][0]
			self.numParamValues.append(len(self.paramData[paramName]))
		self.curParamIndex = 0

		paramValueCombList = []
		paramValueComb = []
		paramValueCombList.append(paramValueComb)

		# all params
		for i in range(self.numParams):
			paramValueCombListTemp = []
			for paramValueComb in paramValueCombList:
				# all param values
				for j in range(self.numParamValues[i]):
					paramValueCombTemp = paramValueComb[:]
					paramValueCombTemp.append(j)
					paramValueCombListTemp.append(paramValueCombTemp)
			paramValueCombList = paramValueCombListTemp
		self.paramValueCombList = paramValueCombList
		self.numParamValueComb = len(self.paramValueCombList)
		self.curParamValueCombIndx = 0

	# next param combination
	def nextParamValues(self):
		retParamNameValue = None
		if self.curParamValueCombIndx < len(self.paramValueCombList):
			retParamNameValue = []
			curParams = self.paramValueCombList[self.curParamValueCombIndx]
			print (curParams)
			for i in range(len(curParams)):
				paramName = self.parameters[i][0]
				paramValue = self.paramData[paramName][curParams[i]]
				retParamNameValue.append((paramName, paramValue))
			self.curParamValueCombIndx = self.curParamValueCombIndx + 1
			self.currentParams = retParamNameValue
		return retParamNameValue

	# set cost of current parameter set
	def setCost(self, cost):
		if self.bestSolution is not None:
			if cost < self.bestSolution[1]:
				self.bestSolution = (self.currentParams, cost)
		else:
			self.bestSolution = (self.currentParams, cost)

	# get best solution
	def getBestSolution(self):
		return self.bestSolution

#random search through provided list of parameter values
class RandomParameterSearch(BaseParameterSearch):
	def __init__(self, verbose=False):
		super(RandomParameterSearch, self).__init__(verbose)


	# prepare
	def prepare(self):
		pass

	# next param combination
	def nextParamValues(self):
		retParamNameValue = None
		if (self.curIter < self.maxIter):
			retParamNameValue = []
			for pName, pValues in self.paramData.items():
				pValue = selectRandomFromList(pValues)
				retParamNameValue.append((pName, pValue))
			self.curIter = self.curIter + 1
			self.currentParams = retParamNameValue
		return retParamNameValue

	# set cost of current parameter set
	def setCost(self, cost):
		if self.bestSolution is not None:
			if cost < self.bestSolution[1]:
				self.bestSolution = (self.currentParams, cost)
		else:
			self.bestSolution = (self.currentParams, cost)

#simulated annealing search through provided list of parameter values
class SimulatedAnnealingParameterSearch(BaseParameterSearch):
	def __init__(self, verbose=False):
		self.curSolution = None
		self.nextSolution = None
		super(SimulatedAnnealingParameterSearch, self).__init__(verbose)

	# prepare
	def prepare(self):
		pass

	def setTemp(self, temp):
		self.temp = temp

	def setTempReductionRate(self, tempRedRate):
		self.tempRedRate = tempRedRate

	# next param combination
	def nextParamValues(self):
		retParamNameValue = None
		if (self.curIter == 0):
			#initial random solution
			retParamNameValue = []
			for pName, pValues in self.paramData.items():
				pValue = selectRandomFromList(pValues)
				retParamNameValue.append((pName, pValue))
			self.curIter = self.curIter + 1
			self.currentParams = retParamNameValue
		elif (self.curIter < self.maxIter):
			#perturb current solution
			retParamNameValue = []

			#randomly mutate one parameter value
			(pNameSel, pValue) = selectRandomFromList(self.currentParams)
			pValueNext = selectRandomFromList(self.paramData[pNameSel])
			while (pValueNext == pValue):
				pValueNext = selectRandomFromList(self.paramData[pNameSel])

			#copy
			for (pName, pValue) in self.currentParams:
				if (pName == pNameSel):
					pValueNew = pValueNext
				else:
					pValueNew = pValue
				retParamNameValue.append((pName, pValueNew))
			self.curIter = self.curIter + 1
			self.currentParams = retParamNameValue
		return retParamNameValue

	# set cost of current parameter set
	def setCost(self, cost):
		if self.curSolution is None:
			self.curSolution = (self.currentParams, cost)
			self.bestSolution = (self.currentParams, cost)
		else:
			self.nextSolution = (self.currentParams, cost)
			if (self.nextSolution[1] < self.curSolution[1]):
				if (self.verbose):
					print ("next soln better")
				self.curSolution = self.nextSolution
				if (self.nextSolution[1] < self.bestSolution[1]):
					if (self.verbose):
						print ("next soln better than best")
					self.bestSolution = self.nextSolution
			else:
				if (self.verbose):
					print ("next soln worse")
				pr = math.exp((self.curSolution[1] - self.nextSolution[1]) / self.temp)
				if (pr > random.random()):
					self.curSolution = self.nextSolution
					if (self.verbose):
						print ("next soln worse but accepted")
				else:
					if (self.verbose):
						print ("next soln worse and rejected")

		self.temp = self.temp * self.tempRedRate
supv/regress.py
ADDED
@@ -0,0 +1,253 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import matplotlib
import random
import jprops
import joblib
from io import StringIO
from random import randint
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *

class BaseRegressor(object):
	"""
	base regression class
	"""

	def __init__(self, configFile, defValues):
		"""
		initializer
		"""
		defValues["common.mode"] = ("train", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.scale.file.path"] = (None, "missing scale file path")
		defValues["common.preprocessing"] = (None, None)
		defValues["common.verbose"] = (False, None)
		#defaults for keys that are read later in this class
		defValues["common.logging.file"] = (None, None)
		defValues["common.logging.level"] = ("info", None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.out.field"] = (None, "missing out field ordinal")
		defValues["train.model.save"] = (False, None)
		defValues["predict.use.saved.model"] = (False, None)

		self.config = Configuration(configFile, defValues)
		self.featData = None
		self.outData = None
		self.regressor = None
		self.verbose = self.config.getBooleanConfig("common.verbose")[0]
		self.mode = self.config.getStringConfig("common.mode")[0]
		logFilePath = self.config.getStringConfig("common.logging.file")[0]
		logLevName = self.config.getStringConfig("common.logging.level")[0]
		self.logger = createLogger(__name__, logFilePath, logLevName)
		self.logger.info("********* starting session")

	def initConfig(self, configFile, defValues):
		"""
		initialize config
		"""
		self.config = Configuration(configFile, defValues)

	def getConfig(self):
		"""
		get config object
		"""
		return self.config

	def setConfigParam(self, name, value):
		"""
		set config param
		"""
		self.config.setParam(name, value)

	def getMode(self):
		"""
		get mode
		"""
		return self.mode

	def train(self):
		"""
		train model
		"""
		#build model
		self.buildModel()

		# training data
		if self.featData is None:
			(featData, outData) = self.prepData("train")
			(self.featData, self.outData) = (featData, outData)
		else:
			(featData, outData) = (self.featData, self.outData)

		# parameters
		modelSave = self.config.getBooleanConfig("train.model.save")[0]

		#train
		self.logger.info("...training model")
		self.regressor.fit(featData, outData)
		rsqScore = self.regressor.score(featData, outData)
		coef = self.regressor.coef_
		intc = self.regressor.intercept_
		result = (rsqScore, intc, coef)

		if modelSave:
			self.logger.info("...saving model")
			modelFilePath = self.getModelFilePath()
			joblib.dump(self.regressor, modelFilePath)
		return result

	def validate(self):
		"""
		validate model with separate data
		"""
		# create model
		self.prepModel()

		# prepare test data
		(featData, outDataActual) = self.prepData("validate")

		#predict
		self.logger.info("...predicting")
		outDataPred = self.regressor.predict(featData)

		#error
		rsqScore = self.regressor.score(featData, outDataActual)
		result = (outDataPred, rsqScore)
		return result

	def predict(self):
		"""
		predict using trained model
		"""
		# create model
		self.prepModel()

		# prepare test data
		featData = self.prepData("predict")[0]

		#predict
		self.logger.info("...predicting")
		outData = self.regressor.predict(featData)
		return outData

	def prepData(self, mode):
		"""
		loads and prepares data for training and validation
		"""
		# parameters
		key = mode + ".data.file"
		dataFile = self.config.getStringConfig(key)[0]

		key = mode + ".data.fields"
		fieldIndices = self.config.getStringConfig(key)[0]
		if not fieldIndices is None:
			fieldIndices = strToIntArray(fieldIndices, ",")

		key = mode + ".data.feature.fields"
		featFieldIndices = self.config.getStringConfig(key)[0]
		if not featFieldIndices is None:
			featFieldIndices = strToIntArray(featFieldIndices, ",")

		if not mode == "predict":
			key = mode + ".data.out.field"
			outFieldIndex = self.config.getIntConfig(key)[0]

		#load data
		(data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
		if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
			featData = sk.preprocessing.scale(featData)
		outData = None
		if not mode == "predict":
			outData = extrColumns(data, outFieldIndex)
		return (featData, outData)

	def prepModel(self):
		"""
		load saved model or train model
		"""
		useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
		if (useSavedModel and not self.regressor):
			# load saved model
			self.logger.info("...loading saved model")
			modelFilePath = self.getModelFilePath()
			self.regressor = joblib.load(modelFilePath)
		else:
			# train model
			self.train()

	def getModelFilePath(self):
		"""
		gets model file path from common.model.directory and common.model.file
		"""
		modelDirectory = self.config.getStringConfig("common.model.directory")[0]
		modelFile = self.config.getStringConfig("common.model.file")[0]
		if modelFile is None:
			exitWithMsg("missing model file name")
		return modelDirectory + "/" + modelFile

class LinearRegressor(BaseRegressor):
	"""
	linear regression
	"""
	def __init__(self, configFile):
		defValues = {}
		defValues["train.normalize"] = (False, None)

		super(LinearRegressor, self).__init__(configFile, defValues)

	def buildModel(self):
		"""
		builds model object
		"""
		self.logger.info("...building linear regression model")
		normalize = self.config.getBooleanConfig("train.normalize")[0]
		#note: the normalize parameter was removed in scikit-learn 1.2; with newer
		#versions scale the features in preprocessing instead
		self.regressor = LinearRegression(normalize=normalize)

class ElasticNetRegressor(BaseRegressor):
	"""
	elastic net regression
	"""
	def __init__(self, configFile):
		defValues = {}
		defValues["train.alpha"] = (1.0, None)
		defValues["train.loneratio"] = (0.5, None)
		defValues["train.normalize"] = (False, None)
		defValues["train.precompute"] = (False, None)
		defValues["train.max.iter"] = (1000, None)
		defValues["train.tol"] = (0.0001, None)
		defValues["train.random.state"] = (None, None)
		defValues["train.selection"] = ("cyclic", None)

		super(ElasticNetRegressor, self).__init__(configFile, defValues)

	def buildModel(self):
		"""
		builds model object
		"""
		self.logger.info("...building elastic net regression model")
		alpha = self.config.getFloatConfig("train.alpha")[0]
		loneratio = self.config.getFloatConfig("train.loneratio")[0]
		normalize = self.config.getBooleanConfig("train.normalize")[0]
		precompute = self.config.getBooleanConfig("train.precompute")[0]
		maxIter = self.config.getIntConfig("train.max.iter")[0]
		tol = self.config.getFloatConfig("train.tol")[0]
		randState = self.config.getIntConfig("train.random.state")[0]
		selection = self.config.getStringConfig("train.selection")[0]

		self.regressor = ElasticNet(alpha=alpha, l1_ratio=loneratio, normalize=normalize, precompute=precompute,
		max_iter=maxIter, tol=tol, random_state=randState, selection=selection)

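Both regressor classes are driven entirely by a jprops properties file whose keys match the defValues registered above. Below is a minimal sketch of driving LinearRegressor end to end; the file name lr.properties, the key values in the comment, and the import path are illustrative assumptions, not part of the commit:

# lr.properties (illustrative):
#   common.mode=train
#   train.data.file=prices.csv
#   train.data.fields=0,1,2
#   train.data.feature.fields=0,1
#   train.data.out.field=2
#   train.model.save=True
from regress import LinearRegressor   # assumed import path for supv/regress.py

regressor = LinearRegressor("lr.properties")
if regressor.getMode() == "train":
	rsq, intercept, coef = regressor.train()
	print("R squared %.3f" % rsq)
	print(intercept, coef)
else:
	print(regressor.predict())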
supv/rf.py
ADDED
@@ -0,0 +1,134 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import matplotlib
import random
import jprops
from sklearn.ensemble import RandomForestClassifier
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *
from bacl import *


# random forest classification
class RandomForest(BaseClassifier):
	def __init__(self, configFile):
		defValues = {}
		defValues["common.mode"] = ("training", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.preprocessing"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.class.field"] = (None, "missing class field ordinal")
		defValues["train.validation"] = ("kfold", None)
		defValues["train.num.folds"] = (5, None)
		defValues["train.num.trees"] = (100, None)
		defValues["train.split.criterion"] = ("gini", None)
		defValues["train.max.depth"] = (None, None)
		defValues["train.min.samples.split"] = (4, None)
		defValues["train.min.samples.leaf"] = (2, None)
		defValues["train.min.weight.fraction.leaf"] = (0, None)
		defValues["train.max.features"] = ("auto", None)
		defValues["train.max.leaf.nodes"] = (None, None)
		defValues["train.min.impurity.decrease"] = (0, None)
		defValues["train.bootstrap"] = (True, None)
		defValues["train.oob.score"] = (False, None)
		defValues["train.num.jobs"] = (1, None)
		defValues["train.random.state"] = (None, None)
		defValues["train.verbose"] = (0, None)
		defValues["train.warm.start"] = (False, None)
		defValues["train.success.criterion"] = ("error", None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["train.search.param.strategy"] = (None, None)
		defValues["train.search.params"] = (None, None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.data.fields"] = (None, "missing data field ordinals")
		defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
		defValues["predict.use.saved.model"] = (False, None)
		defValues["validate.data.file"] = (None, "missing validation data file")
		defValues["validate.data.fields"] = (None, "missing validation data field ordinals")
		defValues["validate.data.feature.fields"] = (None, "missing validation data feature field ordinals")
		defValues["validate.data.class.field"] = (None, "missing class field ordinal")
		defValues["validate.use.saved.model"] = (False, None)
		defValues["validate.score.method"] = ("accuracy", None)

		super(RandomForest, self).__init__(configFile, defValues, __name__)

	# builds model object
	def buildModel(self):
		self.logger.info("...building random forest model")
		numTrees = self.config.getIntConfig("train.num.trees")[0]
		splitCriterion = self.config.getStringConfig("train.split.criterion")[0]
		maxDepth = self.config.getStringConfig("train.max.depth")[0]
		maxDepth = typedValue(maxDepth)
		minSamplesSplit = self.config.getStringConfig("train.min.samples.split")[0]
		minSamplesSplit = typedValue(minSamplesSplit)
		minSamplesLeaf = self.config.getStringConfig("train.min.samples.leaf")[0]
		minSamplesLeaf = typedValue(minSamplesLeaf)
		minWeightFractionLeaf = self.config.getFloatConfig("train.min.weight.fraction.leaf")[0]
		maxFeatures = self.config.getStringConfig("train.max.features")[0]
		maxFeatures = typedValue(maxFeatures)
		maxLeafNodes = self.config.getIntConfig("train.max.leaf.nodes")[0]
		minImpurityDecrease = self.config.getFloatConfig("train.min.impurity.decrease")[0]
		bootstrap = self.config.getBooleanConfig("train.bootstrap")[0]
		oobScore = self.config.getBooleanConfig("train.oob.score")[0]
		numJobs = self.config.getIntConfig("train.num.jobs")[0]
		randomState = self.config.getIntConfig("train.random.state")[0]
		verbose = self.config.getIntConfig("train.verbose")[0]
		warmStart = self.config.getBooleanConfig("train.warm.start")[0]

		#min_impurity_split was deprecated and then removed from scikit-learn, so it is not passed
		model = RandomForestClassifier(n_estimators=numTrees, criterion=splitCriterion, max_depth=maxDepth, \
		min_samples_split=minSamplesSplit, min_samples_leaf=minSamplesLeaf, min_weight_fraction_leaf=minWeightFractionLeaf, \
		max_features=maxFeatures, max_leaf_nodes=maxLeafNodes, min_impurity_decrease=minImpurityDecrease, \
		bootstrap=bootstrap, oob_score=oobScore, n_jobs=numJobs, random_state=randomState, \
		verbose=verbose, warm_start=warmStart, class_weight=None)
		self.classifier = model
		return self.classifier

	#predict probability with in memory data
	def predictProb(self, recs):
		# create model
		self.prepModel()

		#input record
		if type(recs) is str:
			featData = self.prepStringPredictData(recs)
		else:
			featData = recs
		if (featData.ndim == 1):
			featData = featData.reshape(1, -1)

		#predict
		self.logger.info("...predicting class probability")
		clsData = self.classifier.predict_proba(featData)
		return clsData

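Since predictProb accepts either a comma separated record string or an in-memory numpy array, scoring a record after training could look like the sketch below. The file name rf.properties, the feature values, and the import path are illustrative assumptions; train and the rest of the lifecycle come from BaseClassifier in bacl.py.

import numpy as np
from rf import RandomForest   # assumed import path for supv/rf.py

clf = RandomForest("rf.properties")   # hypothetical config file
clf.train()                           # train is inherited from BaseClassifier

# a single feature vector; predictProb reshapes 1D input to 2D
rec = np.array([0.5, 1.2, 0.7])       # illustrative values, must match the trained feature count
probs = clf.predictProb(rec)
print(probs)                          # per class probabilities, shape (1, num_classes)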
supv/svm.py
ADDED
@@ -0,0 +1,141 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import sklearn.linear_model
import sklearn.svm
import matplotlib
import random
import jprops
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *
from bacl import *

# support vector machine classification
class SupportVectorMachine(BaseClassifier):

	def __init__(self, configFile):
		defValues = {}
		defValues["common.mode"] = ("train", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.scale.file.path"] = (None, "missing scale file path")
		defValues["common.preprocessing"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.class.field"] = (None, "missing class field ordinal")
		defValues["train.validation"] = ("kfold", None)
		defValues["train.num.folds"] = (5, None)
		defValues["train.algorithm"] = ("svc", None)
		defValues["train.kernel.function"] = ("rbf", None)
		defValues["train.poly.degree"] = (3, None)
		defValues["train.penalty"] = (1.0, None)
		defValues["train.gamma"] = ("scale", None)
		defValues["train.penalty.norm"] = ("l2", None)
		defValues["train.loss"] = ("squared_hinge", None)
		defValues["train.dual"] = (True, None)
		defValues["train.shrinking"] = (True, None)
		defValues["train.nu"] = (0.5, None)
		defValues["train.predict.probability"] = (False, None)
		defValues["train.print.sup.vectors"] = (False, None)
		defValues["train.success.criterion"] = ("error", None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["train.search.param.strategy"] = (None, None)
		defValues["train.search.params"] = (None, None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.data.fields"] = (None, "missing data field ordinals")
		defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
		defValues["predict.use.saved.model"] = (False, None)
		defValues["validate.data.file"] = (None, "missing validation data file")
		defValues["validate.data.fields"] = (None, "missing validation data field ordinals")
		defValues["validate.data.feature.fields"] = (None, "missing validation data feature field ordinals")
		defValues["validate.data.class.field"] = (None, "missing class field ordinal")
		defValues["validate.use.saved.model"] = (False, None)
		defValues["validate.score.method"] = ("accuracy", None)

		super(SupportVectorMachine, self).__init__(configFile, defValues, __name__)

	# builds model object
	def buildModel(self):
		self.logger.info("...building svm model")
		algo = self.config.getStringConfig("train.algorithm")[0]
		kernelFun = self.config.getStringConfig("train.kernel.function")[0]
		penalty = self.config.getFloatConfig("train.penalty")[0]
		polyDegree = self.config.getIntConfig("train.poly.degree")[0]
		kernelCoeff = self.config.getStringConfig("train.gamma")[0]
		kernelCoeff = typedValue(kernelCoeff)
		penaltyNorm = self.config.getStringConfig("train.penalty.norm")[0]
		trainLoss = self.config.getStringConfig("train.loss")[0]
		dualOpt = self.config.getBooleanConfig("train.dual")[0]
		shrinkHeuristic = self.config.getBooleanConfig("train.shrinking")[0]
		predictProb = self.config.getBooleanConfig("train.predict.probability")[0]
		supVecBound = self.config.getFloatConfig("train.nu")[0]

		if (algo == "svc"):
			if kernelFun == "poly":
				model = sk.svm.SVC(C=penalty, kernel=kernelFun, degree=polyDegree, gamma=kernelCoeff, shrinking=shrinkHeuristic, \
				probability=predictProb)
			elif kernelFun == "rbf" or kernelFun == "sigmoid":
				model = sk.svm.SVC(C=penalty, kernel=kernelFun, gamma=kernelCoeff, shrinking=shrinkHeuristic, probability=predictProb)
			else:
				model = sk.svm.SVC(C=penalty, kernel=kernelFun, shrinking=shrinkHeuristic, probability=predictProb)
		elif (algo == "nusvc"):
			if kernelFun == "poly":
				model = sk.svm.NuSVC(nu=supVecBound, kernel=kernelFun, degree=polyDegree, gamma=kernelCoeff, shrinking=shrinkHeuristic, \
				probability=predictProb)
			elif kernelFun == "rbf" or kernelFun == "sigmoid":
				model = sk.svm.NuSVC(nu=supVecBound, kernel=kernelFun, gamma=kernelCoeff, shrinking=shrinkHeuristic, probability=predictProb)
			else:
				model = sk.svm.NuSVC(nu=supVecBound, kernel=kernelFun, shrinking=shrinkHeuristic, probability=predictProb)
		elif (algo == "linearsvc"):
			model = sk.svm.LinearSVC(penalty=penaltyNorm, loss=trainLoss, dual=dualOpt)
		else:
			self.logger.info("invalid svm algorithm")
			sys.exit()
		self.classifier = model
		return self.classifier

	#predict probability with in memory data
	def predictProb(self, recs):
		# create model
		self.prepModel()

		#input record
		if type(recs) is str:
			featData = self.prepStringPredictData(recs)
		else:
			featData = recs
		if (featData.ndim == 1):
			featData = featData.reshape(1, -1)

		#predict
		self.logger.info("...predicting class probability")
		clsData = self.classifier.predict_proba(featData)
		return clsData

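buildModel reads train.gamma as a string and runs it through typedValue, so the same key can hold either the literal "scale"/"auto" or a numeric kernel coefficient. Below is a sketch of that dispatch with a simplified stand-in for the repo's typedValue helper from mlutil (the stand-in's int-then-float-then-string order is an assumption about that helper):

import sklearn.svm

def typed_value(v):
	"""simplified stand-in for mlutil typedValue: int, else float, else string"""
	try:
		return int(v)
	except ValueError:
		try:
			return float(v)
		except ValueError:
			return v

gamma = typed_value("0.1")      # -> 0.1 as a float
model = sklearn.svm.SVC(C=1.0, kernel="rbf", gamma=gamma)
gamma = typed_value("scale")    # -> "scale" passed through as a string
model = sklearn.svm.SVC(C=1.0, kernel="rbf", gamma=gamma)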
supv/svml.py
ADDED
@@ -0,0 +1,428 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import sklearn.linear_model
import sklearn.svm
import sklearn.preprocessing
import matplotlib
import random
import jprops
import joblib
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from random import randint

if len(sys.argv) < 2:
	print("usage: ./svml.py <config_properties_file>")
	sys.exit()

#train by bagging
def train_bagging():
	model = build_model()
	bagging_model = BaggingClassifier(base_estimator=model,n_estimators=bagging_num_estimator,
	max_samples=bagging_sample_fraction,oob_score=bagging_use_oob)

	#train model
	bagging_model.fit(XC, yc)

	#persist models
	if persist_model:
		models = bagging_model.estimators_
		for m in zip(range(0, len(models)), models):
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(m[0] + 1) + ".mod"
			joblib.dump(m[1], model_file)

	score = bagging_model.score(XC, yc)
	print("average error %.3f" %(1.0 - score))

#linear k fold validation
def train_kfold_validation(nfold):
	if native_kfold_validation:
		print("native linear kfold validation")
		model = build_model()
		scores = cross_val_score(model, XC, yc, cv=nfold)
		av_score = np.mean(scores)
		print("average error %.3f" %(1.0 - av_score))
	else:
		print("extended linear kfold validation")
		train_kfold_validation_ext(nfold)

#linear k fold validation with detailed error breakdown
def train_kfold_validation_ext(nfold):
	model = build_model()

	offset = 0
	length = dsize // nfold
	errors = []
	fp_errors = []
	fn_errors = []
	for i in range(0, nfold):
		print("....Next fold %d" %(i))

		#split data
		(XV,yv,X,y) = split_data(offset, length)
		dvsize = len(XV)

		#train model
		model.fit(X, y)

		#persist model
		if persist_model:
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
			joblib.dump(model, model_file)

		#print support vectors
		print_support_vectors(model)

		#predict
		print("making predictions...")
		yp = model.predict(XV)

		#show prediction output
		(er, fp_er, fn_er) = validate(dvsize,yv,yp)
		errors.append(er)
		fp_errors.append(fp_er)
		fn_errors.append(fn_er)

		offset += length

	#average error
	av_error = np.mean(errors)
	av_fp_error = np.mean(fp_errors)
	av_fn_error = np.mean(fn_errors)
	print("average error %.3f false positive error %.3f false negative error %.3f" %(av_error, av_fp_error, av_fn_error))

# random k fold validation
def train_rfold_validation(nfold, niter):
	if native_rfold_validation:
		print("native random kfold validation")
		train_fraction = 1.0 / nfold
		scores = []
		for i in range(0,niter):
			state = randint(1,100)
			X, XV, y, yv = train_test_split(XC, yc, test_size=train_fraction, random_state=state)
			model = build_model()
			model.fit(X,y)
			scores.append(model.score(XV, yv))

		print(scores)
		av_score = np.mean(scores)
		print("average error %.3f" %(1.0 - av_score))
	else:
		print("extended random kfold validation")
		train_rfold_validation_ext(nfold, niter)

# random k fold validation with detailed error breakdown
def train_rfold_validation_ext(nfold, niter):
	max_offset_frac = 1.0 - 1.0 / nfold
	max_offset_frac -= .01
	length = dsize // nfold

	errors = []
	fp_errors = []
	fn_errors = []
	for i in range(0,niter):
		print("...Next iteration %d" %(i))
		offset = int(dsize * random.random() * max_offset_frac)
		print("offset: %d length: %d" %(offset, length))
		(XV,yv,X,y) = split_data(offset, length)
		dvsize = len(XV)

		#build model
		model = build_model()

		#train model
		model.fit(X, y)

		#persist model
		if persist_model:
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
			print("saving model file " + model_file)
			joblib.dump(model, model_file)

		#print support vectors
		print_support_vectors(model)

		#predict
		print("making predictions...")
		yp = model.predict(XV)

		#show prediction output
		(er, fp_er, fn_er) = validate(dvsize,yv,yp)
		errors.append(er)
		fp_errors.append(fp_er)
		fn_errors.append(fn_er)

	av_error = np.mean(errors)
	av_fp_error = np.mean(fp_errors)
	av_fn_error = np.mean(fn_errors)
	print("average error %.3f false positive error %.3f false negative error %.3f" %(av_error, av_fp_error, av_fn_error))

# make predictions by majority vote over the persisted ensemble
def predict():
	psize = len(X)
	class_counts = []

	#all models
	for i in range(0, num_models):
		model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
		print("loading model file " + model_file)
		model = joblib.load(model_file)

		yp = model.predict(X)
		if i == 0:
			#initialize class counts
			for y in yp:
				class_count = {}
				if y == 0:
					class_count[0] = 1
					class_count[1] = 0
				else:
					class_count[1] = 1
					class_count[0] = 0
				class_counts.append(class_count)

		else:
			#increment class count
			for j in range(0, psize):
				class_count = class_counts[j]
				y = yp[j]
				class_count[y] += 1

	# predict based on majority vote
	print("here are the predictions")
	for k in range(0, psize):
		class_count = class_counts[k]
		if (class_count[0] > class_count[1]):
			y = 0
			majority = class_count[0]
		else:
			y = 1
			majority = class_count[1]

		print(X[k])
		print("prediction %d majority count %d" %(y, majority))

#builds model
def build_model():
	#build model
	print("building model...")
	if algo == "svc":
		if kernel_fun == "poly":
			model = sk.svm.SVC(C=penalty,kernel=kernel_fun,degree=poly_degree,gamma=kernel_coeff)
		elif kernel_fun == "rbf" or kernel_fun == "sigmoid":
			model = sk.svm.SVC(C=penalty,kernel=kernel_fun,gamma=kernel_coeff)
		else:
			model = sk.svm.SVC(C=penalty,kernel=kernel_fun)
	elif algo == "nusvc":
		if kernel_fun == "poly":
			model = sk.svm.NuSVC(kernel=kernel_fun,degree=poly_degree,gamma=kernel_coeff)
		elif kernel_fun == "rbf" or kernel_fun == "sigmoid":
			model = sk.svm.NuSVC(kernel=kernel_fun,gamma=kernel_coeff)
		else:
			model = sk.svm.NuSVC(kernel=kernel_fun)
	elif algo == "linearsvc":
		model = sk.svm.LinearSVC()
	else:
		print("invalid svm algorithm")
		sys.exit()
	return model

#splits data into training and validation sets
def split_data(offset, length):
	print("splitting data...")
	#copy data
	XC_c = np.copy(XC)
	yc_c = list(yc)

	# validation set
	vlo = offset
	vup = vlo + length
	if (vup > len(yc)):
		vup = len(yc)
	XV = XC_c[vlo:vup:1]
	yv = yc_c[vlo:vup:1]
	dvsize = len(XV)
	print("data size %d validation data size %d" %(dsize, dvsize))
	#print("validation set")
	#print(XV)
	#print(yv)

	#training set
	X = np.delete(XC_c, np.s_[vlo:vup:1], 0)
	y = np.delete(yc_c, np.s_[vlo:vup:1], 0)
	#print("training set")
	#print(X)
	#print(y)
	return (XV,yv,X,y)

#print support vectors
def print_support_vectors(model):
	if (not algo == "linearsvc"):
		if print_sup_vectors:
			print("showing support vectors...")
			print(model.support_vectors_)
			print("num of support vectors")
			print(model.n_support_)

#prints prediction output
def validate(dvsize,yv,yp):
	print("showing predictions...")
	err_count = 0
	tp = 0
	tn = 0
	fp = 0
	fn = 0
	for r in range(0,dvsize):
		#print("actual: %d predicted: %d" %(yv[r], yp[r]))
		if (not yv[r] == yp[r]):
			err_count += 1

		if (yp[r] == 1 and yv[r] == 1):
			tp += 1
		elif (yp[r] == 1 and yv[r] == 0):
			fp += 1
		elif (yp[r] == 0 and yv[r] == 0):
			tn += 1
		else:
			fn += 1

	er = float(err_count) / dvsize
	fp_er = float(fp) / dvsize
	fn_er = float(fn) / dvsize
	print("error %.3f" %(er))
	print("true positive : %.3f" %(float(tp) / dvsize))
	print("false positive: %.3f" %(fp_er))
	print("true negative : %.3f" %(float(tn) / dvsize))
	print("false negative: %.3f" %(fn_er))

	return (er, fp_er, fn_er)

# load configuration
def getConfigs(configFile):
	configs = {}
	print("using following configurations")
	with open(configFile) as fp:
		for key, value in jprops.iter_properties(fp):
			print(key, value)
			configs[key] = value

	return configs


# load configuration
configs = getConfigs(sys.argv[1])
mode = configs["common.mode"]

if mode == "train":
	#train
	print("running in train mode")
	data_file = configs["train.data.file"]
	feat_field_indices = configs["train.data.feature.fields"].split(",")
	feat_field_indices = [int(a) for a in feat_field_indices]
	class_field_index = int(configs["train.data.class.field"])
	preprocess = configs["common.preprocessing"]
	validation = configs["train.validation"]
	num_folds = int(configs["train.num.folds"])
	num_iter = int(configs["train.num.iter"])
	algo = configs["train.algorithm"]
	kernel_fun = configs["train.kernel.function"]
	poly_degree = int(configs["train.poly.degree"])
	penalty = float(configs["train.penalty"])
	if penalty < 0:
		penalty = 1.0
		print("using default for penalty")
	kernel_coeff = float(configs["train.gamma"])
	if kernel_coeff < 0:
		kernel_coeff = 'auto'
		print("using default for gamma")
	print_sup_vectors = configs["train.print.sup.vectors"].lower() == "true"
	persist_model = configs["train.persist.model"].lower() == "true"
	model_file_directory = configs["common.model.directory"]
	model_file_prefix = configs["common.model.file.prefix"]

	print(feat_field_indices)

	#extract feature fields
	d = np.loadtxt(data_file, delimiter=',')
	dsize = len(d)
	XC = d[:,feat_field_indices]

	#preprocess features
	if (preprocess == "scale"):
		XC = sk.preprocessing.scale(XC)
	elif (preprocess == "normalize"):
		XC = sk.preprocessing.normalize(XC, norm='l2')
	else:
		print("no preprocessing done")

	#extract output field
	yc = d[:,[class_field_index]]
	yc = yc.reshape(dsize)
	yc = [int(a) for a in yc]

	# train model
	if validation == "kfold":
		native_kfold_validation = configs["train.native.kfold.validation"].lower() == "true"
		train_kfold_validation(num_folds)
	elif validation == "rfold":
		native_rfold_validation = configs["train.native.rfold.validation"].lower() == "true"
		train_rfold_validation(num_folds,num_iter)
	elif validation == "bagging":
		bagging_num_estimator = int(configs["train.bagging.num.estimators"])
		bagging_sample_fraction = float(configs["train.bagging.sample.fraction"])
		bagging_use_oob = configs["train.bagging.use.oob"].lower() == "true"
		train_bagging()
	else:
		print("invalid training validation method")
		sys.exit()

else:
	#predict
	print("running in prediction mode")
	pred_data_file = configs["pred.data.file"]
	pred_feat_field_indices = configs["pred.data.feature.fields"].split(",")
	pred_feat_field_indices = [int(a) for a in pred_feat_field_indices]
	preprocess = configs["common.preprocessing"]
	num_models = int(configs["pred.num.models"])
	model_file_directory = configs["common.model.directory"]
	model_file_prefix = configs["common.model.file.prefix"]

	#extract feature fields
	pd = np.loadtxt(pred_data_file, delimiter=',')
	pdsize = len(pd)
	X = pd[:,pred_feat_field_indices]

	#preprocess features
	if (preprocess == "scale"):
		X = sk.preprocessing.scale(X)
	elif (preprocess == "normalize"):
		X = sk.preprocessing.normalize(X, norm='l2')
	else:
		print("no preprocessing done")

	predict()
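The predict function above implements majority voting over the persisted per-fold models. The same idea in compact numpy form, as a sketch (binary 0/1 labels assumed, as in the script; ties go to class 1, matching the script's else branch):

import numpy as np

def majority_vote(models, X):
	"""majority vote over an ensemble of binary 0/1 classifiers"""
	votes = np.array([m.predict(X) for m in models])   # shape (num_models, num_records)
	ones = votes.sum(axis=0)                           # votes for class 1 per record
	return (2 * ones >= len(models)).astype(int)       # ties resolved to class 1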
supv/tnn.py
ADDED
@@ -0,0 +1,789 @@
1 |
+
#!/usr/local/bin/python3
|
2 |
+
|
3 |
+
# avenir-python: Machine Learning
|
4 |
+
# Author: Pranab Ghosh
|
5 |
+
#
|
6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7 |
+
# may not use this file except in compliance with the License. You may
|
8 |
+
# obtain a copy of the License at
|
9 |
+
#
|
10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11 |
+
#
|
12 |
+
# Unless required by applicable law or agreed to in writing, software
|
13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
15 |
+
# implied. See the License for the specific language governing
|
16 |
+
# permissions and limitations under the License.
|
17 |
+
|
18 |
+
# Package imports
|
19 |
+
import os
|
20 |
+
import sys
|
21 |
+
import matplotlib.pyplot as plt
|
22 |
+
import numpy as np
|
23 |
+
import torch
|
24 |
+
from torch.autograd import Variable
|
25 |
+
from torch.utils.data import Dataset, TensorDataset
|
26 |
+
from torch.utils.data import DataLoader
|
27 |
+
import sklearn as sk
|
28 |
+
from sklearn.neighbors import KDTree
|
29 |
+
import matplotlib
|
30 |
+
import random
|
31 |
+
import jprops
|
32 |
+
from random import randint
|
33 |
+
import statistics
|
34 |
+
sys.path.append(os.path.abspath("../lib"))
|
35 |
+
from util import *
|
36 |
+
from mlutil import *
|
37 |
+
|
38 |
+
"""
|
39 |
+
forward hook function
|
40 |
+
"""
|
41 |
+
intermedOut = {}
|
42 |
+
lvalues = list()
|
43 |
+
|
44 |
+
def hookFn(m, i, o):
|
45 |
+
"""
|
46 |
+
call back for latent values
|
47 |
+
"""
|
48 |
+
#intermedOut[m] = o
|
49 |
+
lv = o.data.cpu().numpy()
|
50 |
+
lv = lv[0].tolist()
|
51 |
+
lvalues.append(lv)
|
52 |
+
#print(lv)
|
53 |
+
|
54 |
+
def getLatValues():
|
55 |
+
"""
|
56 |
+
"""
|
57 |
+
return lvalues
|
58 |
+
|
59 |
+
class FeedForwardNetwork(torch.nn.Module):
|
60 |
+
def __init__(self, configFile, addDefValues=None):
|
61 |
+
"""
|
62 |
+
In the constructor we instantiate two nn.Linear modules and assign them as
|
63 |
+
member variables.
|
64 |
+
|
65 |
+
Parameters
|
66 |
+
configFile : config file path
|
67 |
+
addDefValues : dictionary of additional default values
|
68 |
+
"""
|
69 |
+
defValues = dict() if addDefValues is None else addDefValues.copy()
|
70 |
+
defValues["common.mode"] = ("training", None)
|
71 |
+
defValues["common.model.directory"] = ("model", None)
|
72 |
+
defValues["common.model.file"] = (None, None)
|
73 |
+
defValues["common.preprocessing"] = (None, None)
|
74 |
+
defValues["common.scaling.method"] = ("zscale", None)
|
75 |
+
defValues["common.scaling.minrows"] = (50, None)
|
76 |
+
defValues["common.scaling.param.file"] = (None, None)
|
77 |
+
defValues["common.verbose"] = (False, None)
|
78 |
+
defValues["common.device"] = ("cpu", None)
|
79 |
+
defValues["train.data.file"] = (None, "missing training data file")
|
80 |
+
defValues["train.data.fields"] = (None, "missing training data field ordinals")
|
81 |
+
defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
|
82 |
+
defValues["train.data.out.fields"] = (None, "missing training data feature field ordinals")
|
83 |
+
defValues["train.layer.data"] = (None, "missing layer data")
|
84 |
+
defValues["train.input.size"] = (None, None)
|
85 |
+
defValues["train.output.size"] = (None, "missing output size")
|
86 |
+
defValues["train.batch.size"] = (10, None)
|
87 |
+
defValues["train.loss.reduction"] = ("mean", None)
|
88 |
+
defValues["train.num.iterations"] = (500, None)
|
89 |
+
defValues["train.lossFn"] = ("mse", None)
|
90 |
+
defValues["train.optimizer"] = ("sgd", None)
|
91 |
+
defValues["train.opt.learning.rate"] = (.0001, None)
|
92 |
+
defValues["train.opt.weight.decay"] = (0, None)
|
93 |
+
defValues["train.opt.momentum"] = (0, None)
|
94 |
+
defValues["train.opt.eps"] = (1e-08, None)
|
95 |
+
defValues["train.opt.dampening"] = (0, None)
|
96 |
+
defValues["train.opt.momentum.nesterov"] = (False, None)
|
97 |
+
defValues["train.opt.betas"] = ([0.9, 0.999], None)
|
98 |
+
defValues["train.opt.alpha"] = (0.99, None)
|
99 |
+
defValues["train.save.model"] = (False, None)
|
100 |
+
defValues["train.track.error"] = (False, None)
|
101 |
+
defValues["train.epoch.intv"] = (5, None)
|
102 |
+
defValues["train.batch.intv"] = (5, None)
|
103 |
+
defValues["train.print.weights"] = (False, None)
|
104 |
+
defValues["valid.data.file"] = (None, None)
|
105 |
+
defValues["valid.accuracy.metric"] = (None, None)
|
106 |
+
defValues["predict.data.file"] = (None, None)
|
107 |
+
defValues["predict.use.saved.model"] = (True, None)
|
108 |
+
defValues["predict.output"] = ("binary", None)
|
109 |
+
defValues["predict.feat.pad.size"] = (60, None)
|
110 |
+
defValues["predict.print.output"] = (True, None)
|
111 |
+
defValues["calibrate.num.bins"] = (10, None)
|
112 |
+
defValues["calibrate.pred.prob.thresh"] = (0.5, None)
|
113 |
+
defValues["calibrate.num.nearest.neighbors"] = (10, None)
|
114 |
+
self.config = Configuration(configFile, defValues)
|
115 |
+
|
116 |
+
super(FeedForwardNetwork, self).__init__()
|
117 |
+
|
118 |
+
def setConfigParam(self, name, value):
|
119 |
+
"""
|
120 |
+
set config param
|
121 |
+
|
122 |
+
Parameters
|
123 |
+
name : config name
|
124 |
+
value : config value
|
125 |
+
"""
|
126 |
+
self.config.setParam(name, value)
|
127 |
+
|
128 |
+
def getConfig(self):
|
129 |
+
"""
|
130 |
+
get config object
|
131 |
+
"""
|
132 |
+
return self.config
|
133 |
+
|
134 |
+
def setVerbose(self, verbose):
|
135 |
+
self.verbose = verbose
|
136 |
+
|
137 |
+
def buildModel(self):
|
138 |
+
"""
|
139 |
+
Loads configuration and builds the various piecess necessary for the model
|
140 |
+
"""
|
141 |
+
torch.manual_seed(9999)
|
142 |
+
|
143 |
+
self.verbose = self.config.getBooleanConfig("common.verbose")[0]
|
144 |
+
numinp = self.config.getIntConfig("train.input.size")[0]
|
145 |
+
if numinp is None:
|
146 |
+
numinp = len(self.config.getIntListConfig("train.data.feature.fields")[0])
|
147 |
+
#numOut = len(self.config.getStringConfig("train.data.out.fields")[0].split(","))
|
148 |
+
self.outputSize = self.config.getIntConfig("train.output.size")[0]
|
149 |
+
self.batchSize = self.config.getIntConfig("train.batch.size")[0]
|
150 |
+
#lossRed = self.config.getStringConfig("train.loss.reduction")[0]
|
151 |
+
#learnRate = self.config.getFloatConfig("train.opt.learning.rate")[0]
|
152 |
+
self.numIter = self.config.getIntConfig("train.num.iterations")[0]
|
153 |
+
optimizer = self.config.getStringConfig("train.optimizer")[0]
|
154 |
+
self.lossFnStr = self.config.getStringConfig("train.lossFn")[0]
|
155 |
+
self.accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]
|
156 |
+
self.trackErr = self.config.getBooleanConfig("train.track.error")[0]
|
157 |
+
self.batchIntv = self.config.getIntConfig("train.batch.intv")[0]
|
158 |
+
self.restored = False
|
159 |
+
self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None
|
160 |
+
|
161 |
+
#build network
|
162 |
+
layers = list()
|
163 |
+
ninp = numinp
|
164 |
+
trData = self.config.getStringConfig("train.layer.data")[0].split(",")
|
165 |
+
for ld in trData:
|
166 |
+
lde = ld.split(":")
|
167 |
+
assert len(lde) == 5, "expecting 5 items for layer data"
|
168 |
+
|
169 |
+
#num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction
|
170 |
+
nunit = int(lde[0])
|
171 |
+
actStr = lde[1]
|
172 |
+
act = FeedForwardNetwork.createActivation(actStr) if actStr != "none" else None
|
173 |
+
bnorm = lde[2] == "true"
|
174 |
+
afterAct = lde[3] == "true"
|
175 |
+
dpr = float(lde[4])
|
176 |
+
|
177 |
+
layers.append(torch.nn.Linear(ninp, nunit))
|
178 |
+
if bnorm:
|
179 |
+
#with batch norm
|
180 |
+
if afterAct:
|
181 |
+
safeAppend(layers, act)
|
182 |
+
layers.append(torch.nn.BatchNorm1d(nunit))
|
183 |
+
else:
|
184 |
+
layers.append(torch.nn.BatchNorm1d(nunit))
|
185 |
+
safeAppend(layers, act)
|
186 |
+
else:
|
187 |
+
#without batch norm
|
188 |
+
safeAppend(layers, act)
|
189 |
+
|
190 |
+
if dpr > 0:
|
191 |
+
layers.append(torch.nn.Dropout(dpr))
|
192 |
+
ninp = nunit
|
193 |
+
|
194 |
+
self.layers = torch.nn.Sequential(*layers)
|
195 |
+
|
196 |
+
self.device = FeedForwardNetwork.getDevice(self)
|
197 |
+
|
198 |
+
#training data
|
199 |
+
dataFile = self.config.getStringConfig("train.data.file")[0]
|
200 |
+
(featData, outData) = FeedForwardNetwork.prepData(self, dataFile)
|
201 |
+
self.featData = torch.from_numpy(featData)
|
202 |
+
self.outData = torch.from_numpy(outData)
|
203 |
+
|
204 |
+
#validation data
|
205 |
+
dataFile = self.config.getStringConfig("valid.data.file")[0]
|
206 |
+
(featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataFile)
|
207 |
+
self.validFeatData = torch.from_numpy(featDataV)
|
208 |
+
self.validOutData = torch.from_numpy(outDataV)
|
209 |
+
|
210 |
+
# loss function and optimizer
|
211 |
+
self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)
|
212 |
+
self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer)
|
213 |
+
|
214 |
+
self.yPred = None
|
215 |
+
self.restored = False
|
216 |
+
|
217 |
+
#mode to device
|
218 |
+
self.device = FeedForwardNetwork.getDevice(self)
|
219 |
+
self.featData = self.featData.to(self.device)
|
220 |
+
self.outData = self.outData.to(self.device)
|
221 |
+
self.validFeatData = self.validFeatData.to(self.device)
|
222 |
+
self.to(self.device)
|
223 |
+
|
224 |
+
@staticmethod
|
225 |
+
def getDevice(model):
|
226 |
+
"""
|
227 |
+
gets device
|
228 |
+
|
229 |
+
Parameters
|
230 |
+
model : torch model
|
231 |
+
"""
|
232 |
+
devType = model.config.getStringConfig("common.device")[0]
|
233 |
+
if devType == "cuda":
|
234 |
+
if torch.cuda.is_available():
|
235 |
+
device = torch.device("cuda")
|
236 |
+
else:
|
237 |
+
exitWithMsg("cuda not available")
|
238 |
+
else:
|
239 |
+
device = torch.device("cpu")
|
240 |
+
return device
|
241 |
+
|
242 |
+
def setValidationData(self, dataSource, prep=True):
|
243 |
+
"""
|
244 |
+
sets validation data
|
245 |
+
|
246 |
+
Parameters
|
247 |
+
dataSource : data source str if file path or 2D array
|
248 |
+
prep : if True load and prepare
|
249 |
+
"""
|
250 |
+
if prep:
|
251 |
+
(featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataSource)
|
252 |
+
self.validFeatData = torch.from_numpy(featDataV)
|
253 |
+
self.validOutData = outDataV
|
254 |
+
else:
|
255 |
+
self.validFeatData = torch.from_numpy(dataSource[0])
|
256 |
+
self.validOutData = dataSource[1]
|
257 |
+
|
258 |
+
self.validFeatData = self.validFeatData.to(self.device)
|
259 |
+
|
260 |
+
@staticmethod
|
261 |
+
def createActivation(actName):
|
262 |
+
"""
|
263 |
+
create activation
|
264 |
+
|
265 |
+
Parameters
|
266 |
+
actName : activation name
|
267 |
+
"""
|
268 |
+
if actName is None:
|
269 |
+
activation = None
|
270 |
+
elif actName == "relu":
|
271 |
+
activation = torch.nn.ReLU()
|
272 |
+
elif actName == "tanh":
|
273 |
+
activation = torch.nn.Tanh()
|
274 |
+
elif actName == "sigmoid":
|
275 |
+
activation = torch.nn.Sigmoid()
|
276 |
+
elif actName == "softmax":
|
277 |
+
activation = torch.nn.Softmax(dim=1)
|
278 |
+
else:
|
279 |
+
exitWithMsg("invalid activation function name " + actName)
|
280 |
+
return activation
|
281 |
+
|
282 |
+
@staticmethod
|
283 |
+
def createLossFunction(model, lossFnName):
|
284 |
+
"""
|
285 |
+
create loss function
|
286 |
+
|
287 |
+
Parameters
|
288 |
+
lossFnName : loss function name
|
289 |
+
"""
|
290 |
+
config = model.config
|
291 |
+
lossRed = config.getStringConfig("train.loss.reduction")[0]
|
292 |
+
if lossFnName == "ltwo" or lossFnName == "mse":
|
293 |
+
lossFunc = torch.nn.MSELoss(reduction=lossRed)
|
294 |
+
elif lossFnName == "ce":
|
295 |
+
lossFunc = torch.nn.CrossEntropyLoss(reduction=lossRed)
|
296 |
+
elif lossFnName == "lone" or lossFnName == "mae":
|
297 |
+
lossFunc = torch.nn.L1Loss(reduction=lossRed)
|
298 |
+
elif lossFnName == "bce":
|
299 |
+
lossFunc = torch.nn.BCELoss(reduction=lossRed)
|
300 |
+
elif lossFnName == "bcel":
|
301 |
+
lossFunc = torch.nn.BCEWithLogitsLoss(reduction=lossRed)
|
302 |
+
elif lossFnName == "sm":
|
303 |
+
lossFunc = torch.nn.SoftMarginLoss(reduction=lossRed)
|
304 |
+
elif lossFnName == "mlsm":
|
305 |
+
lossFunc = torch.nn.MultiLabelSoftMarginLoss(reduction=lossRed)
|
306 |
+
else:
|
307 |
+
exitWithMsg("invalid loss function name " + lossFnName)
|
308 |
+
return lossFunc
|
309 |
+
|
310 |
+
    @staticmethod
    def createOptimizer(model, optName):
        """
        create optimizer

        Parameters
            model : torch model
            optName : optimizer name
        """
        config = model.config
        learnRate = config.getFloatConfig("train.opt.learning.rate")[0]
        weightDecay = config.getFloatConfig("train.opt.weight.decay")[0]
        momentum = config.getFloatConfig("train.opt.momentum")[0]
        eps = config.getFloatConfig("train.opt.eps")[0]
        if optName == "sgd":
            dampening = config.getFloatConfig("train.opt.dampening")[0]
            momentumNesterov = config.getBooleanConfig("train.opt.momentum.nesterov")[0]
            optimizer = torch.optim.SGD(model.parameters(), lr=learnRate, momentum=momentum,
                dampening=dampening, weight_decay=weightDecay, nesterov=momentumNesterov)
        elif optName == "adam":
            betas = config.getFloatListConfig("train.opt.betas")[0]
            betas = (betas[0], betas[1])
            optimizer = torch.optim.Adam(model.parameters(), lr=learnRate, betas=betas, eps=eps,
                weight_decay=weightDecay)
        elif optName == "rmsprop":
            alpha = config.getFloatConfig("train.opt.alpha")[0]
            optimizer = torch.optim.RMSprop(model.parameters(), lr=learnRate, alpha=alpha,
                eps=eps, weight_decay=weightDecay, momentum=momentum)
        else:
            exitWithMsg("invalid optimizer name " + optName)
        return optimizer

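    # Illustrative sketch: the optimizer is also config driven, so "adam" assumes
    # train.opt.learning.rate, train.opt.weight.decay, train.opt.eps and
    # train.opt.betas are all set in the config:
    #
    #   model.optimizer = FeedForwardNetwork.createOptimizer(model, "adam")
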
    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary (differentiable) operations on Tensors.

        Parameters
            x : data batch
        """
        y = self.layers(x)
        return y

    @staticmethod
    def addForwardHook(model, l, cl=0):
        """
        registers a forward hook at the specified layer

        Parameters
            model : torch model
            l : layer index at which the hook is to be registered
            cl : current layer index, used for recursion
        """
        for name, layer in model._modules.items():
            # if it is a Sequential, don't register a hook on it
            # but recursively register a hook on all its module children
            print(str(cl) + " : " + name)
            if isinstance(layer, torch.nn.Sequential):
                FeedForwardNetwork.addForwardHook(layer, l, cl)
            else:
                # it's not a Sequential, register a hook
                if cl == l:
                    print("setting hook at layer " + str(l))
                    layer.register_forward_hook(hookFn)
                cl += 1

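    # Illustrative sketch: hookFn (assumed defined elsewhere in this module) must
    # follow the standard PyTorch forward hook signature; a minimal version could be:
    #
    #   def hookFn(module, input, output):
    #       print(module, output.shape)
    #
    #   FeedForwardNetwork.addForwardHook(model, 2)   # hook at the third non Sequential layer
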
    @staticmethod
    def prepData(model, dataSource, includeOutFld=True):
        """
        loads and prepares data

        Parameters
            model : torch model
            dataSource : data source, str if file path, or 2D array
            includeOutFld : True if the target field is to be included
        """
        # parameters
        fieldIndices = model.config.getIntListConfig("train.data.fields")[0]
        featFieldIndices = model.config.getIntListConfig("train.data.feature.fields")[0]

        # all data and feature data
        isDataFile = isinstance(dataSource, str)
        selFieldIndices = fieldIndices if includeOutFld else fieldIndices[:-1]
        if isDataFile:
            # source file path
            (data, featData) = loadDataFile(dataSource, ",", selFieldIndices, featFieldIndices)
        else:
            # tabular data
            data = tableSelFieldsFilter(dataSource, selFieldIndices)
            featData = tableSelFieldsFilter(data, featFieldIndices)
            featData = np.array(featData)

        if model.config.getStringConfig("common.preprocessing")[0] == "scale":
            scalingMethod = model.config.getStringConfig("common.scaling.method")[0]

            # scale only if there are enough rows
            nrow = featData.shape[0]
            minrows = model.config.getIntConfig("common.scaling.minrows")[0]
            if nrow > minrows:
                # in place scaling
                featData = scaleData(featData, scalingMethod)
            else:
                # use pre-computed scaling parameters
                spFile = model.config.getStringConfig("common.scaling.param.file")[0]
                if spFile is None:
                    exitWithMsg("for small data sets pre-computed scaling parameters need to be provided")
                scParams = restoreObject(spFile)
                featData = scaleDataWithParams(featData, scalingMethod, scParams)
                featData = np.array(featData)

        # target data
        if includeOutFld:
            outFieldIndices = model.config.getStringConfig("train.data.out.fields")[0]
            outFieldIndices = strToIntArray(outFieldIndices, ",")
            if isDataFile:
                outData = data[:, outFieldIndices]
            else:
                outData = tableSelFieldsFilter(data, outFieldIndices)
                outData = np.array(outData)
            foData = (featData.astype(np.float32), outData.astype(np.float32))
        else:
            foData = featData.astype(np.float32)
        return foData

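    # Illustrative sketch: prepData relies on these config keys (example values
    # hypothetical, for a 4 feature, 1 target data set):
    #
    #   train.data.fields = "0,1,2,3,4"          # all columns used
    #   train.data.feature.fields = "0,1,2,3"    # feature columns
    #   train.data.out.fields = "4"              # target column
    #   common.preprocessing = "scale"           # optional scaling
    #
    #   featData, outData = FeedForwardNetwork.prepData(model, "trainFile.csv")
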
    @staticmethod
    def saveCheckpt(model):
        """
        checkpoints model

        Parameters
            model : torch model
        """
        print("..saving model checkpoint")
        modelDirectory = model.config.getStringConfig("common.model.directory")[0]
        assert os.path.exists(modelDirectory), "model save directory does not exist"
        modelFile = model.config.getStringConfig("common.model.file")[0]
        filepath = os.path.join(modelDirectory, modelFile)
        state = {"state_dict": model.state_dict(), "optim_dict": model.optimizer.state_dict()}
        torch.save(state, filepath)
        if model.verbose:
            print("model saved")

    @staticmethod
    def restoreCheckpt(model, loadOpt=False):
        """
        restores checkpointed model

        Parameters
            model : torch model
            loadOpt : True if optimizer state is to be loaded
        """
        if not model.restored:
            print("..restoring model checkpoint")
            modelDirectory = model.config.getStringConfig("common.model.directory")[0]
            modelFile = model.config.getStringConfig("common.model.file")[0]
            filepath = os.path.join(modelDirectory, modelFile)
            assert os.path.exists(filepath), "model save file does not exist"
            checkpoint = torch.load(filepath)
            model.load_state_dict(checkpoint["state_dict"])
            model.to(model.device)
            if loadOpt:
                model.optimizer.load_state_dict(checkpoint["optim_dict"])
            model.restored = True

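    # Illustrative sketch: checkpointing writes model and optimizer state to
    # common.model.directory / common.model.file and restores from the same path:
    #
    #   FeedForwardNetwork.saveCheckpt(model)                     # after training
    #   FeedForwardNetwork.restoreCheckpt(model, loadOpt=True)    # to resume training
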
    @staticmethod
    def processClassifOutput(yPred, config):
        """
        extracts the probability of label 1, or the label with the highest probability

        Parameters
            yPred : predicted output
            config : config object
        """
        outType = config.getStringConfig("predict.output")[0]
        if outType == "prob":
            outputSize = config.getIntConfig("train.output.size")[0]
            if outputSize == 2:
                # return prob of positive class for binary classifier
                yPred = yPred[:, 1]
            else:
                # return class value and probability for multi class classifier
                yCl = np.argmax(yPred, axis=1)
                yPred = list(map(lambda y: y[0][y[1]], zip(yPred, yCl)))
                yPred = zip(yCl, yPred)
        else:
            yPred = np.argmax(yPred, axis=1)
        return yPred

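    # Illustrative worked example: for a 3 class problem with predict.output = "prob"
    # and a yPred row of [0.2, 0.7, 0.1], the result pairs class 1 with its
    # probability 0.7; with any other predict.output setting, only the argmax
    # class (1) is returned.
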
    @staticmethod
    def printPrediction(yPred, config, dataSource):
        """
        prints input feature data and prediction

        Parameters
            yPred : predicted output
            config : config object
            dataSource : data source, str if file path, or 2D array
        """
        padWidth = config.getIntConfig("predict.feat.pad.size")[0]
        i = 0
        if type(dataSource) == str:
            for rec in fileRecGen(dataSource, ","):
                feat = (",".join(rec)).ljust(padWidth, " ")
                rec = feat + "\t" + str(yPred[i])
                print(rec)
                i += 1
        else:
            for rec in dataSource:
                srec = toStrList(rec, 6)
                feat = (",".join(srec)).ljust(padWidth, " ")
                srec = feat + "\t" + str(yPred[i])
                print(srec)
                i += 1

    @staticmethod
    def allTrain(model):
        """
        trains with all the data

        Parameters
            model : torch model
        """
        # train mode
        model.train()
        for t in range(model.numIter):

            # forward pass: compute predicted y by passing x to the model
            yPred = model(model.featData)

            # compute and print loss
            loss = model.lossFn(yPred, model.outData)
            if model.verbose and t % 50 == 0:
                print("epoch {} loss {:.6f}".format(t, loss.item()))

            # zero gradients, perform a backward pass, and update the weights
            model.optimizer.zero_grad()
            loss.backward()
            model.optimizer.step()

        # validate
        model.eval()
        yPred = model(model.validFeatData)
        yPred = yPred.data.cpu().numpy()
        yActual = model.validOutData
        if model.verbose:
            result = np.concatenate((yPred, yActual), axis=1)
            print("predicted actual")
            print(result)

        score = perfMetric(model.accMetric, yActual, yPred)
        print(formatFloat(3, score, "perf score"))
        return score

    @staticmethod
    def batchTrain(model):
        """
        trains with batch data

        Parameters
            model : torch model
        """
        model.restored = False
        trainData = TensorDataset(model.featData, model.outData)
        trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)
        epochIntv = model.config.getIntConfig("train.epoch.intv")[0]

        # train mode
        model.train()

        if model.trackErr:
            trErr = list()
            vaErr = list()
        # epoch
        for t in range(model.numIter):
            # batch
            b = 0
            epochLoss = 0.0
            for xBatch, yBatch in trainDataLoader:

                # forward pass: compute predicted y by passing x to the model
                xBatch, yBatch = xBatch.to(model.device), yBatch.to(model.device)
                yPred = model(xBatch)

                # compute and print loss; the extra check on batchIntv guards
                # against modulo by zero when the batch interval is 0
                loss = model.lossFn(yPred, yBatch)
                if model.verbose and t % epochIntv == 0 and (model.batchIntv == 0 or b % model.batchIntv == 0):
                    print("epoch {} batch {} loss {:.6f}".format(t, b, loss.item()))

                # accumulate loss for epoch level error tracking (batch interval of 0)
                if model.trackErr and model.batchIntv == 0:
                    epochLoss += loss.item()

                # error tracking at batch level (positive batch interval)
                if model.trackErr and model.batchIntv > 0 and b % model.batchIntv == 0:
                    trErr.append(loss.item())
                    vloss = FeedForwardNetwork.evaluateModel(model)
                    vaErr.append(vloss)

                # zero gradients, perform a backward pass, and update the weights
                model.optimizer.zero_grad()
                loss.backward()
                model.optimizer.step()
                b += 1

            # error tracking at epoch level
            if model.trackErr and model.batchIntv == 0:
                epochLoss /= len(trainDataLoader)
                trErr.append(epochLoss)
                vloss = FeedForwardNetwork.evaluateModel(model)
                vaErr.append(vloss)

        # validate
        model.eval()
        yPred = model(model.validFeatData)
        yPred = yPred.data.cpu().numpy()
        yActual = model.validOutData
        if model.verbose:
            vsize = yPred.shape[0]
            print("\npredicted \t\t actual")
            for i in range(vsize):
                print(str(yPred[i]) + "\t" + str(yActual[i]))

        score = perfMetric(model.accMetric, yActual, yPred)
        print(yActual)
        print(yPred)
        print(formatFloat(3, score, "perf score"))

        # save
        modelSave = model.config.getBooleanConfig("train.model.save")[0]
        if modelSave:
            FeedForwardNetwork.saveCheckpt(model)

        if model.trackErr:
            FeedForwardNetwork.errorPlot(model, trErr, vaErr)

        if model.config.getBooleanConfig("train.print.weights")[0]:
            print("model weights")
            for param in model.parameters():
                print(param.data)
        return score

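    # Illustrative sketch: batchTrain drives the whole training loop; error
    # tracking is at epoch level when train.batch.intv is 0 and at batch level
    # when it is positive. The builder call below is an assumption, standing in
    # for whatever constructs the network elsewhere in this module:
    #
    #   model.buildModel()                            # assumed builder, defined elsewhere
    #   score = FeedForwardNetwork.batchTrain(model)  # returns the validation perf score
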
    @staticmethod
    def errorPlot(model, trErr, vaErr):
        """
        plots training and validation error

        Parameters
            model : torch model
            trErr : training error list
            vaErr : validation error list
        """
        x = np.arange(len(trErr))
        plt.plot(x, trErr, label="training error")
        plt.plot(x, vaErr, label="validation error")
        plt.xlabel("iteration")
        plt.ylabel("error")
        plt.legend(["training error", "validation error"], loc="upper left")
        plt.show()

    @staticmethod
    def modelPredict(model, dataSource=None):
        """
        predicts with the trained or restored model

        Parameters
            model : torch model
            dataSource : data source
        """
        # train or restore model
        useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
        if useSavedModel:
            FeedForwardNetwork.restoreCheckpt(model)
        else:
            FeedForwardNetwork.batchTrain(model)

        # predict
        if dataSource is None:
            dataSource = model.config.getStringConfig("predict.data.file")[0]
        featData = FeedForwardNetwork.prepData(model, dataSource, False)
        featData = torch.from_numpy(featData)
        featData = featData.to(model.device)

        model.eval()
        yPred = model(featData)
        yPred = yPred.data.cpu().numpy()

        if model.outputSize >= 2:
            # classification
            yPred = FeedForwardNetwork.processClassifOutput(yPred, model.config)

        # print prediction
        if model.config.getBooleanConfig("predict.print.output")[0]:
            FeedForwardNetwork.printPrediction(yPred, model.config, dataSource)

        return yPred

    def predict(self, dataSource=None):
        """
        predicts

        Parameters
            dataSource : data source
        """
        return FeedForwardNetwork.modelPredict(self, dataSource)

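    # Illustrative sketch: prediction either restores a saved checkpoint or trains
    # first, depending on predict.use.saved.model; the data source may be omitted:
    #
    #   yPred = model.predict()           # uses predict.data.file from the config
    #   yPred = model.predict(recList)    # or an in-memory 2D array
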
    @staticmethod
    def evaluateModel(model):
        """
        evaluates model loss on validation data

        Parameters
            model : torch model
        """
        model.eval()
        with torch.no_grad():
            yPred = model(model.validFeatData)
            yActual = model.validOutData
            score = model.lossFn(yPred, yActual).item()
        model.train()
        return score

    @staticmethod
    def prepValidate(model, dataSource=None):
        """
        prepares for validation

        Parameters
            model : torch model
            dataSource : data source
        """
        # train or restore model
        if not model.restored:
            useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
            if useSavedModel:
                FeedForwardNetwork.restoreCheckpt(model)
            else:
                FeedForwardNetwork.batchTrain(model)
            model.restored = True

        if dataSource is not None:
            model.setValidationData(dataSource)

    @staticmethod
    def validateModel(model, retPred=False):
        """
        model validation

        Parameters
            model : torch model
            retPred : if True, return prediction
        """
        model.eval()
        yPred = model(model.validFeatData)
        yPred = yPred.data.cpu().numpy()
        model.yPred = yPred
        yActual = model.validOutData
        vsize = yPred.shape[0]
        if model.verbose:
            print("\npredicted \t actual")
            for i in range(vsize):
                print("{:.3f}\t\t{:.3f}".format(yPred[i][0], yActual[i][0]))

        score = perfMetric(model.accMetric, yActual, yPred)
        print(formatFloat(3, score, "perf score"))

        if retPred:
            y = list(map(lambda i: (yPred[i][0], yActual[i][0]), range(vsize)))
            res = (y, score)
            return res
        else:
            return score

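    # Illustrative end to end sketch (file name hypothetical): prepare validation
    # data, then score the model with the configured accuracy metric:
    #
    #   FeedForwardNetwork.prepValidate(model, "vaFile.csv")
    #   score = FeedForwardNetwork.validateModel(model)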