# -*- coding: utf-8 -*-
"""
Created on Thu Dec 5 15:08:41 2019
@author: Saint8312
"""
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import pickle
import os
import data_checker
if __name__ == '__main__':
    '''
    load and split the dataset
    '''
    # data_checker.data_load is expected to return a list of dicts, each carrying an
    # 'x_vector' feature array and a scalar 'y' label (see the comprehensions below).
    filename = os.getcwd()+'/Data/dataset_ha_alpha_122319.pkl'
    dataset = data_checker.data_load(filename)
    features = np.array([data['x_vector'] for data in dataset])
    labels = np.array([data['y'] for data in dataset])
    x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=13)
    print('Training Features Shape:', x_train.shape)
    print('Training Labels Shape:', y_train.shape)
    print('Testing Features Shape:', x_test.shape)
    print('Testing Labels Shape:', y_test.shape)

    '''
    data regression
    '''
    rf = RandomForestRegressor(n_estimators=1000, random_state=42, verbose=1, n_jobs=-1, min_samples_split=3, max_features="sqrt", bootstrap=True, oob_score=True)
    rf.fit(x_train, y_train)
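    # Optional addition (not in the original script): because oob_score=True is set above,
    # the fitted forest exposes its out-of-bag R^2 estimate, a quick generalisation check
    # that does not touch the held-out test split.
    print('Out-of-bag R^2 score:', round(rf.oob_score_, 4))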
    '''
    model saver
    '''
    with open(os.getcwd()+"/Model/rf_pp_ha_alpha_split.pkl", "wb") as f:
        pickle.dump(rf, f)

    '''
    train set analysis
    '''
    # Mean Absolute Error
    preds = rf.predict(x_train)
    errors = abs(preds - y_train)
    print('Mean Absolute Error:', round(np.mean(errors), 2))
    # Mean Absolute Percentage Error & Accuracy
    mape = 100 * (errors / y_train)
    accuracy = 100 - np.mean(mape)
    print('Accuracy:', round(accuracy, 2), '%.')
    # Root Mean Squared Error
    rmse = np.sqrt(mean_squared_error(y_train, preds))
    print('Root Mean Squared Error :', round(rmse, 2))
    # Pearson Correlation Coefficient (PCC) score
    pcc = pearsonr(y_train, preds)
    print('Pearson Correlation Coefficient :', round(pcc[0], 2))
    print(preds, y_train)

    '''
    test set analysis
    '''
    # Mean Absolute Error
    preds = rf.predict(x_test)
    errors = abs(preds - y_test)
    print('Mean Absolute Error:', round(np.mean(errors), 2))
    # Mean Absolute Percentage Error & Accuracy
    mape = 100 * (errors / y_test)
    accuracy = 100 - np.mean(mape)
    print('Accuracy:', round(accuracy, 2), '%.')
    # Root Mean Squared Error
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print('Root Mean Squared Error :', round(rmse, 2))
    # Pearson Correlation Coefficient (PCC) score
    pcc = pearsonr(y_test, preds)
    print('Pearson Correlation Coefficient :', round(pcc[0], 2))
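    # A possible refactor (sketch, not part of the original script): the metric block above
    # is duplicated for the train and test splits. A helper such as the hypothetical
    # report_metrics() below computes the same MAE / MAPE-based accuracy / RMSE / PCC
    # figures for any (y_true, y_pred) pair; it is defined but not called here so the
    # original output stays unchanged.
    def report_metrics(y_true, y_pred, label=""):
        errs = np.abs(y_pred - y_true)
        mae = np.mean(errs)
        acc = 100 - np.mean(100 * errs / y_true)   # MAPE-based accuracy, as above
        rmse_val = np.sqrt(mean_squared_error(y_true, y_pred))
        pcc_val = pearsonr(y_true, y_pred)[0]
        print(label, 'MAE:', round(mae, 2), 'Accuracy:', round(acc, 2), '%',
              'RMSE:', round(rmse_val, 2), 'PCC:', round(pcc_val, 2))
        return mae, acc, rmse_val, pcc_val
    # Example usage (commented out):
    # report_metrics(y_train, rf.predict(x_train), label="train")
    # report_metrics(y_test, rf.predict(x_test), label="test")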
    # '''
    # k-fold cross validation
    # '''
    # folds = [3, 4, 5, 7, 10]
    # for fold in folds:
    #     kfolds = []
    #     n = fold
    #     idx = 0
    #     kf = KFold(n_splits=n)
    #     for train_index, test_index in kf.split(features):
    #         kfold = {}
    #         print("index training :", idx)
    #         print("TRAIN:", len(train_index), "TEST:", len(test_index))
    #         x_train, x_test = features[train_index], features[test_index]
    #         y_train, y_test = labels[train_index], labels[test_index]
    #         rf = RandomForestRegressor(n_estimators=1000, random_state=13, verbose=0)
    #         rf.fit(x_train, y_train)
    #
    #         idx += 1
    #
    #         # Pearson Correlation Coefficient (PCC) score
    #         preds = rf.predict(x_train)
    #         pcc = pearsonr(y_train, preds)
    #         kfold["pcc_train"] = pcc[0]
    #         print('PCC train :', round(pcc[0], 2))
    #
    #         preds = rf.predict(x_test)
    #         pcc = pearsonr(y_test, preds)
    #         kfold["pcc_test"] = pcc[0]
    #         print('PCC test :', round(pcc[0], 2))
    #         print('===================')
    #
    #         kfold["train_idx"] = train_index
    #         kfold["test_idx"] = test_index
    #         kfold["k"] = n
    #         kfold["idx"] = idx
    #         kfold["model"] = rf
    #         kfolds.append(kfold)
    #     kfolds = sorted(kfolds, key=lambda k: k['pcc_test'], reverse=True)
    #     print(kfolds[0]['k'], kfolds[0]['pcc_test'])
    #     # save best model
    #     with open(os.getcwd()+"/Model/rf_pp_ha_a_"+str(n)+"fold_best.pkl", "wb") as f:
    #         pickle.dump(kfolds[0], f)

    # '''
    # model loader
    # '''
    # with open(os.getcwd()+"/Model/rf_pp_alpha.pkl", "rb") as f:
    #     rf = pickle.load(f)
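    # Minimal usage sketch (not in the original script): reload the model saved in the
    # "model saver" step above and confirm that the unpickled estimator reproduces the
    # in-memory test-set predictions. Assumes that file still exists at the same path.
    with open(os.getcwd()+"/Model/rf_pp_ha_alpha_split.pkl", "rb") as f:
        rf_loaded = pickle.load(f)
    reload_preds = rf_loaded.predict(x_test)
    print('Reloaded model matches in-memory predictions:', np.allclose(reload_preds, preds))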