import csv
import argparse
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.base import clone
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_validate, RepeatedKFold
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import resample
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import data_preparation as dp
import data_understanding as du

# Set holding the correlated features removed during training
corr_feats = set()


# Loads data from complete_data.csv as a pandas data frame. In train mode it
# returns (x, y) with correlated features removed; in test mode it returns the
# full frame minus those same features.
def load_data(train):
    data = pd.read_csv('./files/complete_data.csv', header=0, delimiter=';')
    header = list(dp.complete_data_row.keys()).copy()
    if train:
        feature_cols = header[1 : len(header) - 1]
        x = data[feature_cols]
        y = data.status
        global corr_feats
        x, corr_feats = dp.remove_correlated_attributes(x)
        return x, y
    else:
        # Convert the set to a list, since pandas' drop expects ordered labels
        return data[header].drop(labels=list(corr_feats), axis=1)


# Loads data from complete_data.csv and returns it as a pandas data frame with loan_ids
def load_loan_data():
    data = pd.read_csv('./files/complete_data.csv', header=0, delimiter=';')
    header = list(dp.complete_data_row.keys()).copy()
    return data[header]


# Returns the two-digit year of a loan request given its loan_id
def get_loan_year(loan_id):
    with open('./files/loan_train.csv') as loans:
        loan_reader = csv.reader(loans, delimiter=';')
        next(loan_reader)  # skip the header row
        for row in loan_reader:
            if len(row) == 7 and int(row[0]) == loan_id:
                date = row[2]
                return int(date[:2])
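
# Note: the slice above assumes dates in loan_train.csv are stored as YYMMDD
# strings, so the first two characters are the two-digit year
# (e.g. '930711' -> 93).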


# Splits the data into new train and test files according to the loan year
def rewrite_loans(data, train_years, test_years):
    with open('./files/loan_year_train.csv', 'w', newline='') as loan_year_train, \
            open('./files/loan_year_test.csv', 'w', newline='') as loan_year_test:
        train_writer = csv.writer(loan_year_train, delimiter=';')
        test_writer = csv.writer(loan_year_test, delimiter=';')
        header = list(dp.complete_data_row.keys()).copy()
        train_writer.writerow(header)
        test_writer.writerow(header)
        for train_year in train_years:
            for index, row in data.iterrows():
                if get_loan_year(row['loan_id']) == train_year:
                    train_writer.writerow(row)
        for test_year in test_years:
            for index, row in data.iterrows():
                if get_loan_year(row['loan_id']) == test_year:
                    test_writer.writerow(row)


# Returns x_train, y_train, x_test, y_test split according to the train and test dates
def date_split():
    train_data = pd.read_csv('./files/loan_year_train.csv', header=0, delimiter=';')
    test_data = pd.read_csv('./files/loan_year_test.csv', header=0, delimiter=';')
    header = list(dp.complete_data_row.keys()).copy()
    feature_cols = header[1 : len(header) - 1]
    x_train = train_data[feature_cols]
    y_train = train_data.status
    x_test = test_data[feature_cols]
    y_test = test_data.status
    return x_train, y_train, x_test, y_test
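
# Example out-of-time evaluation (the commented-out lines in build_model show
# the same flow): train on 1995 loans and test on 1996 loans.
#   data = load_loan_data()
#   rewrite_loans(data, [95], [96])
#   x_train, y_train, x_test, y_test = date_split()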


# Builds the random forest model and reports accuracy, precision, recall, F1 and AUC
def build_model(grid_search=False, rand_search=False, clean_data=True, rem_outlier=False):
    dp.arrange_complete_data(True, clean=clean_data, outlier_removal=rem_outlier)
    x, y = load_data(True)
    # Alternative out-of-time split by loan year (see date_split):
    # data = load_loan_data()
    # rewrite_loans(data, [95], [96])
    # x_train, y_train, x_test, y_test = date_split()
    x_train, x_test, y_train, y_test = strat_train_test_split(x, y, 0.2)
    x_train_balanced, y_train_balanced = smote_and_undersample(x_train, y_train, 0.8, k_neighbors=3)
    if rand_search:
        hyper_parameter_randomized_search(x_train_balanced, y_train_balanced, x_test, y_test)
    if grid_search:
        hyper_parameter_grid_search(x_train_balanced, y_train_balanced, x_test, y_test)
    print('\nTraining cases: ' + str(len(x_train_balanced)))
    print('Test cases: ' + str(len(x_test)))
    dummy_classifier(x_train_balanced, y_train_balanced, x_test, y_test)
    # Best weights found experimentally: {-1: 1, 1: 7}
    clf = RandomForestClassifier(max_features='sqrt', criterion='gini', min_samples_split=5, min_samples_leaf=2,
                                 max_depth=None, n_estimators=500, class_weight={-1: 1, 1: 7}, random_state=42)
    clf = train(clf, x_train_balanced, y_train_balanced, 10)
    test(clf, x_test, y_test)
    return clf
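
# Example: clf = build_model(grid_search=True) runs the full pipeline plus the
# grid search; main() exposes the same switches on the command line
# (-g, -r, -c, -o).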


# Trains a model and evaluates it with repeated K-fold cross-validation
def train(clf, x, y, k=5):
    cv = RepeatedKFold(n_splits=k, n_repeats=3, random_state=42)
    scores = cross_validate(clf, x, y, scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'], cv=cv, n_jobs=-1)
    print('\n--- Training ---')
    print('Accuracy: %.2f' % np.mean(scores['test_accuracy']))
    print('Precision: %.2f' % np.mean(scores['test_precision']))
    print('Recall: %.2f' % np.mean(scores['test_recall']))
    print('F1: %.2f' % np.mean(scores['test_f1']))
    print('ROC AUC: %.5f' % np.mean(scores['test_roc_auc']))
    return clf.fit(x, y)
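
# Note: with k=10 this performs 10-fold cross-validation repeated 3 times
# (30 fits); the printed values are means over all folds, and the returned
# model is refit on the complete balanced training set.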


# Tests and evaluates a trained model on the held-out test set
def test(clf, x_test, y_test):
    y_pred = clf.predict(x_test)
    cm = confusion_matrix(y_test, y_pred)
    du.plot_confusion_matrix(cm, ['Unsuccessful', 'Successful'], 'Random Forest')
    get_feature_importance(clf)
    print('\n--- Test ---')
    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
    print('Precision: %.2f' % precision_score(y_test, y_pred))
    print('Recall: %.2f' % recall_score(y_test, y_pred))
    print('F1: %.2f' % f1_score(y_test, y_pred))
    print('ROC AUC: %.5f' % calc_auc(clf, x_test, y_test))


# Stratified train / test split that preserves the class distribution
def strat_train_test_split(x, y, test_size):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=1, stratify=y)
    train_counter = Counter(y_train)
    test_counter = Counter(y_test)
    negative_percent_train = (train_counter[-1] / len(y_train)) * 100
    negative_percent_test = (test_counter[-1] / len(y_test)) * 100
    print('\nAfter Stratification')
    print('Positive training cases: ' + str(train_counter[1]))
    print('Negative training cases: ' + str(train_counter[-1]))
    print('Positive test cases: ' + str(test_counter[1]))
    print('Negative test cases: ' + str(test_counter[-1]))
    print('\nTraining negative cases ratio: %.2f' % negative_percent_train + '%')
    print('Test negative cases ratio: %.2f' % negative_percent_test + '%')
    return x_train, x_test, y_train, y_test
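
# Example: strat_train_test_split(x, y, 0.2) holds out 20% of the loans while
# keeping the share of negative (status -1) cases the same in both partitions,
# which a plain random split would not guarantee.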


# Executes a hyper parameter randomized search on a Random Forest
def hyper_parameter_randomized_search(x_train, y_train, x_test, y_test):
    param_grid = {
        'max_features': ['sqrt', None, 'log2'],
        'max_depth': [1, 2, 3, 4, 5, 6, None],
        'min_samples_leaf': [i for i in range(1, 11)],
        'min_samples_split': [i for i in range(2, 11)],
        'n_estimators': [i * 100 for i in range(1, 11)],
        'criterion': ['gini', 'entropy']
    }
    clf = RandomForestClassifier(max_features=None, max_depth=None, random_state=42)
    rand_search = RandomizedSearchCV(estimator=clf, param_distributions=param_grid, n_iter=50, verbose=1,
                                     cv=10, random_state=42, n_jobs=1)
    rand_search.fit(x_train, y_train)
    print('\n--- Hyper Parameter Random Search Results ---')
    print(rand_search.best_params_)
    y_pred = rand_search.predict(x_test)
    print('Accuracy: %.2f' % (accuracy_score(y_test, y_pred) * 100) + '%')
    print('Precision: %.2f' % precision_score(y_test, y_pred))
    print('Recall: %.2f' % recall_score(y_test, y_pred))
    print('F1: %.2f' % f1_score(y_test, y_pred))
    print('AUC Score: %.5f' % calc_auc(rand_search.best_estimator_, x_test, y_test))


# Executes a hyper parameter grid search on a Random Forest
def hyper_parameter_grid_search(x_train, y_train, x_test, y_test):
    param_grid = {
        'max_features': ['sqrt', None, 'log2'],
        'min_samples_leaf': [2, 5, 10],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [300, 500, 900],
        'criterion': ['gini', 'entropy']
    }
    clf = RandomForestClassifier(max_features=None, max_depth=None, random_state=42)
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, verbose=1)
    grid_search.fit(x_train, y_train)
    print('\n--- Hyper Parameter Grid Search Results ---')
    print(grid_search.best_params_)
    y_pred = grid_search.predict(x_test)
    print('Accuracy: %.2f' % (accuracy_score(y_test, y_pred) * 100) + '%')
    print('Precision: %.2f' % precision_score(y_test, y_pred))
    print('Recall: %.2f' % recall_score(y_test, y_pred))
    print('F1: %.2f' % f1_score(y_test, y_pred))
    print('AUC Score: %.5f' % calc_auc(grid_search.best_estimator_, x_test, y_test))


# Returns a model's ROC AUC on the given test set
def calc_auc(clf, x_test, y_test):
    prob_y = clf.predict_proba(x_test)
    # Column 1 corresponds to clf.classes_[1], i.e. the positive class (1)
    prob_y = [p[1] for p in prob_y]
    return roc_auc_score(y_test, prob_y)


# Builds, trains and evaluates a dummy classifier
def dummy_classifier(x_train, y_train, x_test, y_test):
    dummy = DummyClassifier(strategy='most_frequent')
    dummy.fit(x_train, y_train)
    print('\nDummy Accuracy: %.2f' % dummy.score(x_test, y_test))
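
# Note: with strategy='most_frequent' the dummy always predicts the majority
# class, so its accuracy equals the majority-class share of the test set;
# this is the baseline the random forest has to beat.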


# Oversamples the minority class with BorderlineSMOTE, then randomly
# undersamples the majority class, according to the given sampling strategies
def smote_and_undersample(x_train, y_train, smote_ss, us_ss=1.0, k_neighbors=5, smote=True, undersample=True):
    if smote:
        sm = BorderlineSMOTE(sampling_strategy=smote_ss, k_neighbors=k_neighbors, random_state=42)
        x_train, y_train = sm.fit_resample(x_train, y_train)
        y_counter = Counter(y_train)
        print('\nAfter SMOTE')
        print('Positive, Negative training Y: %s' % y_counter)
    if undersample:
        rus = RandomUnderSampler(sampling_strategy=us_ss, random_state=42)
        x_train, y_train = rus.fit_resample(x_train, y_train)
        y_counter = Counter(y_train)
        print('After Random Undersampling')
        print('Positive, Negative training Y: %s' % y_counter)
    return x_train, y_train
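
# Example (hypothetical counts): starting from 3000 positive and 300 negative
# cases, smote_and_undersample(x, y, 0.8, us_ss=1.0, k_neighbors=3) first
# synthesizes negatives up to 80% of the majority size (~2400) and then
# undersamples the positives down to a 1:1 ratio (~2400 each).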


# Balances the training set by downsampling the majority class to
# ratio * (minority class size) samples
def undersample_majority_class(x_train, y_train, ratio):
    x_train_majority = x_train[y_train.values == 1]
    y_train_majority = y_train[y_train.values == 1]
    x_train_minority = x_train[y_train.values == -1]
    y_train_minority = y_train[y_train.values == -1]
    sample_no = int(len(x_train_minority.index) * ratio)
    x_train_majority_downsampled = resample(x_train_majority, replace=False, n_samples=sample_no, random_state=123)
    y_train_majority_downsampled = resample(y_train_majority, replace=False, n_samples=sample_no, random_state=123)
    x_train_balanced = pd.concat([x_train_majority_downsampled, x_train_minority])
    y_train_balanced = pd.concat([y_train_majority_downsampled, y_train_minority])
    print('\nAfter Undersampling')
    print('Positive | Negative training X: ' + str(len(x_train_balanced[y_train_balanced.values == 1])) + ' | ' +
          str(len(x_train_balanced[y_train_balanced.values == -1])))
    print('Positive | Negative training Y: ' + str(len(y_train_balanced[y_train_balanced.values == 1])) + ' | ' +
          str(len(y_train_balanced[y_train_balanced.values == -1])))
    return x_train_balanced, y_train_balanced
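
# Example (hypothetical counts): with 300 minority (-1) cases,
# undersample_majority_class(x_train, y_train, 2.0) keeps 600 randomly drawn
# majority (1) cases, i.e. a 2:1 majority-to-minority ratio.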


# Displays the relevance of each predictive attribute to the final prediction
def get_feature_importance(clf):
    print('\n--- Feature Importance ---\n')
    # Use the names seen at fit time (set when fitting on a DataFrame) so the
    # labels stay aligned even after correlated features were dropped
    for name, importance in zip(clf.feature_names_in_, clf.feature_importances_):
        print(name + ': ' + '%.2f' % (importance * 100) + '%')


# Saves an image representing one of the forest's decision trees
def visualize_tree(clf):
    fig = plt.figure(figsize=(100, 100))
    plot_tree(clf.estimators_[0], feature_names=list(clf.feature_names_in_),
              class_names=['Unsuccessful', 'Successful'], filled=True)
    fig.savefig('./figures/decision_tree.png')
    plt.close()


# Runs the model on the competition data and writes the predictions to a CSV
def run_model(clf):
    with open('./files/prediction.csv', 'w', newline='') as predictions:
        pred_writer = csv.writer(predictions, delimiter=',')
        pred_writer.writerow(['Id', 'Predicted'])
        dp.arrange_complete_data(False)
        x = load_data(False)
        loan_ids = x['loan_id'].copy()
        # Drop the id (and the status placeholder, if present) so the columns
        # match the feature set the model was trained on
        x.drop(['loan_id'], axis=1, inplace=True)
        if 'status' in x.columns:
            x.drop(['status'], axis=1, inplace=True)
        y_prob = clf.predict_proba(x)
        for i in x.index:
            pred_writer.writerow([int(loan_ids[i]), y_prob[i][1]])
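
# The output follows the Kaggle submission format: a header row 'Id,Predicted'
# followed by one '<loan_id>,<probability of class 1>' row per competition loan.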


def main():
    parser = argparse.ArgumentParser(description='Random Forest Classifier')
    parser.add_argument('-t', dest='test', action='store_true', default=False, help='Generate Kaggle test set predictions')
    parser.add_argument('-v', dest='vis_tree', action='store_true', default=False, help='Generate image of the Decision Tree')
    parser.add_argument('-g', dest='grid_search', action='store_true', default=False, help='Perform hyper-parameter grid search')
    parser.add_argument('-r', dest='rand_search', action='store_true', default=False, help='Perform hyper-parameter random search')
    parser.add_argument('-c', dest='clean', action='store_true', default=False, help='Clean CSVs')
    parser.add_argument('-o', dest='outlier', action='store_true', default=False, help='Remove Outliers')
    args = parser.parse_args()
    clf = build_model(grid_search=args.grid_search, rand_search=args.rand_search, clean_data=args.clean,
                      rem_outlier=args.outlier)
    if args.vis_tree:
        visualize_tree(clf)
    if args.test:
        run_model(clf)


if __name__ == '__main__':
    main()