-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbranch.py
97 lines (77 loc) · 3.58 KB
/
branch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
"""branch.py: main test logic
Splits the available users into training and test sets, builds the model from the training set, then tests it on the rest
"""
import numpy, re, sqlite3, sys, time, os
from load_training_features import save_training_features, feature_keys
from features import pull_features_for_user
from sklearn import linear_model
from sklearn.utils import check_X_y
def get_bad_loanees_map(bad_loanees_filename):
    """Return a dict whose keys are the user ids of bad loanees.

    The file is scanned line by line. Only lines of the form ``#<...>`` are
    considered; the text between the angle brackets is split on commas and
    each piece is read as a ``key: value`` field. The value of every field
    whose key is ``User id`` is converted to ``int`` and stored as a key of
    the result (the mapped value is always 1).

    Args:
        bad_loanees_filename: path of the text file to parse.

    Returns:
        dict mapping each bad loanee's integer user id to 1.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a ``User id`` value is not an integer.
    """
    record_pattern = re.compile(r'^#<(.*)>$')
    bad_loanees = {}
    with open(bad_loanees_filename) as f:
        # Stream the file instead of loading every line up front.
        for line in f:
            # Drop the line terminator (including a stray '\r') before matching.
            record = record_pattern.match(line.rstrip())
            if not record:
                continue
            for feature in record.group(1).split(","):
                # partition() never raises: fragments without a ": " separator
                # (e.g. the tail of a quoted value that itself contains a
                # comma) are skipped instead of aborting the whole parse.
                key, sep, value = feature.partition(": ")
                if not sep or key.strip() != 'User id':
                    continue
                # The id may be wrapped in double quotes and/or whitespace.
                bad_loanees[int(value.strip().strip('"'))] = 1
    return bad_loanees
def main():
    """Train on 70% of the users under basedir and test on the remaining 30%.

    Command line: ``branch.py basedir delinquent_id_file``

    * ``basedir`` - directory whose entry names are integer user ids.
    * ``delinquent_id_file`` - file read by get_bad_loanees_map to obtain the
      ids of delinquent loanees.

    Steps:
      1. List the user ids in basedir and split them 70/30 in listing order.
      2. Store the training users' features with save_training_features and
         read them back from the SQLite table, turning timestamp columns
         into "seconds before now".
      3. Fit an SGDClassifier, labelling a row 1 when its user_id is in the
         delinquent map and 0 otherwise.
      4. Predict every held-out user, printing each prediction and a final
         tally.

    Exits with status 1 after printing usage when an argument is missing.
    """
    # Both positional arguments are required: sys.argv[2] is read further down.
    if len(sys.argv) < 3:
        print("usage: %s basedir delinquent_id_file" % sys.argv[0])
        print(" - basedir is the directory contaning user logs")
        print(" - delinquent_id_file is the file listing delinquent loanee ids")
        sys.exit(1)

    basedir = sys.argv[1]
    delinquent_id_file = sys.argv[2]
    training_db = 'training_db'
    training_table = 'training_features'

    # Get a list of available users, split into 70% training, 30% testing
    user_ids = [int(user_id) for user_id in os.listdir(basedir)]
    split_index = int(0.7 * len(user_ids))
    training_user_ids = user_ids[:split_index]
    testing_user_ids = user_ids[split_index:]

    # Save training data
    save_training_features(basedir, training_user_ids, training_db, training_table)

    # Assemble the query. For the time fields, use difference in seconds from
    # now to make ranges more manageable. Also note the logs are in
    # milliseconds. A stored 0 (no timestamp) maps to "now" itself.
    timestamp_expr = "case when %s = 0 then cast(strftime('%%s','now') as int) else cast(strftime('%%s', 'now') as int) - (%s/1000) end as %s"
    selects = ', '.join(
        timestamp_expr % (key, key, key) if feature_keys[key] == 'timestamp' else key
        for key in feature_keys)

    conn = sqlite3.connect(training_db)
    try:
        cur = conn.execute('SELECT %s FROM %s' % (selects, training_table))
        cols = [desc[0] for desc in cur.description]
        rows = cur.fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()

    bad_loanees = get_bad_loanees_map(delinquent_id_file)

    # Assemble list of good/bad classifications in same order as rows.
    # The column position is looked up once rather than once per row.
    user_id_col = cols.index('user_id')
    classification = [bad_loanees.get(row[user_id_col], 0) for row in rows]

    # Train the model
    clf = linear_model.SGDClassifier(loss='log')
    clf.fit(rows, classification)

    # Go through the testing IDs and test our model. One reference time is
    # used for the whole pass so every timestamp age is measured consistently.
    now = int(time.time())
    num_correct = 0
    for test_user_id in testing_user_ids:
        pulled_features = pull_features_for_user(basedir, test_user_id)
        transformed_features = []
        for key in feature_keys:
            val = pulled_features.get(key, 0)
            if feature_keys[key] == 'timestamp':
                # Same transformation as the SQL above: a missing/zero
                # timestamp becomes "now", otherwise the age in seconds.
                val = now - val / 1000 if val else now
            transformed_features.append(val)

        # predict() returns one label per input row; take the single label
        # as a plain int rather than formatting/comparing the array itself.
        prediction = int(clf.predict([transformed_features])[0])
        print("Predicted %d for %d" % (prediction, test_user_id))
        correct = bad_loanees.get(test_user_id, 0) == prediction
        if correct:
            num_correct += 1
        print("Prediction was %scorrect" % ('' if correct else 'not '))

    # Report accuracy as a percentage; guard against an empty test set.
    num_tested = len(testing_user_ids)
    accuracy = 100.0 * num_correct / num_tested if num_tested else 0.0
    print("Model got %d/%d (%.1f%%) correct" % (num_correct, num_tested, accuracy))
# Run the train/test pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()