-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeatures.py
156 lines (133 loc) · 5.52 KB
/
features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env python
"""features.py: functions to pull features from user logs
Given a user directory, returns a map of features to use for classification
based on the call logs, contact lists, and sms logs
"""
import json, os, re, sys
# Sanity cap: money amounts parsed from SMS text are clamped to this maximum
max_max_value = 100000
def pull_features_for_user(basedir, user_id):
    """Collect every log-derived feature for one user into a single dict.

    Args:
        basedir: Directory that holds one sub-directory per user id.
        user_id: Integer user id; also the name of the user's sub-directory.

    Returns:
        A dict that always contains 'user_id', merged with whatever the
        call-log, contact-list and SMS-log extractors return for each device
        directory found under ``<basedir>/<user_id>``.
    """
    features = {'user_id': user_id}
    user_dir = '%s/%d' % (basedir, user_id)
    # TODO - check if user has more than one device. Until that is handled,
    # features from a later device overwrite same-named features from an
    # earlier one; sorting at least makes the winner deterministic.
    for device in sorted(os.listdir(user_dir)):
        # Derive the device path from basedir so that the directory we list
        # and the directory we read are always the same tree.
        device_dir = '%s/%s' % (user_dir, device)
        features.update(pull_call_log_features(device_dir))
        features.update(pull_contact_list_features(device_dir))
        features.update(pull_sms_log_features(device_dir))
    return features
def pull_call_log_features(basedir):
    """Summarise the call logs stored under ``<basedir>/call_log``.

    Every file in that directory is loaded as JSON and its records are
    concatenated. Each record is read for its 'datetime' (converted with
    int()) and its 'phone_number'.

    Args:
        basedir: Device directory that may contain a ``call_log`` folder.

    Returns:
        {} when the ``call_log`` folder does not exist. Otherwise a dict with
        'earliest_call' and 'latest_call' (smallest / largest timestamp, 0
        when there are no records), 'unique_calls' (number of distinct phone
        numbers) and 'total_calls' (number of records).
    """
    call_log_dir = '%s/call_log' % basedir
    if not os.path.exists(call_log_dir):
        return {}

    call_log_data = []
    for file_name in os.listdir(call_log_dir):
        with open('%s/%s' % (call_log_dir, file_name)) as data_file:
            call_log_data += json.load(data_file)

    # min()/max() over the whole list avoids using 0 as an "unset" marker,
    # which made the earliest value depend on record order whenever a
    # timestamp was itself 0.
    timestamps = [int(call['datetime']) for call in call_log_data]
    phone_numbers = {call['phone_number'] for call in call_log_data}

    return {
        'earliest_call': min(timestamps) if timestamps else 0,
        'latest_call': max(timestamps) if timestamps else 0,
        'unique_calls': len(phone_numbers),
        'total_calls': len(call_log_data),
    }
def pull_contact_list_features(basedir):
    """Summarise the contact lists stored under ``<basedir>/contact_list``.

    Every file in that directory is loaded as JSON and its records are
    concatenated. Each record is read for its 'times_contacted' and
    'phone_numbers' entries.

    Returns:
        {} when the ``contact_list`` folder does not exist. Otherwise a dict
        with 'max_times_contacted' (largest 'times_contacted' seen, never
        below 0), 'unique_contacts' (number of records) and
        'unique_contacts_with_phone' (records whose 'phone_numbers' is truthy).
    """
    if not os.path.exists('%s/contact_list' % basedir):
        return {}

    entries = []
    for file_name in os.listdir('%s/contact_list' % basedir):
        with open('%s/contact_list/%s' % (basedir, file_name)) as data_file:
            entries.extend(json.load(data_file))

    # Seeding the candidates with 0 keeps the result at 0 when nothing is
    # larger, including when there are no contacts at all.
    most_contacted = max([0] + [entry['times_contacted'] for entry in entries])
    with_phone = sum(1 for entry in entries if entry['phone_numbers'])

    return {
        'max_times_contacted': most_contacted,
        'unique_contacts': int(len(entries)),
        'unique_contacts_with_phone': with_phone,
    }
# Amount written before the currency marker, e.g. "1,500.00 ksh".
_AMOUNT_BEFORE_KSH = re.compile(r'([\d,]+)(?:\.\d{1,2})?(?:\s)?ksh')
# Amount written after the currency marker, e.g. "ksh. 1,500" or "kshs 200".
_AMOUNT_AFTER_KSH = re.compile(r'ksh(?:s)?(?:\.)?(?:\s)*([\d,]+)(?:\.\d{1,2})?')


def _largest_ksh_amount(message):
    """Return the largest whole amount quoted next to 'ksh' in *message*, or None.

    *message* must already be lower-cased. The two amount formats are tried
    in order (amount before 'ksh', then amount after); the second is only
    consulted when the first finds nothing. Decimal parts are ignored.
    """
    raw_amounts = _AMOUNT_BEFORE_KSH.findall(message)
    if not raw_amounts:
        raw_amounts = _AMOUNT_AFTER_KSH.findall(message)
    # int() doesn't like commas. A capture made only of commas becomes ''
    # and is dropped here; building a real list (rather than filter()) keeps
    # the emptiness check below meaningful on Python 3 as well.
    digit_strings = [raw.replace(',', '') for raw in raw_amounts]
    amounts = [int(digits) for digits in digit_strings if digits]
    return max(amounts) if amounts else None


def pull_sms_log_features(basedir):
    """Summarise the SMS logs stored under ``<basedir>/sms_log``.

    Every file in that directory is loaded as JSON and its records are
    concatenated. Each record is read for its 'datetime' (converted with
    int(), matching the call-log extractor), 'sms_address' and
    'message_body'.

    Args:
        basedir: Device directory that may contain an ``sms_log`` folder.

    Returns:
        {} when the ``sms_log`` folder does not exist. Otherwise a dict with:
        'earliest_sms' / 'latest_sms' (smallest / largest timestamp, 0 when
        there are no records), 'unique_sms' (distinct 'sms_address' values),
        'total_sms' (number of records), 'probable_loaned_before' and
        'probable_max_loan' (set from messages that quote a ksh amount and
        contain 'loan'), 'probable_credit_before' and 'probable_max_credit'
        (same, for messages containing 'balance' but not 'loan'), and
        'probable_missed_payment' (1 if any message contains 'missed' or
        'unpaid'). Amounts are clamped to max_max_value.
    """
    sms_log_dir = '%s/sms_log' % basedir
    if not os.path.exists(sms_log_dir):
        return {}

    sms_log_data = []
    for file_name in os.listdir(sms_log_dir):
        with open('%s/%s' % (sms_log_dir, file_name)) as data_file:
            sms_log_data += json.load(data_file)

    # None (not 0) marks "nothing seen yet" so a timestamp of 0 cannot be
    # mistaken for the unset state.
    earliest_sms = None
    latest_sms = None
    senders = set()
    probable_loaned_before = 0
    probable_max_loan = 0
    probable_credit_before = 0
    probable_max_credit = 0
    probable_missed_payment = 0

    for sms in sms_log_data:
        # Compare timestamps numerically; string timestamps would otherwise
        # be ordered lexicographically ('9' > '10').
        timestamp = int(sms['datetime'])
        if earliest_sms is None or timestamp < earliest_sms:
            earliest_sms = timestamp
        if latest_sms is None or timestamp > latest_sms:
            latest_sms = timestamp
        senders.add(sms['sms_address'])

        message = sms['message_body'].lower()
        amount = _largest_ksh_amount(message)
        if amount is not None:
            max_val = min(amount, max_max_value)
            if 'loan' in message:
                probable_loaned_before = 1
                if max_val > probable_max_loan:
                    probable_max_loan = max_val
            elif 'balance' in message:
                probable_credit_before = 1
                if max_val > probable_max_credit:
                    probable_max_credit = max_val
        # NOTE(review): assumed to apply to every message, with or without a
        # quoted amount - confirm against the intended nesting.
        if 'missed' in message or 'unpaid' in message:
            probable_missed_payment = 1

    return {
        'earliest_sms': earliest_sms if earliest_sms is not None else 0,
        'latest_sms': latest_sms if latest_sms is not None else 0,
        'unique_sms': len(senders),
        'total_sms': len(sms_log_data),
        'probable_loaned_before': probable_loaned_before,
        'probable_max_loan': probable_max_loan,
        'probable_credit_before': probable_credit_before,
        'probable_max_credit': probable_max_credit,
        'probable_missed_payment': probable_missed_payment,
    }
def main():
    """Command-line entry point: print the feature map for one user.

    Expects two command-line arguments: the base directory that contains the
    per-user folders, and the integer user id. Exits with a usage message
    when either is missing.
    """
    if len(sys.argv) < 3:
        sys.exit('usage: %s <basedir> <user_id>' % sys.argv[0])
    user_features = pull_features_for_user(sys.argv[1], int(sys.argv[2]))
    # A parenthesised single-argument print produces the same output whether
    # it is parsed as a Python 2 statement or a Python 3 function call.
    print(user_features)


if __name__ == "__main__":
    main()