-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_classes.py
376 lines (300 loc) · 15.8 KB
/
data_classes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
"""
The Subject and DataFile classes are defined in this file. This is
something of an experiment with object-oriented programming/persistent
databases in Python. The idea is to generate an instance of the DataFile
class for each log file. Instances of the DataFile class will contain
information about the log file used to create them, including:
task: one of 'task1', 'task2', ..., 'task6', identifies the task number
filename: the name of the original log file
task_headers: names of relevant data fields
practice_headers: names of relevant data fields from practice trials
sibling: True/False depending on whether there is an 's' in the file name
ID: 4-digit ID number
key: A unique identifier including country prefix, group number, and ID number.
Example: 'PEs231010'
group: a 2-digit group number
device: An identifier specific to the device used to administer the tasks, e.g., "IIN005"
date: The date of task administration (per the file name)
time: time of administration (per the file name)
summary: A dictionary containing summary data for the task
trial_by_trial: A list of dictionaries with data for each trial from the task
practice: same as trial by trial, but for practice trials
The Subject class contains all task information for a given subject.
"""
import exception_classes as e
class Subject:
    """
    A class containing all data (including trial by trial and summary statistics)
    and metadata (subject ID, group, sibling flag, key) for one subject.

    Attributes:
        ID: the subject's ID (should be a four digit number)
        group: the subject's group (generally a two-digit number)
        sibling: True/False bool
        key: the subject's unique identifier
        data: a dictionary mapping task name --> DataFile object
    """

    # Task names accepted by add_data.
    _VALID_TASKS = ('task1', 'task2', 'task3', 'task4', 'task5', 'task6')

    def __init__(self, ID, group, sibling, key):
        self.ID = ID  # Should be a four digit number
        self.group = group  # Generally a two-digit number
        self.sibling = sibling  # True/False bool
        self.data = {}  # A dictionary to be filled with DataFile objects
        self.key = key

    def __str__(self):
        """
        Returns a string representation of the object
        """
        return '<Subject Number: %s Group: %s Sibling: %s>' % (self.ID, self.group, str(self.sibling))

    def add_data(self, task, data_object):
        """
        Updates the subject object's data dictionary

        :param task: A string, one of 'task1', 'task2', ..., 'task6'.
        :param data_object: An instance of the DataFile class
        :return: None
        :raises TaskNameError: if task is not one of the six task names
        :raises TypeError: if data_object is not a DataFile instance
        """
        if task not in self._VALID_TASKS:
            raise e.TaskNameError(task)
        if not isinstance(data_object, DataFile):
            raise TypeError(data_object)
        self.data[task] = data_object

    def write_summary(self, out_file, overwrite=False):
        """
        Writes one row of summary data (all tasks combined) to a .csv file.

        The row starts with the Key, SubID, Group and Sibling columns,
        followed by the summary fields in alphabetical order.

        :param out_file: path of the .csv file to write
        :param overwrite: if True, replace an existing file; otherwise
            append to it without repeating the header row
        :return: None
        """
        # Get summary data for all tasks in one dictionary
        full_summary = self.summarize_data()
        # Get headers in alphabetical order so that data will be written
        # in task order
        headers = sorted(full_summary.keys())
        self._write_rows(out_file, overwrite,
                         ['Key', 'SubID', 'Group', 'Sibling'],
                         [self.key, self.ID, self.group, self.sibling],
                         headers,
                         [full_summary])  # Should only be one row

    def dump_trial_by_trial(self, task, out_file, overwrite=False):
        """
        Writes the trial-by-trial data of one task to a .csv file, one
        row per trial.

        Each row starts with the Key, SubID, Group, Sibling, Device and
        Time columns, followed by the task's task_headers columns.

        :param task: key into self.data selecting the DataFile to dump
        :param out_file: path of the .csv file to write
        :param overwrite: if True, replace an existing file; otherwise
            append to it without repeating the header row
        :return: None
        :raises KeyError: if no data has been added for task
        """
        # Look the task up before touching the file so that an unknown
        # task cannot create or truncate out_file.
        data_file = self.data[task]
        self._write_rows(out_file, overwrite,
                         ['Key', 'SubID', 'Group', 'Sibling', 'Device', 'Time'],
                         [self.key, self.ID, self.group, self.sibling,
                          data_file.device, data_file.time],
                         data_file.task_headers,
                         data_file.trial_by_trial)

    def summarize_data(self):
        """
        Returns a dictionary with summary data from all tasks
        """
        # Get summary data for all tasks in one dictionary
        full_summary = {}
        for data_file in self.data.values():
            full_summary.update(data_file.summary)
        return full_summary

    @staticmethod
    def _write_rows(out_file, overwrite, id_headers, id_values, headers, rows):
        """
        Writes dictionaries to a .csv file, prefixing every row with the
        same identifying values.

        :param out_file: path of the .csv file to write
        :param overwrite: if True, always start a new file
        :param id_headers: column names for the identifying values
        :param id_values: identifying values repeated on every row
        :param headers: column names looked up in each row dictionary
        :param rows: iterable of dictionaries, one per output row
        :return: None
        :raises ValueError: if a row has a key that is not in headers
        """
        import csv
        import os

        headers = list(headers)
        known = set(headers)
        # str() keeps values such as None or False spelled out, exactly
        # as they were when the prefix was joined by hand.
        id_cells = [str(value) for value in id_values]

        # If the file already exists, we want to append to it without
        # writing header rows. An empty file has no header row yet, so it
        # is treated like a new file.
        append = (not overwrite and os.path.isfile(out_file)
                  and os.path.getsize(out_file) > 0)

        # NOTE(review): on Python 3 the csv documentation recommends
        # opening with newline=''; left unchanged because the target
        # Python version is not evident from this file -- confirm.
        with open(out_file, "a" if append else "w") as out:
            # A single csv.writer handles the whole row, so identifying
            # values containing commas or quotes are escaped properly.
            writer = csv.writer(out)
            if not append:
                writer.writerow(list(id_headers) + headers)
            for row in rows:
                unexpected = [field for field in row if field not in known]
                if unexpected:
                    # Same error type csv.DictWriter raises by default.
                    raise ValueError("dict contains fields not in fieldnames: "
                                     + ", ".join(repr(field) for field in unexpected))
                writer.writerow(id_cells + [row.get(name, "") for name in headers])
class DataFile:
    """
    A class, each instance of which is built from
    a log file.

    Building an instance parses the log file's name, selects the column
    names of interest for the task, reads the file's practice and task
    trials, and computes the task's summary data.
    """

    # Task names accepted in a log file name.
    _VALID_TASKS = ('task1', 'task2', 'task3', 'task4', 'task5', 'task6')

    # Names of the relevant data fields for task trials, keyed by task.
    _TASK_HEADERS = {
        'task1': ("TrialNum", "NumBadTouches", "Score", "Score-incorrect only"),
        'task2': ('TrialNum', 'TargetSide', 'TimeOut', 'ReactionTime', 'TouchPosition',
                  'DistanceFromCenter', 'PressedSide', 'GoalSide', 'Correct', 'SwitchRule',
                  'SwitchSide'),
        'task3': ('TrialNum', 'NumDots', 'ShownDots', 'Delay', 'TimeOut', 'EarlyResponse',
                  'DotPressed', 'ReactionTime', 'TouchPosition', 'DistanceFromCenter', 'Rank'),
        'task4': ('Block', 'PercentCorrect', 'AvgDistanceFromCenter', 'AvgResponseTime'),
        'task5': ('Task', 'EndCondition', 'Duration', 'NumGoodTouches', 'NumBadTouches',
                  'NumRepeats', 'AvgTimePerTarget', 'StandardDeviation', 'AvgTimePerAction',
                  'AvgTargetsPerArea', 'AvgLocation', 'AvgFirstTen', 'AvgLastTen',
                  'AvgDistancePerTarget'),
        'task6': ("TrialNum", "NumBadTouches", "Score", "Score-incorrect only"),
    }

    # Names of the relevant data fields for practice trials, keyed by task.
    _PRACTICE_HEADERS = {
        'task1': ("TrialNum", "NumBadTouches", "Score"),
        'task2': ('TrialNum', 'TargetSide', 'TimeOut', 'ReactionTime', 'TouchPosition',
                  'DistanceFromCenter', 'PressedSide', 'GoalSide', 'Correct'),
        'task3': ('TrialNum', 'NumDots', 'ShownDots', 'Delay', 'TimeOut', 'EarlyResponse',
                  'DotPressed', 'ReactionTime', 'TouchPosition', 'DistanceFromCenter'),
        'task4': ('TrialNum', 'Correct', 'ResponseTime', 'TouchPosition', 'DistanceFromCenter'),
        'task5': ('EndCondition', 'Duration', 'NumGoodTouches', 'NumBadTouches',
                  'NumRepeats', 'AvgTimePerTarget', 'StandardDeviation', 'AvgTimePerAction',
                  'AvgTargetsPerArea', 'AvgLocation', 'AvgFirstTen', 'AvgLastTen',
                  'AvgDistancePerTarget'),
        'task6': ("TrialNum", "NumBadTouches", "Score"),
    }

    # Name of the function in summarize.py that handles each task.
    _SUMMARIZERS = {
        'task1': 'get1',
        'task2': 'get2',
        'task3': 'get3',
        'task4': 'get4',
        'task5': 'get5',
        'task6': 'get6',
    }

    def __init__(self, log_file):
        """
        Builds the object from a log file.

        :param log_file: a file object; its .name attribute supplies the
            file name, and the object itself is handed to the log reader
        """
        import os
        self.log_file = log_file
        self.filename = os.path.basename(log_file.name)
        self.parse_file_name(self.filename)
        self.set_task_headers()
        self.set_practice_headers()
        self.parse_file_data()
        self.summarize()
        del self.log_file  # Just because you can't pickle files

    def parse_file_name(self, file_name):
        """
        Given a log file name of a specific format
        for the YL data, will determine and assign values to attributes:
            self.task --> A string, one of 'task1', 'task2', ..., 'task6'
            self.sibling --> True/False
            self.group --> a two-digit number string
            self.ID --> a (hopefully) four-digit number string, though this is allowed to vary.
            self.key --> a unique identifier for the subject
            self.date --> the date when the file was generated
            self.time --> the time when the file was generated
            self.device --> a string identifying the device on which the task was administered

        A file of the defined format is:
            'PE211005_IIN028_task1_5-15-2013-16-13-32.csv'
        The PE is just a prefix (for Peru?). The two numbers following
        the PE (21 in this case) indicate membership to a certain group.
        The numbers after 'IIN' specify the device on which the tasks were
        completed. The number following 'task' indicates to which task
        the log file corresponds. The string of numbers at the end
        (5-15-2013-16-13-32) correspond to the date and time of completion
        (month-day-year-hour-minute-second). The two-letter prefix may or may
        not be followed by an 's', which indicates that that particular subject
        is a younger sibling of another subject with an identical ID number
        (other than the s).

        Example:
        The file name: 'PE211005_IIN028_task1_5-15-2013-16-13-32.csv' will yield the following result:
            self.task = 'task1'
            self.sibling = False (since there's no 's' after 'PE')
            self.group = '21'
            self.ID = '1005'
            self.key = 'PE211005'
            self.date = '5/15/2013'
            self.time = '16:13:32'
            self.device = 'IIN028'

        :param file_name: name (or path) of the log file; must end in '.csv'
        :return: None
        :raises BadFileNameError: if the extension is not '.csv', the task
            is not recognized, the subject string does not contain two
            letters followed by at least three digits, or the date/time
            part does not have six '-'-separated components
        :raises ValueError: if the name (without extension) does not consist
            of exactly four '_'-separated parts
        """
        import os
        import re
        import warnings

        name, extension = os.path.splitext(os.path.basename(file_name))
        # Only accept csv files.
        if extension != ".csv":
            raise e.BadFileNameError("File: %s does not have extension, '.csv'" % file_name)

        # Split the file name into components. If filename = 'PE211005_IIN028_task1_5-15-2013-16-13-32.csv',
        # name components should be = ['PE211005', 'IIN028', 'task1', '5-15-2013-16-13-32']
        subject, device, task, date_and_time = name.split("_")
        self.key = subject
        if task not in self._VALID_TASKS:
            raise e.BadFileNameError(
                "Invalid task designation in file: %s\nExpected one of 'task1', 'task2', ..., 'task6'\n" % file_name)
        self.task = task

        # Further split the subject string, which should be something like 'PE211005'.
        # The optional 's' gets its own group so that a two-letter prefix which
        # itself contains an 's' (e.g. 'ES') is not mistaken for a sibling marker.
        # Note that this assumes that the group number is always two digits, but
        # allows for the subject number to vary in length.
        sub_components = re.search(r'([a-zA-Z][a-zA-Z])(s?)(\d\d)(\d+)', subject, re.IGNORECASE)
        if not sub_components:
            raise e.BadFileNameError(file_name)
        _prefix, sibling_marker, group, subject_number = sub_components.groups()

        # Check if sibling (upper/lower case is ignored by the pattern)
        self.sibling = bool(sibling_marker)
        # Record group
        self.group = group  # two digits following PE/PEs
        # Record subject number
        self.ID = subject_number  # all digits following
        self.device = device
        # We expect 6 digits following PE (or PEs). If there is a different number,
        # warn the user, but don't raise an error.
        if len(self.ID) != 4:
            warnings.warn("File name of unexpected format:\n\t %s\n\tExpected a 2-digit group number and "
                          "4-digit ID number. \nUsing SubID = %s" % (file_name, self.ID))

        # Split the date and time
        split_date = date_and_time.split("-")
        if len(split_date) != 6:
            raise e.BadFileNameError("Problem reading date/time information from file: %s\n" % file_name)
        self.date = "/".join(split_date[:3])
        self.time = ":".join(split_date[3:])

    def set_task_headers(self):
        """
        Assigns a value to self.task_headers according to the value
        of self.task. Said value will be in the form of a list of
        strings (header names), which correspond to the data that we are interested in extracting
        from the log files. This function is for task trials, specifically, as the headers
        for practice trials differ slightly from those associated with actual task trials.

        If self.task is not a recognized task name, nothing is assigned.

        :return: None. Value of self.task_headers is assigned (as a list of strings)
        """
        headers = self._TASK_HEADERS.get(self.task)
        if headers is not None:
            # Fresh list per instance so instances never share a mutable value.
            self.task_headers = list(headers)

    def set_practice_headers(self):
        """
        Assigns a value to self.practice_headers according to the value
        of self.task. Said value will be in the form of a list of
        strings (header names), which correspond to the data that we are interested in extracting
        from the log files. This function is for practice trials, specifically, as the headers
        for practice trials differ slightly from those associated with actual task trials.

        If self.task is not a recognized task name, nothing is assigned.

        :return: None. Value of self.practice_headers is assigned (as a list of strings)
        """
        headers = self._PRACTICE_HEADERS.get(self.task)
        if headers is not None:
            # Fresh list per instance so instances never share a mutable value.
            self.practice_headers = list(headers)

    def parse_file_data(self):
        """
        Call the read_log_file function (from parser_functions.py) and store
        its two results in self.practice and self.trial_by_trial.

        :return: None
        """
        import parser_functions as sub
        self.practice, self.trial_by_trial = sub.read_log_file(self.task, self.log_file, self.task_headers,
                                                               self.practice_headers)

    def summarize(self):
        """
        Generates task-specific summary data according to the value of self.task.
        The actual computation of the summary data relies on a call to a
        task-specific function, imported from summarize.py

        If self.task is not a recognized task name, nothing is assigned.

        :return: None. Assigns a value to self.summary
        """
        import summarize
        function_name = self._SUMMARIZERS.get(self.task)
        if function_name is not None:
            self.summary = getattr(summarize, function_name)(self.trial_by_trial)