-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSPSSread.py
709 lines (661 loc) · 24.9 KB
/
SPSSread.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
"""
SPSS.py
A Python module for importing SPSS files
(c) Alan James Salmoni
Released under the Affero General Public License
Notes: This only imports types 7 subtypes 3, 4, 5 and 6. Other subtypes are:
7: Multiple response set definitions
8: Data Entry for Windows (DEW) information
10: TextSmart information
11: Measurement level, column width and alignment for each variable - DONE!
13:
14:
17: text field defining variable attributes
20: Single string character encoding
21: Encodes value labels for long string variables
USAGE:
call SPSS.SPSSFile(args)
args are:
-all: to immediately import the file without waiting for commands to open
and read it
-pickle: to return the SPSS file pickled as a Python object (string format)
-help: to print this out
"""
import struct
import sys
import pickle
def pkint(vv):
    """
    An auxiliary function that returns an integer from a 4-byte word.

    vv is the 4-byte string read from the file. It is unpacked with the
    native "i" struct format, so the result is a 1-tuple: (value,).

    If vv cannot be unpacked (wrong length, e.g. a short read at end of
    file, or not a byte string at all) the bare integer 0 is returned
    instead of a tuple. Callers that index the result with [0] will
    therefore raise TypeError on such input.
    """
    try:
        return struct.unpack("i", vv)
    except (struct.error, TypeError):
        # Only unpacking failures are swallowed; anything else (for example
        # KeyboardInterrupt) now propagates instead of being hidden.
        return 0
def pkflt(vv):
    """
    An auxiliary function that returns a double-precision float from an
    8-byte word.

    vv is the 8-byte string read from the file. It is unpacked with the
    native "d" struct format, so the result is a 1-tuple: (value,).

    If vv cannot be unpacked (wrong length or not a byte string) the bare
    float 0.0 is returned instead of a tuple.
    """
    try:
        return struct.unpack("d", vv)
    except (struct.error, TypeError):
        # Only unpacking failures are swallowed; other exceptions propagate.
        return 0.0
def pkstr(vv):
    """
    An auxiliary function that returns a string from a word read from the
    file (8 bytes for names and data, longer for labels). The string is NOT
    packed in a tuple.

    The previous implementation unpacked every character with the "s"
    struct format and concatenated the results one by one, which simply
    reproduced str(vv) at quadratic cost. The value is now converted in a
    single step.
    """
    if isinstance(vv, bytes) and not isinstance(vv, str):
        # Python 3 byte string: latin-1 maps every byte to exactly one
        # character, so the text keeps the length of the raw field.
        return vv.decode("latin-1")
    # Python 2 (where bytes is str) and any other value.
    return str(vv)
class variable(object):
    """
    This class contains a variable and its attributes. Each variable within
    the SPSS file causes an instantiation of this class. The file object
    contains a list of these in self.variablelist.

    Every instance gets its own fresh lists, so data appended to one
    variable never shows up in another.
    """
    def __init__(self):
        self.name = None  # 8 char limit
        self.namelabel = None
        # Read by SPSSFile.GetLabels; defaulted here so a variable that has
        # not been filled in by the reader still has the attribute.
        self.label = ''
        self.data = []
        self.missingmarker = None
        self.missingd = []
        self.missingr = []
        # The reader stores the text "Numeric" or "String" here and keeps the
        # raw integer code from the file in typecode.
        self.type = None
        # Raw type code; the reader treats 0 as numeric and -1 as a string
        # continuation record. Read by SPSSFile.GetTypeCodes.
        self.typecode = None
        self.printformatcode = []
        self.writeformatcode = []
        self.labelvalues = []
        self.labelfields = []
class SPSSFile(object):
    """
    Reader for an SPSS system (.sav) file.

    Construct it with the file name, optionally followed by option flags:

        x = SPSSFile('survey.sav', '-all')

    With '-all' the file is opened and parsed immediately. Without it, call
    OpenFile() and then GetRecords(). Parsed variables end up in
    self.variablelist (one `variable` object each, data in its `data` list);
    file-level meta-data is stored as attributes of this object.

    The header fields read by GetRecordType1 (numcases, compressionswitch,
    ...) are kept exactly as pkint returns them, i.e. as 1-tuples.
    """

    # Print / write format code -> (abbreviation, meaning).
    _FORMAT_CODES = {
        0: ('', 'Continuation of string variable'),
        1: ('A', 'Alphanumeric'),
        2: ('AHEX', 'alphanumeric hexadecimal'),
        3: ('COMMA', 'F format with commas'),
        4: ('DOLLAR', 'Commas and floating point dollar sign'),
        5: ('F', 'F (default numeric) format'),
        6: ('IB', 'Integer binary'),
        7: ('PIBHEX', 'Positive binary integer - hexadecimal'),
        8: ('P', 'Packed decimal'),
        9: ('PIB', 'Positive integer binary (Unsigned)'),
        10: ('PK', 'Positive packed decimal (Unsigned)'),
        11: ('RB', 'Floating point binary'),
        12: ('RBHEX', 'Floating point binary - hexadecimal'),
        15: ('Z', 'Zoned decimal'),
        16: ('N', 'N format - unsigned with leading zeros'),
        17: ('E', 'E format - with explicit power of ten'),
        20: ('DATE', 'Date format dd-mmm-yyyy'),
        21: ('TIME', 'Time format hh:mm:ss.s'),
        22: ('DATETIME', 'Date and time'),
        23: ('ADATE', 'Date in mm/dd/yyyy form'),
        24: ('JDATE', 'Julian date - yyyyddd'),
        25: ('DTIME', 'Date-time dd hh:mm:ss.s'),
        26: ('WKDAY', 'Day of the week'),
        27: ('MONTH', 'Month'),
        28: ('MOYR', 'mmm yyyy'),
        29: ('QYR', 'q Q yyyy'),
        30: ('WKYR', 'ww WK yyyy'),
        31: ('PCT', 'Percent - F followed by "%"'),
        32: ('DOT', 'Like COMMA, switching dot for comma'),
        33: ('CCA-CCE', 'User-programmable currency format'),
        34: ('CCA-CCE', 'User-programmable currency format'),
        35: ('CCA-CCE', 'User-programmable currency format'),
        36: ('CCA-CCE', 'User-programmable currency format'),
        37: ('CCA-CCE', 'User-programmable currency format'),
        38: ('EDATE', 'Date in dd.mm.yyyy style'),
        39: ('SDATE', 'Date in yyyy/mm/dd style'),
    }

    # Names returned by GetDateVar, indexed by date-variable code.
    _DATE_TYPES = ["Cycle", "Year", "Quarter", "Month", "Week", "Day", "Hour",
                   "Minute", "Second", "Observation", "DATE_"]

    def __init__(self, *args):
        """
        args[0] is the file name; any further arguments are option flags
        ('-all' opens and reads the file straight away). A single list or
        tuple holding those values is accepted as well, so both
        SPSSFile(name, '-all') and SPSSFile([name, '-all']) work.
        """
        if len(args) == 1 and isinstance(args[0], (list, tuple)):
            args = tuple(args[0])
        self.filename = args[0] if args else None
        self.fin = None
        self.typecode = []
        self.labelmarker = []
        self.missingvals = []
        self.missingvalmins = []
        self.missingvalmaxs = []
        self.documents = ''
        self.variablelist = []
        self.rawvarlist = []
        self.numvars = 0
        self.variablesets = None
        self.datevars = []
        # short name -> long name, filled by GetType713
        self.longvarnames = {}
        # Used for system-missing data until a type 7/4 record replaces it;
        # the most negative double is the format's customary value.
        self.SYSMIS = -sys.float_info.max
        if '-all' in args:
            self.OpenFile()
            self.GetRecords()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _payload(self, datatype, numelements):
        """Read and return the data part of a type 7 record."""
        # A negative size would make read() swallow the rest of the file.
        return self.fin.read(max(0, datatype * numelements))

    @staticmethod
    def _decode(table, code, first):
        """
        Return the entry of table for code, where the first entry stands
        for the code `first`. Codes outside the table are returned as-is.
        """
        index = code - first
        if 0 <= index < len(table):
            return table[index]
        return code

    def _compressed(self):
        """Return True when the header says the data are compressed."""
        switch = self.compressionswitch
        if isinstance(switch, tuple):  # stored as returned by pkint
            switch = switch[0]
        return switch != 0

    def _case_count(self):
        """Return the number of cases from the header as a plain integer."""
        count = self.numcases
        if isinstance(count, tuple):  # stored as returned by pkint
            count = count[0]
        return count

    def _next_code(self):
        """Return the next compression bytecode, or None at end of file."""
        if not self.cluster:
            # bytearray yields integers on both Python 2 and 3
            self.cluster.extend(bytearray(self.fin.read(8)))
            if not self.cluster:
                return None
        return self.cluster.pop(0)

    def _repeat_last_raw_index(self):
        """Append a raw-index entry that points at the previous variable."""
        try:
            self.rawvarlist.append(self.rawvarlist[-1])
        except IndexError:
            self.rawvarlist.append(None)

    # ------------------------------------------------------------------
    # file handling and record dispatch
    # ------------------------------------------------------------------
    def OpenFile(self):
        """
        This method tries to open the SPSS file. On failure a message is
        printed and self.fin is left as None.
        """
        try:
            self.fin = open(self.filename, "rb")
        except (IOError, TypeError):
            # TypeError covers a missing (None) file name.
            print("Cannot open file")
            self.fin = None

    def GetRecords(self):
        """
        This method reads a 4-byte word, works out what record type it is,
        and then despatches to the correct method. This continues until the
        '999' code is reached (end of dictionary) upon which the data are
        read. The file is always closed afterwards and self.fin reset to
        None so that the object can be pickled.
        """
        if self.fin is None:
            print("Cannot read records: file is not open")
            return
        try:
            self.GetRecordType1()
            while True:
                word = self.fin.read(4)
                if len(word) < 4:
                    # ran out of file before the dictionary terminator
                    print("Unexpected end of file")
                    return
                rectype = pkint(word)[0]
                if rectype == 2:
                    self.GetRecordType2()
                elif rectype == 3:
                    self.GetRecordType3()
                elif rectype == 6:
                    # must be consumed, otherwise its text is misread as
                    # further record codes
                    self.GetRecordType6()
                elif rectype == 7:
                    self.GetRecordType7()
                elif rectype == 999:
                    self.fin.read(4)  # filler word after the terminator
                    self.GetData()
                    return
        finally:
            self.fin.close()
            self.fin = None  # need to remove file object for pickling

    def GetRecordType1(self):
        """
        This method reads in a type 1 record (file meta-data).
        """
        read = self.fin.read
        self.recordtype = read(4)
        self.eyecatcher = read(60)
        self.filelayoutcode = pkint(read(4))
        self.numOBSelements = pkint(read(4))
        self.compressionswitch = pkint(read(4))
        self.caseweightvar = pkint(read(4))
        self.numcases = pkint(read(4))
        self.compressionbias = read(8)  # kept as the raw 8 bytes
        self.metastr = read(84)

    def GetRecordType2(self):
        """
        This method reads in a type 2 record (variable meta-data).

        Real variables are appended to self.variablelist. self.rawvarlist
        gets one entry per type 2 record (continuation records included)
        holding the 1-based position in self.variablelist of the variable
        the record belongs to.
        """
        x = variable()
        x.typecode = pkint(self.fin.read(4))[0]
        x.type = "Numeric" if x.typecode == 0 else "String"
        if x.typecode == -1:
            # continuation of a long string: points at the same variable
            self._repeat_last_raw_index()
            self.fin.read(24)  # skip the rest of the record
            return
        x.labelmarker = pkint(self.fin.read(4))[0]
        x.missingmarker = pkint(self.fin.read(4))[0]
        fmt = bytearray(self.fin.read(4))
        x.decplaces = fmt[0]
        x.colwidth = fmt[1]
        x.formattype = self.GetPrintWriteCode(fmt[2])
        fmt = bytearray(self.fin.read(4))
        x.decplaces_wrt = fmt[0]
        x.colwidth_wrt = fmt[1]
        x.formattype_wrt = self.GetPrintWriteCode(fmt[2])
        x.name = pkstr(self.fin.read(8))
        nameblank = x.name.strip(' ') == ''
        if x.labelmarker == 1:
            x.labellength = pkint(self.fin.read(4))[0]
            # the label is stored padded to a multiple of 4 bytes
            padded = x.labellength + (-x.labellength % 4)
            x.label = pkstr(self.fin.read(padded))[:x.labellength]
        else:
            x.label = ''
        # The missing values follow exactly once in the file, abs(marker)
        # doubles; read them a single time and interpret them afterwards.
        missing = [pkflt(self.fin.read(8))[0]
                   for _ in range(abs(x.missingmarker))]
        if x.missingmarker == 0:
            # no missing values
            x.missingd = None
            x.missingr = (None, None)
        elif x.missingmarker in (-2, -3):
            # range of missing values, plus one discrete value for -3
            x.missingr = (missing[0], missing[1])
            x.missingd = missing[2] if x.missingmarker == -3 else None
        elif 0 < x.missingmarker < 4:
            # discrete missing values
            x.missingd = missing
            x.missingr = None
        if nameblank:
            # not stored, but the raw index map must stay aligned
            self._repeat_last_raw_index()
        else:
            self.variablelist.append(x)
            self.rawvarlist.append(len(self.variablelist))

    def GetRecordType3(self):
        """
        This method reads in a type 3 and a type 4 record. These always occur
        together. Type 3 is a value label record (value-field pairs for
        labels), and type 4 is the variable index record (which variables
        have these value-field pairs).
        """
        self.r3values = []
        self.r3labels = []
        count = pkint(self.fin.read(4))[0]
        values = []
        fields = []
        for _ in range(count):
            values.append(pkflt(self.fin.read(8))[0])
            length = bytearray(self.fin.read(1))[0]
            # length byte + label together are padded to a multiple of 8
            padded = (length + 1) + (-(length + 1) % 8)
            fields.append(pkstr(self.fin.read(padded - 1))[:length])
        # get record type 4
        t = pkint(self.fin.read(4))[0]
        if t != 4:
            print("Invalid subtype (%s)" % t)
            return
        numvars = pkint(self.fin.read(4))[0]
        labelinds = [pkint(self.fin.read(4))[0] for _ in range(numvars)]
        for rawind in labelinds:
            if not 1 <= rawind <= len(self.rawvarlist):
                continue  # index outside the dictionary read so far
            ind = self.rawvarlist[rawind - 1]
            if ind is None or not 1 <= ind <= len(self.variablelist):
                continue
            self.variablelist[ind - 1].labelvalues = values
            self.variablelist[ind - 1].labelfields = fields

    def GetRecordType6(self):
        """
        This method retrieves the document record (a line count followed by
        that many 80-character lines).
        """
        numlines = pkint(self.fin.read(4))[0]
        self.documents = pkstr(self.fin.read(80 * numlines))

    def GetRecordType7(self):
        """
        This method is called when a type 7 record is encountered. The
        subtype is then worked out and despatched to the proper method. Any
        subtypes that are not yet programmed are read in and skipped over, so
        not all subtype methods are yet functional.
        """
        subtype = pkint(self.fin.read(4))[0]
        handlers = {
            3: self.GetType73,
            4: self.GetType74,
            5: self.GetType75,
            6: self.GetType76,
            11: self.GetType711,
            13: self.GetType713,
        }
        handlers.get(subtype, self.GetType7other)()

    # ------------------------------------------------------------------
    # type 7 subtypes
    # ------------------------------------------------------------------
    def GetType73(self):
        """
        This method retrieves records of type 7, subtype 3. This is for
        release and machine specific integer information (eg, release
        number, floating-point representation, compression scheme code etc).

        FPrep, endiancode and charrepcode are stored as descriptive text
        when the code is one of the known ones, otherwise as the raw
        integer (for example a code-page number).
        """
        FPrep = ["IEEE", "IBM 370", "DEC VAX E"]
        endian = ["Big-endian", "Little-endian"]
        charrep = ["EBCDIC", "7-bit ASCII", "8-bit ASCII", "DEC Kanji"]
        datatype = pkint(self.fin.read(4))[0]
        numelements = pkint(self.fin.read(4))[0]
        if numelements != 8:
            print("Error reading type 7/3")
            # skip the data so the following records stay aligned
            self._payload(datatype, numelements)
            return
        self.releasenum = pkint(self.fin.read(4))[0]
        self.releasesubnum = pkint(self.fin.read(4))[0]
        self.releaseidnum = pkint(self.fin.read(4))[0]
        self.machinecode = pkint(self.fin.read(4))[0]
        self.FPrep = self._decode(FPrep, pkint(self.fin.read(4))[0], 1)
        self.compressionscheme = pkint(self.fin.read(4))[0]
        self.endiancode = self._decode(endian, pkint(self.fin.read(4))[0], 1)
        self.charrepcode = self._decode(charrep,
                                        pkint(self.fin.read(4))[0], 1)

    def GetType74(self):
        """
        This method retrieves records of type 7, subtype 4. This is for
        release and machine-specific OBS-type information (system missing
        value [self.SYSMIS], and highest and lowest missing values).
        """
        datatype = pkint(self.fin.read(4))[0]
        numelements = pkint(self.fin.read(4))[0]
        if (numelements == 3) and (datatype == 8):
            # read(), not readline(): the doubles are binary and may
            # contain a newline byte
            self.SYSMIS = pkflt(self.fin.read(8))[0]
            self.himissingval = pkflt(self.fin.read(8))[0]
            self.lomissingval = pkflt(self.fin.read(8))[0]
        else:
            print("Error reading type 7/4")
            # skip the data so the following records stay aligned
            self._payload(datatype, numelements)

    def GetType75(self):
        """
        This method stores the variable sets information as one string in
        self.variablesets. The content is not parsed yet.
        """
        datatype = pkint(self.fin.read(4))[0]
        numelements = pkint(self.fin.read(4))[0]
        # the record holds numelements items of datatype bytes each
        self.variablesets = pkstr(self._payload(datatype, numelements))

    def GetType76(self):
        """
        This method parses TRENDS data variable information. This is not
        fully functional yet: six header words are stored as attributes and
        one [word, date type name, word] entry per date variable is appended
        to self.datevars.
        """
        datatype = pkint(self.fin.read(4))[0]
        numelements = pkint(self.fin.read(4))[0]
        # Take the whole record in one go so that the stream stays aligned
        # even if the contents are not what is expected.
        payload = self._payload(datatype, numelements)
        words = [pkint(payload[pos:pos + 4])[0]
                 for pos in range(0, len(payload) - 3, 4)]
        if len(words) < 6:
            print("Error reading type 7/6")
            return
        (self.explicitperiodflag, self.period, self.numdatevars,
         self.lowestincr, self.higheststart, self.datevarsmarker) = words[:6]
        rest = words[6:]
        for num in range(self.numdatevars):
            recd = rest[3 * num:3 * num + 3]
            if len(recd) < 3:
                break  # record shorter than numdatevars promised
            self.datevars.append([recd[0], self.GetDateVar(recd[1]), recd[2]])

    def GetType711(self):
        """
        This method retrieves information about the measurement level, column
        width and alignment, and stores it on the variables in
        self.variablelist as `measure`, `displaywidth` and `align`.

        Unknown measure / alignment codes are stored as the raw integer.
        When the record holds exactly two values per variable the width is
        taken to be absent and `displaywidth` is not set.
        """
        measure = ["Nominal", "Ordinal", "Continuous"]
        align = ["Left", "Right", "Centre"]
        datatype = pkint(self.fin.read(4))[0]
        numelements = pkint(self.fin.read(4))[0]
        payload = self._payload(datatype, numelements)
        if datatype != 4:
            print("Error reading type 7/11")
            return
        words = [pkint(payload[pos:pos + 4])[0]
                 for pos in range(0, len(payload) - 3, 4)]
        nvars = len(self.variablelist)
        pervar = 2 if (nvars and len(words) == 2 * nvars) else 3
        for ind in range(min(len(words) // pervar, nvars)):
            entry = words[ind * pervar:(ind + 1) * pervar]
            var = self.variablelist[ind]
            var.measure = self._decode(measure, entry[0], 1)
            if pervar == 3:
                var.displaywidth = entry[1]
            var.align = self._decode(align, entry[-1], 0)

    def GetType713(self):
        """
        This method retrieves information about the long variable names
        record: tab separated SHORTNAME=LongName pairs. The pairs are stored
        in the dictionary self.longvarnames, and each variable whose
        (stripped) name is listed gets a `longname` attribute.
        """
        datatype = pkint(self.fin.read(4))[0]
        numelements = pkint(self.fin.read(4))[0]
        text = pkstr(self._payload(datatype, numelements))
        for pair in text.split('\t'):
            short, sep, longname = pair.partition('=')
            if sep:
                self.longvarnames[short] = longname
        for var in self.variablelist:
            short = var.name.strip()  # names are space padded to 8 chars
            if short in self.longvarnames:
                var.longname = self.longvarnames[short]

    def GetType7other(self):
        """
        This method is called when other subtypes not catered for are
        encountered. See the introduction to this module for more
        information about their contents. The raw data of the most recent
        such record are kept in self.Other7.
        """
        datatype = pkint(self.fin.read(4))[0]
        numelements = pkint(self.fin.read(4))[0]
        self.Other7 = self._payload(datatype, numelements)

    # ------------------------------------------------------------------
    # data
    # ------------------------------------------------------------------
    def GetData(self):
        """
        This method retrieves the actual data and stores them into the
        appropriate variable's 'data' attribute. If a number cannot be read
        an error is printed and the program exits.
        """
        self.cluster = []
        for case in range(self._case_count()):
            for i, var in enumerate(self.variablelist):
                if var.typecode == 0:  # numeric variable
                    N = self.GetNumber()
                    if N == "False":
                        print("Error returning case %s, var %s" % (case, i))
                        sys.exit(1)
                    var.data.append(N)
                elif 0 < var.typecode < 256:  # string variable
                    # GetString reports problems as SYSMIS, never as
                    # "False", and "False" is a legitimate string value, so
                    # no error check here.
                    var.data.append(self.GetString(var))

    def GetNumber(self):
        """
        This method is called when a number / numeric datum is to be
        retrieved. This method returns "False" (the string, not the Boolean
        because of conflicts when 0 is returned) if the operation is not
        possible (end of data).

        For compressed data the bytecodes are: 0 is padding and skipped,
        1-251 is the value code - 100, 252 is end of file, 253 means the
        value follows as an 8-byte double, 254 gives 0.0 and 255 is the
        system-missing value.
        """
        if not self._compressed():  # uncompressed number
            raw = self.fin.read(8)
            if len(raw) < 8:
                return "False"
            return pkflt(raw)[0]
        while True:  # compressed number
            code = self._next_code()
            if code is None or code == 252:
                return "False"
            if code == 0:
                continue
            if code == 253:
                raw = self.fin.read(8)
                if len(raw) < 8:
                    return "False"
                return pkflt(raw)[0]
            if code == 254:
                return 0.0
            if code == 255:
                return self.SYSMIS
            return code - 100

    def GetString(self, var):
        """
        This method is called when a string is to be retrieved. Strings can
        be longer than 8 bytes: var.typecode is the width and the value
        occupies as many 8-byte segments as are needed to hold it. The
        result is cut to the declared width. self.SYSMIS is returned when
        the string cannot be read.
        """
        width = var.typecode
        segments = max(1, (width + 7) // 8)
        if not self._compressed():
            raw = self.fin.read(8 * segments)
            if len(raw) < 8 * segments:
                return self.SYSMIS
            return pkstr(raw)[:width]
        pieces = []
        while len(pieces) < segments:
            code = self._next_code()
            if code is None or code == 252 or code == 255:
                return self.SYSMIS
            if code == 0:
                continue  # padding
            if code == 253:
                raw = self.fin.read(8)
                if len(raw) < 8:
                    return self.SYSMIS
                pieces.append(pkstr(raw))
            elif code == 254:
                pieces.append(' ' * 8)  # segment of eight blanks
            else:
                # numeric code where a string was expected; handed back the
                # way it always was
                return code - 100
        return ''.join(pieces)[:width]

    # ------------------------------------------------------------------
    # lookups and accessors
    # ------------------------------------------------------------------
    def GetPrintWriteCode(self, code):
        """
        This method returns the print / write format code of a variable. The
        returned value is a tuple consisting of the format abbreviation
        (string <= 8 chars) and a meaning (long string). Non-existent codes
        have a (None, None) tuple returned. If code is not a plain int,
        None is returned.
        """
        if type(code) is not int:
            return
        return self._FORMAT_CODES.get(code, (None, None))

    def GetDateVar(self, code):
        """
        This method returns the name of the date variable type with the
        given code (0 = "Cycle" ... 10 = "DATE_"), or None when the code is
        not a valid one.
        """
        try:
            if code < 0:
                return None  # would otherwise count from the end
            return self._DATE_TYPES[code]
        except (IndexError, TypeError):
            return None

    def GetNames(self):
        """
        This method retrieves all the names for all the variables. If the
        file has not already been read, an empty list is returned.
        """
        return [var.name for var in self.variablelist]

    def GetLabels(self):
        """
        This method returns the labels of all the variables
        """
        return [var.label for var in self.variablelist]

    def GetTypeCodes(self):
        """
        This method returns the typecodes of each variable.
        """
        return [var.typecode for var in self.variablelist]

    def GetRow(self, row):
        """
        This method returns a row of data: a list with the value of case
        number `row` (counting from 0) for every variable, or None if there
        is no such row.
        """
        if row < 0:
            return None
        total = self._case_count()
        if total >= 0 and row >= total:
            return None
        try:
            return [var.data[row] for var in self.variablelist]
        except IndexError:
            # fewer data were read than the header promised
            return None
if __name__ == '__main__':
    # Command line entry point: SPSSread.py file [-all] [-pickle] [-help]
    # Work on a copy so that sys.argv itself is left untouched.
    args = sys.argv[1:]
    if args and not args[0].startswith('-'):
        # The first argument is the file name, the others are option flags.
        # They have to be passed separately (not as one list), otherwise the
        # file name and the '-all' flag are not recognised.
        x = SPSSFile(*args)
        if "-pickle" in args:
            p = pickle.dumps(x)
            print(p)
    if "-help" in args or not args:
        print("SPSS file importer for Python")
        print("(c) 2008 Alan James Salmoni [salmoni at gmail]")
        print("Command line arguments:")
        print("SPSS.SPSSFile file args")
        print("file is valid file name of SPSS (.sav) file")
        print("Args:")
        print("-all: immediately open and import the file")
        print("-pickle: return the SPSS file as a pickled Python object (string)")
        print("-help: print this")
    # How to use this
    # (raw string: the backslashes of the Windows path are kept literally)
    f = r'C:\Documents and Settings\Authorized User\My Documents\Documents\AQ.sav'
#x = SPSSFile(f)
#x.OpenFile()
#x.GetRecords()
# then use commands like this to access data & metadata
# FILE META-DATA:
# x.eyecatcher # shows OS, machine, SPSS version etc
# x.numOBSelements # reported number of variables (use x.numvars instead)
# x.compressionswitch # 0 if not compressed
# x.metastr # creation date, time, file label.
# x.variablelist # list of variable objects contained within
# x.numvars # number of variables.
# x.documents # documentation record (if any)
# VARIABLE META-DATA:
# y = x.variablelist[0] # pick one variable object from the list
# y.data # the data
# y.name # 8-byte variable name
# y.label # longer string label
# y.decplaces # number of decimal places
# y.colwidth # column width
# y.formattype # print format code (the exact data type)
# y.labelvalues # values for substitute labels
# y.labelfields # fields for substitute labels
# y.missingd # list of discrete missing values
# y.missingr # upper and lower bounds of a range of missing values
# and many more.
# Check dir: lower case starts are attributes, others are methods
# EXTRAS:
# * GetNames method to return names from all variables
# * GetRow method to return data from particular row
# * GetLabels method to return labels from all variables
# * GetTypeCodes method to return variables' typecodes
# NEED TO ADD:
# * working methods for various type 7 subtypes (all meta-data, some documented, not all)
# * Any others?