-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCSVtoSqlite.py
executable file
·109 lines (90 loc) · 3.88 KB
/
CSVtoSqlite.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
'''
Created on Feb 1, 2016
@author: timothyjohnson
Converts a large table with ~2.6 million rows and 70 variables to a SQLite
table using pandas.
Errors and issues of note:
For some reason python.exe shuts down after importing about 700k rows into
the SQLite database, perhaps because a 32-bit build of Python is being used.
As a workaround, this script processes just 1/4 of the table (700k rows);
it can be re-run later with the remaining quarters as inputs.
'''
import pandas as pd
import sqlite3
import time
import numpy as np
# Record the script start time so total runtime can be reported at exit.
start_time = time.time()
# Folder holding the input CSV files and the output SQLite database; stored
# in a one-element list and read as path[0] by the functions below. Point
# this at your own data folder. (NOTE: the inputs are CSVs, not an Excel
# file, despite the original comment.)
path = [r"F:\TimData\SQLiteTest" + "\\"]
def getColumnNames(headercsv):
    '''Return the column names of the headered CSV.

    Reads a single data row of the file located under the module-level
    ``path`` folder and returns the parsed header as an array of names.
    '''
    header_frame = pd.read_csv(path[0] + headercsv, nrows=1)
    return header_frame.columns.values
def defineDataTypes(columns):
    '''Map each column name to the numpy dtype used when parsing the CSV.

    Columns at positions 55, 56, 58, 59, 61, 62 and 64 hold text and are
    read as fixed-width 64-character strings; every other column is read
    as a 32-bit integer. Returns a dict suitable for read_csv's ``dtype``.
    '''
    string_positions = (55, 56, 58, 59, 61, 62, 64)
    return {
        name: (np.dtype('a64') if position in string_positions else np.int32)
        for position, name in enumerate(columns)
    }
def getNumberOfLinesCSV(noheaderCSV):
    '''Return the number of lines in the header-less CSV file.

    noheaderCSV -- file name (no header row) under the module-level path

    The original implementation opened the file without ever closing it;
    the with-block guarantees the handle is released even on error.
    '''
    with open(path[0] + noheaderCSV) as csvfile:
        # sum over the line iterator counts lines without loading the file
        return sum(1 for _ in csvfile)
def CSVtoSQLite(nlines, noheaderCSV, dtypedict, columns, sqldatabase, sqlouttable):
    '''Stream the header-less CSV into a SQLite table in 100k-row chunks.

    nlines      -- total number of data rows (kept for backward
                   compatibility; the chunked reader stops at EOF itself)
    noheaderCSV -- CSV file name (no header row) under the module-level path
    dtypedict   -- column-name -> numpy dtype mapping passed to read_csv
    columns     -- column names assigned to each chunk before insertion
    sqldatabase -- SQLite database file name under the module-level path
    sqlouttable -- destination table name (rows are appended)
    '''
    cnx = sqlite3.connect(path[0] + sqldatabase)
    cnx.text_factory = str
    try:
        # chunksize turns read_csv into an iterator that reads the next
        # 100k rows on each pass. The original skiprows=i approach re-read
        # and discarded every already-loaded row on each iteration, which
        # made the whole load O(n^2) in the file size.
        reader = pd.read_csv(path[0] + noheaderCSV,
                             dtype=dtypedict,    # per-column data types
                             header=None,        # file has no header row
                             low_memory=False,   # avoid mixed-type warnings
                             chunksize=100000)   # rows per chunk
        for chunk in reader:
            chunk.columns = columns
            # index=False: don't write the DataFrame index as a column
            # (the original also passed index_label, which is ignored
            # when index=False, so it is dropped here).
            chunk.to_sql(name=sqlouttable, con=cnx,
                         index=False, if_exists='append')
    finally:
        # always release the connection, even if a chunk fails to load
        cnx.close()
def main():
    '''Drive the CSV-to-SQLite pipeline.

    Derives the column names and dtypes from the headered CSV, counts the
    data rows in the header-less CSV, then loads it into the SQLite table.
    '''
    # Two copies of the CSV exist: one with a header row (used only to
    # discover the column names) and one without (the actual data load).
    # This could most likely be done more cleanly with a single file.
    headered_csv = 'FullMODISdata_700k_headers.csv'
    headerless_csv = 'FullMODISdata_700k_noheaders.csv'
    database_name = "MODISSQL_700k.sqlite"
    table_name = "MODIS_data_first700k"

    column_names = getColumnNames(headered_csv)
    column_dtypes = defineDataTypes(column_names)
    row_count = getNumberOfLinesCSV(headerless_csv)
    CSVtoSQLite(row_count, headerless_csv, column_dtypes, column_names,
                database_name, table_name)
if __name__ == "__main__":
    main()
    # The original used the Python-2-only print statement. A single
    # %-formatted argument prints identically under both the Python 2
    # print statement (parentheses are just grouping) and Python 3's
    # print function.
    print("%s seconds, finished" % (time.time() - start_time))