-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDATA_CLEANING_PROJECT.sql
199 lines (135 loc) · 4.45 KB
/
DATA_CLEANING_PROJECT.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
/*
Cleaning Data Using SQL queries
*/
-- Loading the data
SELECT *
FROM NashvilleHouseData
-- 1. Standardize "SaleDate" Column
SELECT SaleDate, CONVERT(DATE, SaleDate)
FROM NashvilleHouseData
ALTER TABLE NashvilleHouseData
ADD SaleDateConverted DATE;
UPDATE NashvilleHouseData
SET SaleDateConverted = CONVERT(DATE, SaleDate)
-----------------------------------------------------------------------
-- 2.Populate the 'PropertyAddress' column
SELECT count(*) AS NullValues
FROM NashvilleHouseData
WHERE PropertyAddress IS NULL;
SELECT
a.ParcelID,
a.PropertyAddress,
b.ParcelID,
b.PropertyAddress,
ISNULL(a.PropertyAddress, b.PropertyAddress) AS Address
FROM NashvilleHouseData a
JOIN NashvilleHouseData b
ON a.ParcelID = b.ParcelID
AND a.[UniqueID ] <> b.[UniqueID ]
WHERE a.PropertyAddress IS NULL
UPDATE a
SET PropertyAddress = ISNULL(a.PropertyAddress, b.PropertyAddress)
FROM NashvilleHouseData a
JOIN NashvilleHouseData b
ON a.ParcelID = b.ParcelID
AND a.[UniqueID ] <> b.[UniqueID ]
WHERE a.PropertyAddress IS NULL
---------------------------------------------------------------------------------------
-- 3. Breaking the 'Address' into individual columns (address, city, state)
-- 3-a. 'PropertyAddress'
SELECT PropertyAddress
FROM NashvilleHouseData
SELECT
PropertyAddress,
SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) - 1) AS Address,
SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) + 1, LEN(PropertyAddress)) AS City
FROM NashvilleHouseData
ALTER TABLE NashvilleHouseData
ADD PropertySplitAddress nvarchar(255)
ALTER TABLE NashvilleHouseData
ADD PropertySplitCity nvarchar(255)
UPDATE NashvilleHouseData
SET PropertySplitAddress = SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) - 1)
UPDATE NashvilleHouseData
SET PropertySplitCity = SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) + 1, LEN(PropertyAddress))
-- 3-b. Owner Addres
SELECT OwnerAddress
FROM NashvilleHouseData
SELECT
PARSENAME(REPLACE(OwnerAddress, ',', '.'), 1) AS State,
PARSENAME(REPLACE(OwnerAddress, ',', '.'), 2) AS City,
PARSENAME(REPLACE(OwnerAddress, ',', '.'), 3) AS Address
FROM NashvilleHouseData
ALTER TABLE NashvilleHouseData
ADD OwnerSplitAddress nvarchar(255)
ALTER TABLE NashvilleHouseData
ADD OwnerSplitCity nvarchar(255)
ALTER TABLE NashvilleHouseData
ADD OwnerSplitState nvarchar(255)
UPDATE NashvilleHouseData
SET OwnerSplitAddress = PARSENAME(REPLACE(OwnerAddress, ',', '.'), 3)
UPDATE NashvilleHouseData
SET OwnerSplitCity = PARSENAME(REPLACE(OwnerAddress, ',', '.'), 2)
UPDATE NashvilleHouseData
SET OwnerSplitState = PARSENAME(REPLACE(OwnerAddress, ',', '.'), 1)
SELECT *
FROM NashvilleHouseData
---------------------------------------------------------------------------------------
-- 4. Change Y and N to Yes and No in 'SoldAsVacant' column
SELECT DISTINCT(SoldAsVacant)
FROM NashvilleHouseData
SELECT
DISTINCT(SoldAsVacant),
COUNT(SoldAsVacant) AS Value_Count
FROM NashvilleHouseData
GROUP BY SoldAsVacant
ORDER BY Value_Count DESC
UPDATE NashvilleHouseData
SET SoldAsVacant = CASE
WHEN SoldAsVacant = 'Y' THEN 'Yes'
WHEN SoldAsVacant = 'N' THEN 'No'
ELSE SoldAsVacant
END
SELECT
DISTINCT(SoldAsVacant),
COUNT(SoldAsVacant) AS Value_Count
FROM NashvilleHouseData
GROUP BY SoldAsVacant
ORDER BY Value_Count DESC
-------------------------------------------------------------------------------------
-- 5. Find duplicates in the data
WITH ROW_CTE AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY ParcelID,
PropertyAddress,
SaleDateConverted,
LegalReference,
SalePrice
ORDER BY UniqueID) AS row_num
FROM NashvilleHouseData
)SELECT *
FROM ROW_CTE
WHERE row_num > 1
/*
Deleting the duplicates if needed
(since it is not a good practice to delete information from the raw date loaded into the database)
*/
WITH ROW_CTE AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY ParcelID,
PropertyAddress,
SaleDateConverted,
LegalReference,
SalePrice
ORDER BY UniqueID) AS row_num
FROM NashvilleHouseData
)DELETE
FROM ROW_CTE
WHERE row_num > 1
-- 5. Removing unnecessary columns
ALTER TABLE NashvilleHouseData
DROP COLUMN SaleDate,
PropertyAddress,
OwnerAddress
SELECT *
FROM NashvilleHouseData