-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1_SL_Afric_DataPreparationExploration.R
63 lines (47 loc) · 2.02 KB
/
1_SL_Afric_DataPreparationExploration.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#--------------------------------------------------------------
# Read and prepare the data
#--------------------------------------------------------------
library(tidyverse)
# Load the raw records from the CSV file in the working directory.
vicData <- read_csv(file = "VicRoadFatalData.csv")
str(vicData) # check the data

# Remove variables that are excluded from the prepared data set.
vicDataPre <- vicData %>%
  select(
    -DRIVER_ID,
    -VEHICLE_ID,
    -OWNER_POSTCODE, # It has too many levels
    -ACCIDENT_NO,
    -DAY_OF_WEEK,
    -fatal,
    -accident_cnt
  )

# Create new variables based on date and time.
# The lubridate accessors are namespace-qualified on purpose:
# library(tidyverse) only attaches lubridate from tidyverse 2.0.0 onwards, so
# bare hour()/month()/year() calls fail on older installations. Qualifying
# them also avoids any confusion with the new columns of the same names.
vicDataPre <- vicDataPre %>%
  mutate(
    hour = lubridate::hour(ACCIDENTTIME),
    # Dates are parsed as day/month/year before month and year are extracted.
    ACCIDENTDATE = as.Date(ACCIDENTDATE, format = "%d/%m/%Y"),
    month = lubridate::month(ACCIDENTDATE),
    year = lubridate::year(ACCIDENTDATE),
    # Use the factor versions for glm and the continuous versions for
    # tree-based methods.
    hour_fac = factor(hour),
    month_fac = factor(month),
    year_fac = factor(year)
  ) %>%
  # Rename the variable to remove the space, which causes problems for some
  # methods.
  rename(AGE_GROUP = `Age Group`) %>%
  # The raw time/date columns are no longer needed once the parts are derived.
  select(-ACCIDENTTIME, -ACCIDENTDATE)

# Convert character variables to factor (necessary for some methods).
# Assigning into vicDataPre[] replaces the columns in place, so the object
# keeps its class and column names; non-character columns pass through as-is.
vicDataPre[] <- lapply(vicDataPre, function(x) {
  if (is.character(x)) as.factor(x) else x
})
#--------------------------------------------------------------
# Split the data into training and testing
#--------------------------------------------------------------
n <- nrow(vicDataPre)
set.seed(123) # fixed seed so the same split is drawn on every run

# Do 80/20 split: sample 80% of the row indices without replacement.
# seq_len(n) is used rather than 1:n, which would yield c(1, 0) when n is 0.
indexTrain <- sample(seq_len(n), round(n * 0.8))

# Mark the sampled rows with a logical mask. Selecting the test rows with
# !isTrainRow avoids the negative-index pitfall where x[-integer(0), ]
# returns zero rows instead of all rows when the training index is empty.
isTrainRow <- seq_len(n) %in% indexTrain

vicDataTrain <- vicDataPre[indexTrain, ] # rows in sampled order
vicDataTest <- vicDataPre[!isTrainRow, ] # remaining rows in original order
#--------------------------------------------------------------
# Exploratory plots
#--------------------------------------------------------------
# Plot by hour
# Mean of fatal_cnt for every combination of hour_fac and SEX.
fat_hour <- vicDataPre %>%
  group_by(hour_fac, SEX) %>%
  summarise(rate = mean(fatal_cnt)) %>%
  # summarise() only drops the last grouping level (SEX); remove the leftover
  # hour_fac grouping so later verbs operate on the whole table.
  ungroup()

# One line per SEX level across the hours; rows with SEX equal to "U" are
# left out of the plot.
fat_hour_plot <- ggplot(fat_hour %>% filter(SEX != "U")) +
  geom_line(aes(x = hour_fac, y = rate, group = SEX, colour = SEX)) +
  labs(title = "Fatality rate by hour", x = "hour")

# Explicit print() so the plot is drawn even when the script is source()d.
print(fat_hour_plot)