forked from drrachellowe/hydromet_dengue
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path00_load_packages_data.R
211 lines (165 loc) · 8.02 KB
/
00_load_packages_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# Combined effects of hydrometeorological hazards and urbanisation on dengue risk in Brazil: a spatiotemporal modelling study
# Rachel Lowe (2021)
# https://github.com/drrachellowe/hydromet_dengue

# R script to prepare data and lagged variables for INLA-DLNM modelling

# install INLA (not on CRAN; installed from the R-INLA repository)
# install.packages("INLA", repos=c(getOption("repos"), INLA="https://inla.r-inla-download.org/R/testing"), dep=TRUE)
# load INLA
library(INLA)

# other required packages: data handling (data.table, tidyverse), spatial
# objects and neighbourhoods (sf, sp, spdep), distributed lag nonlinear
# models (dlnm, tsModel), model evaluation (hydroGOF) and plotting
# (RColorBrewer, geofacet, ggpubr, ggthemes)
packages <- c("data.table", "tidyverse", "sf", "sp", "spdep",
"dlnm", "tsModel", "hydroGOF","RColorBrewer",
"geofacet", "ggpubr", "ggthemes")

# to install any missing packages, run:
# lapply(packages, install.packages)

# attach all packages
lapply(packages, library, character.only = TRUE)
# load shape file for Brazil (one polygon per microregion)
map <- read_sf("data/shape_brazil.shp")
# dim(map)

# Create adjacency matrix: contiguity neighbours between the polygons
# (poly2nb defaults apply — presumably queen contiguity; confirm if the
# neighbourhood definition matters for your analysis)
nb.map <- poly2nb(as_Spatial(map$geometry))
# export the neighbourhood structure as an INLA graph file for the spatial
# random effects, but only if it has not been written already
g.file <- "output/map.graph"
if (!file.exists(g.file)) nb2INLA(g.file, nb.map)
# load pre-defined grid of Brazilian states for geofacet plots
# note: the pre-loaded grid = "br_states_grid1" in the geofacet package
# could be used instead, but state names would need to be matched first
grid <- read.csv("data/br_states_grid.csv")
# head(grid)

# load dengue and climate data
# note: dengue data are available from Jan 2001 onwards; climate data
# include year 2000 so that lagged values exist for the start of 2001
# fix: use TRUE rather than the reassignable shorthand T
data <- fread("data/data_2000_2019.csv", header = TRUE)
# head(data)

# quick sanity summaries of the climate variables
summary(data$tmin)
summary(data$tmax)
summary(data$pdsi)
# Note: Palmer Drought Severity Index is not available for the island
# microregion Fernando de Noronha (code 26019), so pdsi contains NAs there
# Create lagged variables --------------------------------------------------
# define matrices of lagged terms (lags 0..nlag months) for each monthly
# mean climate variable, lagging within each microregion (group)

# set maximum lag in months (fix: use <- for assignment, not =)
nlag <- 6

# Minimum temperature (Tmin)
lag_tmin <- tsModel::Lag(data$tmin, group = data$micro_code, k = 0:nlag)
# Maximum temperature (Tmax)
lag_tmax <- tsModel::Lag(data$tmax, group = data$micro_code, k = 0:nlag)
# Palmer drought severity index (PDSI)
lag_pdsi <- tsModel::Lag(data$pdsi, group = data$micro_code, k = 0:nlag)

# drop year 2000 from the lagged matrices: that year only exists to provide
# lag history for the first months of 2001
lag_tmin <- lag_tmin[data$year > 2000, ]
lag_tmax <- lag_tmax[data$year > 2000, ]
lag_pdsi <- lag_pdsi[data$year > 2000, ]
# drop year 2000 from the dengue data frame
# (must come AFTER the matrix subsetting above, which indexes the full data)
data <- data[data$year > 2000, ]
# Define data dimensions ---------------------------------------------------
# re-index time so that Jan 2001 = 1 (the original index started in 2000,
# so shifting by 12 months removes the dropped year)
data$time <- data$time - 12

# counts of distinct months, years, microregions and states
ntime <- uniqueN(data$time)
nyear <- uniqueN(data$year)
nmicro <- uniqueN(data$micro_code)
nstate <- uniqueN(data$state_code)
# Cross-basis matrices (combining nonlinear exposure and lag functions) ----
# set knots for the PDSI lag dimension: 2 equally spaced knots over 0..nlag
# (fix: use <- for assignment, not =)
lagknot <- equalknots(0:nlag, 2)

# Tmin: natural cubic spline in the exposure dimension (2 equally spaced
# knots over observed Tmin) and in the lag dimension (one knot at nlag/2)
basis_tmin <- crossbasis(lag_tmin,
                         argvar = list(fun = "ns", knots = equalknots(data$tmin, 2)),
                         arglag = list(fun = "ns", knots = nlag/2))

# Tmax: same spline structure as Tmin
basis_tmax <- crossbasis(lag_tmax,
                         argvar = list(fun = "ns", knots = equalknots(data$tmax, 2)),
                         arglag = list(fun = "ns", knots = lagknot))

# PDSI: 2 exposure knots and the 2-knot lag spline (lagknot)
# NOTE(review): PDSI uses lagknot while the temperature bases use a single
# knot at nlag/2 — kept as in the original analysis; confirm this asymmetry
# is intentional
basis_pdsi <- crossbasis(lag_pdsi,
                         argvar = list(fun = "ns", knots = equalknots(data$pdsi, 2)),
                         arglag = list(fun = "ns", knots = lagknot))
# Linear interaction terms: urbanisation ------------------------------------
# test a linear interaction between the PDSI cross-basis and the share of
# residents living in urban areas, centred at the 75th, 50th and 25th
# percentiles (from highly urbanised down to more rural)
summary(data$urban)
# centring sets each indicator to zero at the level of interest, so the
# re-parameterised model can be used to extract predictions at that level
urban_q <- quantile(data$urban, p = c(0.75, 0.5, 0.25))
urban_ind1 <- data$urban - urban_q[[1]] # highly urbanised (75th percentile)
urban_ind2 <- data$urban - urban_q[[2]] # intermediate (50th percentile)
urban_ind3 <- data$urban - urban_q[[3]] # more rural (25th percentile)

# Linear interaction terms: water shortages ---------------------------------
# same construction for the frequency of water shortages (2000-2016),
# centred from high to low shortage frequency
summary(data$water_shortage)
water_q <- quantile(data$water_shortage, p = c(0.75, 0.5, 0.25))
water_ind1 <- data$water_shortage - water_q[[1]] # high frequency shortages
water_ind2 <- data$water_shortage - water_q[[2]] # intermediate
water_ind3 <- data$water_shortage - water_q[[3]] # low frequency shortages
# Interaction cross-bases ----------------------------------------------------
# Multiply each PDSI cross-basis column by the linear interaction terms,
# exploiting the column-by-column product (see Gasparrini et al. EHP 2015)

# PDSI cross-basis x urbanisation indicators
urban_basis1_pdsi <- basis_pdsi*urban_ind1
urban_basis2_pdsi <- basis_pdsi*urban_ind2
urban_basis3_pdsi <- basis_pdsi*urban_ind3
# PDSI cross-basis x water shortage indicators
water_basis1_pdsi <- basis_pdsi*water_ind1
water_basis2_pdsi <- basis_pdsi*water_ind2
water_basis3_pdsi <- basis_pdsi*water_ind3

# assign unique column names to each cross-basis matrix for inla()
# (not necessary for glm(), gam() or glm.nb() models)
# note: renaming must happen AFTER the multiplications above, which carry
# over the original basis_pdsi column names
# fix: use <- for assignment, not =
colnames(basis_tmin) <- paste0("basis_tmin.", colnames(basis_tmin))
colnames(basis_tmax) <- paste0("basis_tmax.", colnames(basis_tmax))
colnames(basis_pdsi) <- paste0("basis_pdsi.", colnames(basis_pdsi))
colnames(urban_basis1_pdsi) <- paste0("urban_basis1_pdsi.", colnames(urban_basis1_pdsi))
colnames(urban_basis2_pdsi) <- paste0("urban_basis2_pdsi.", colnames(urban_basis2_pdsi))
colnames(urban_basis3_pdsi) <- paste0("urban_basis3_pdsi.", colnames(urban_basis3_pdsi))
colnames(water_basis1_pdsi) <- paste0("water_basis1_pdsi.", colnames(water_basis1_pdsi))
colnames(water_basis2_pdsi) <- paste0("water_basis2_pdsi.", colnames(water_basis2_pdsi))
colnames(water_basis3_pdsi) <- paste0("water_basis3_pdsi.", colnames(water_basis3_pdsi))
# Indices for the INLA models ------------------------------------------------
# note: INLA indices must start at 1, with maximum equal to the number of
# unique values

# microregion index
# assumes rows are ordered so that the full set of microregions repeats for
# each month — TODO confirm against the data layout
data$micro_index <- rep(seq_len(nmicro), ntime)

# state index: position of each row's state within the vector of unique
# state codes (vectorised equivalent of the original per-state loop —
# match() returns the index of the first occurrence, exactly as the loop did)
state_codes <- unique(data$state_code)
data$state_index <- match(data$state_code, state_codes)

# year index: map the first year (2001) to 1
data$year_index <- data$year - 2000
# Data and priors for the INLA models ---------------------------------------
# response variable: monthly dengue case counts
Y <- data$dengue_cases
# total number of data points
N <- length(Y)
# model offset: population in units of 100,000, so the fitted rate is an
# incidence per 100,000 people
E <- data$population/1e5
# random-effect indices
T1 <- data$month # annual cycle (seasonality)
T2 <- data$year_index # inter-annual variability
S1 <- data$micro_index # microregion spatial random effect
S2 <- data$state_index # state interaction with the month random effect
# covariates accompanying the linear interaction terms
Vu <- data$urban # % population living in urban areas
Vw <- data$water_shortage # frequency of water shortages
# assemble the data frame used for model testing
df <- data.frame(Y = Y, E = E, T1 = T1, T2 = T2,
                 S1 = S1, S2 = S2, Vu = Vu, Vw = Vw)
# penalised-complexity prior for random-effect precisions
# (pc.prec with param c(0.5, 0.01): P(sd > 0.5) = 0.01)
precision.prior <- list(prec = list(prior = "pc.prec", param = c(0.5, 0.01)))
# inla wrapper ---------------------------------------------------------------
# Fit a model with inla() using shared control settings.
# Defaults allow swapping the probability family (e.g. "poisson") and
# enabling posterior sampling (config = TRUE) without repeating arguments.
# Returns the re-run inla model object.
mymodel <- function(formula, data = df, family = "nbinomial", config = FALSE) {
  fit <- inla(formula = formula, data = data, family = family, offset = log(E),
              # adaptive approximation strategy
              control.inla = list(strategy = 'adaptive'),
              # DIC and CPO for model comparison; marginals skipped to save memory
              control.compute = list(dic = TRUE, config = config,
                                     cpo = TRUE, return.marginals = FALSE),
              control.fixed = list(correlation.matrix = TRUE,
                                   prec.intercept = 1, prec = 1),
              control.predictor = list(link = 1, compute = TRUE),
              verbose = FALSE)
  # re-run from the previous mode to improve numerical stability of the fit
  inla.rerun(fit)
}