Skip to content

Commit a25c7b9

Browse files
authored
Update README.md
1 parent 20e81eb commit a25c7b9

File tree

1 file changed

+65
-62
lines changed

1 file changed

+65
-62
lines changed

Diff for: README.md

+65-62
Original file line numberDiff line numberDiff line change
@@ -3,41 +3,74 @@
33
## How to install and apply DC-AIPW and DC-TMLE on a data set
44

55
```{r}
6-
credentials::set_github_pat() # it asks for token.
76
devtools::install_github("momenulhaque/Crossfit") # it will install the package
8-
library(Crossfit) # Now the package is ready to use
7+
library(Crossfit)
8+
```
9+
Now the package is ready to use. It supports applying both DC-AIPW and DC-TMLE in two cases:
10+
11+
### Case 1: Without parallelization
912

13+
1. Load the required R packages
1014

11-
################# Without parallelization ##################
15+
```{r}
1216
library(tidyverse)
1317
require(furrr)
1418
require(tibble)
1519
require(SuperLearner)
20+
```
21+
2. Define the learners that you want to use in the SuperLearner training
1622

17-
# I used only two learners
23+
```{r}
1824
#Logistic Regression
25+
SL.glm.DCDR <- function(...){
26+
SL.glm(...)
27+
}
28+
29+
#4 degree GAM
30+
SL.gam4.DCDR <- function(...){
31+
SL.gam(..., deg.gam=4)
32+
}
33+
34+
#6 degree GAM
35+
SL.gam6.DCDR <- function(...){
36+
SL.gam(..., deg.gam=6)
37+
}
38+
39+
#Neural Network
40+
SL.nnet.DCDR <- function(...){
41+
SL.nnet(..., size=4)
42+
}
43+
44+
#Random forest
45+
SL.randomForest.DCDR <- function(...){
46+
SL.randomForest(..., ntree=500, nodesize=20)
47+
}
1948
20-
SL.glm.DCDR <- function(...){
21-
SL.glm(...)
22-
}
49+
#Empirical mean
50+
SL.mean.DCDR <- function(...){
51+
SL.mean(...)
52+
}
53+
2354
24-
#4 degree GAM
25-
SL.gam4.DCDR <- function(...){
26-
SL.gam(..., deg.gam=4)
27-
}
55+
learners <- c("SL.glm.DCDR", "SL.gam4.DCDR", "SL.gam6.DCDR", "SL.nnet.DCDR", "SL.randomForest.DCDR", "SL.mean.DCDR")
56+
```
2857

29-
# data can be from Crossfit package
58+
3. Defining the data you want to use
59+
60+
```{r}
61+
# Read the data set that you want to use. An example data set "data" can be found in this package.
3062
df = data
63+
```
64+
3. Defining model parameters
3165

66+
```{r}
3267
exposure="statin"
3368
outcome="Y"
3469
35-
3670
covarsT <- c("age", "ldl_log", "risk_score") # covariate for exposure
3771
covarsO <- c("age", "ldl_log", "risk_score") # covariate for outcome
3872
39-
learners <- c("SL.glm.DCDR", "SL.gam4.DCDR")
40-
73+
# Here `V=5` indicates the number of cross-validation folds that are applied in the SuperLearner training.
4174
control <- SuperLearner.CV.control(V=5)
4275
4376
## Wrapper functions
@@ -53,14 +86,21 @@ tmle_sim <- function(df, num_cf = 3, n_split, seed){
5386
num_cf, n_split, rand_split = TRUE, seed)
5487
return(tmle_output)
5588
}
56-
57-
58-
89+
```
90+
3. Estimating the average causal effect
91+
Here, the parameter **num_cf** is the number of repetitions, **n_split** is the number of splits, and **seed** can be useful for comparing different methods.
92+
```{r}
5993
aipw_result_p3 = aipw_sim(df=data, num_cf = 3, n_split = 3, seed = 123)
6094
tmle_result_p3 = tmle_sim(df=data, num_cf = 3, n_split = 3, seed = 123)
95+
```
96+
97+
### Case 2: With parallelization
98+
Parallelization is very useful when a simulation study is repeated a large number of times. The steps are similar to Case 1, with some additional steps:
6199

100+
1. Run steps 1-3 described in Case 1 inside `parallel::clusterEvalQ()`, so that they are evaluated on the cluster.
62101

63102

103+
```{r}
64104
############### With parallelization ##############
65105
library(parallel)
66106
cl <- makeCluster(detectCores())
@@ -73,61 +113,24 @@ parallel::clusterEvalQ(cl, {
73113
require(tibble)
74114
require(SuperLearner)
75115
76-
#Logistic Regression
77-
78-
SL.glm.DCDR <- function(...){
79-
SL.glm(...)
80-
}
81-
82-
#4 degree GAM
83-
SL.gam4.DCDR <- function(...){
84-
SL.gam(..., deg.gam=4)
85-
}
86-
87-
df = data # data is from Crossfit package
88-
89-
exposure="statin"
90-
outcome="Y"
91-
92-
93-
covarsT <- c("age", "ldl_log", "risk_score") # covariate for exposure
94-
covarsO <- c("age", "ldl_log", "risk_score") # covariate for outcome
95-
96-
learners <- c("SL.glm.DCDR", "SL.gam4.DCDR")
97-
98-
control <- SuperLearner.CV.control(V=5)
99-
100-
101-
## Wrapper functions
102-
103-
aipw_sim <- function(df, num_cf = 3, n_split, seed){
104-
aipw_output <- aipw_multiple_p(df, exposure, outcome, covarsT, covarsO, learners, control,
105-
num_cf, n_split, rand_split = TRUE, seed)
106-
return(aipw_output)
107-
}
108-
109-
tmle_sim <- function(df, num_cf = 3, n_split, seed){
110-
tmle_output <- tmle_multiple_p(df, exposure, outcome, covarsT, covarsO, learners, control,
111-
num_cf, n_split, rand_split = TRUE, seed)
112-
return(tmle_output)
113-
}
114-
116+
# Run the code from Step 1
117+
# Run the code from Step 2
118+
# Run the code from Step 3
115119
116120
})
117121
122+
```
118123

119124

120-
# I just made a list of two same `data' to apply clusterMap function.
121-
125+
2. Use the following code to estimate the average causal effect in parallel:
126+
All the data sets should be stored in a list object. Here, a list of two data sets is built from the same example data `data` so that the `clusterMap` function can be applied.
122127

128+
```{r}
123129
aipw_result_p3 = clusterMap(cl, function(df, seed) aipw_sim(df, num_cf = 3, n_split = 3, seed),
124130
df=list(data, data) , seed=list(1:2))
125131
126-
127132
tmle_result_p3 = clusterMap(cl, function(df, seed) tmle_sim(df, num_cf = 3, n_split = 3, seed),
128133
df=list(data, data) , seed=list(1:2))
129134
130135
131-
132-
133136
```

0 commit comments

Comments
 (0)