-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbz_regress_model.R
317 lines (234 loc) · 12.1 KB
/
bz_regress_model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
###################################################################################
# regression model for Bz
# This can be used to recreate all the figures and tables from the
# 2022 manuscript by Riley, Reiss, and Moestl.
# Written by Pete Riley (pete@predsci.com)
# This is a pre-release version suitable for the editors and reviewers of the
# manuscript. The final version will be cleaned and curated and made available
# via a GitHub repository.
###################################################################################
# this function takes an input vector and renormalises it so that it runs
# from 0 to 1.
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
###################################################################################
# MAIN PROGRAMME
# load libraries, some of them may not be needed for the main plots, but they
# are all available from the "Tools -> Install packages..." option
library(tidyverse)
library(caret)
library(randomForest)
library(varImp)
library(car)
library(RColorBrewer) # for the color palette
library(xgboost)
library(relaimpo)
library(earth)
library(DALEX)
library(MASS)
library(leaps)
library(stargazer)
# Set these options to print out summaries of the analysis to the screen
# and as a LaTeX-formatted output for the manuscript.
printSummary = T
printLatex = T
#####################################################################################
# get the parameter data
# During the initial analysis, I used data from the Reiss et al. 2021 paper, which had 348 observations
# (i.e., ICMEs).
# Chris had expanded the ICME database since that initial study, and provided an additional several
# hundred events.
# Additionally, as well as using Bz min as the output variable, Chris also provided a dataset
# with B-max, which had led to even higher associations in Martin's original paper, thus,
# the analysis can be run on one of three datasets:
dataset_A = "reiss21b-features-icme-sheath.csv" # original data with B-min target
dataset_B = "features_521_events_bz_target.csv" # revised data with B-min target
dataset_C = "features_521_events_btot_target.csv" # revised data with Btot target
# For the main results of the paper, the following line should be set to dataset_A
# However, to play with different options, which are summarized briefly in the
# paper, set this dataset to dataset_B.
# I didn't pursue the analysis for Btot, however, since we have the data
# it's included as dataset_C for completeness.
# Finally, as an option for the reader, a new dataset (say, D) could be
# created that is some optimum version of _B, perhaps focusing on those
# events that have the fasted ejecta. I would suspect that the
# relationships would be even stronger for these events. However, this is
# beyond the scope of this paper...but potentially useful if this
# analysis is going to be reformulated into a forecast tool.
dataset = read.csv(paste("/Users/pete/Dropbox/research/Bz_regress/",dataset_A,sep=""),header=T)
# although it probably doesn't make a difference for the ML techniques we're applying here,
# it's probably good practice to strip the ID variable from the dataframe.
# In principle, this could be a way that a particular algorithm could latch onto
# recogising a particular example.
dataset <- dataset[-1]
# print out table 1 to summarise the properties of the dataset
if (printSummary) {
summary(dataset)
}
if (printLatex) {
stargazer(dataset)
}
#####################################################################################
# PLOTTING OPTIONS
# 1 = scatterplot
whPlot = 1
#####################################################################################
# Since the features' min/max values can vary significantly from one to the next
# cf. speed (100's), field strength (1's), and temperature (1e5's), it might be
# important to renormalize the data either by doing a min/max renormalisation
# or a z-score standardization. It turns out that none of them make a qualitative difference
# in the results, and, in many cases, not a quantitative difference,
# at least to within two sig. fig. However, for the purposes
# of comparison, using either the min-max or z-score renormalization might
# provide more information to the reader in the tables, so we should choose one of them.
# Since we're not splitting the data into a training and evaluation dataset,
# there would be no issues with introducing a new observation that exceeds the
# min/max of the training set, so having a bounded range for all of the
# features would seem to be the most informative way to renormalise the
# data.
# However, if there are any outliers, then the min/max renormalization could
# artificially compress datasets with outliers. From table 1, there is evidence
# for some outliers, such as the max.vt value. Without going through each datapoint
# separately, it would be better to avoid this issue, so we'll settle on
# using the z-score renormalisation.
# First capture the 'raw', un-normalised data. Need this for making
# table 1, which is a summary of the explanatory parameters and their
# variations.
dataset_raw = dataset
# here is the renormalization based on min/max approach
dataset_n <- as.data.frame(lapply(dataset[1:(dim(dataset)[2])], normalize))
# here's the renormalization based on the z-score approach
dataset_z <- as.data.frame(scale(dataset[1:(dim(dataset)[2])]))
dataset <- dataset_z
#####################################################################################
# Build a linear model for Bz-min (or Bmax) using various techniques
# There are many techniques that can be applied from a simple regression
# to several complicated ML approaches.
# Specify here which approach to do. However, for all of them, an initial formula
# must be created.
# wh_tech can be set as follows:
# 1: simple multiple regression with some set of predefined explanatory (input) vars.
# 2: stepwise regression model using MASS package
# 3: Random Forest Method
# 4: xgboost method
# 5: MARS method
# 6: Step-wise Regression Method
# 7: DALEX method
wh_tech = 1
#####################################################################################
# STATISTICAL ANALYSIS
#####################################################################################
# define several formulae for using in the analysis:
# here's the formula for the ALL predictors:
formula1 = Target ~ .
# Here is the formula for the most significant variables from formula 1
formula2 = Target ~ mean.bt. + std.bt. + cv.bt. + mean.vt. + cv.vt. + mean.np. + max.np. + max.tp. + minmax.tp.
# This is a formula based on a subjective subset of variables after the initial round of analysis was complete.
# It can be adjusted to play with different combinations in an ad hoc way.
#formula3 = Target ~ std.bz. + std.bt. + min.bz. + + mean.bz. + std.by. + mean.np. + minmax.vt. + max.by. + mean.bt. + min.by.
# formula3 = Target ~ mean.bt. + std.bt. + cv.bt. + mean.np. + max.np. + max.tp.
# get rid of max.tp. for this analysis
formula3 = Target ~ mean.bt. + std.bt. + cv.bt. + mean.np. + max.np.
# set the main formula to be analysed to be one of these three options.
formula_anal = formula1 # this is the complete set of input variables.
# technically, the simplest model is the following, but it's also needed
# by the more sophisticated ML packages, so it's defined here.
# This needs to be tidied up for final release, but the two main
# regressors are 1 (which is the _anal regressor) and 3, which is the subset
# of the input variables that are most significant.
regressor1 = lm(formula = formula1, data = dataset)
regressor2 = lm(formula = formula2, data = dataset)
regressor3 = lm(formula = formula3, data = dataset)
regressor = lm(formula = formula_anal, data = dataset)
#####################################################################################
if (wh_tech == 1) { # Simplest technique - multiple regression
print("Full set of Features:")
if (printSummary) {
summary(regressor)
}
if (printLatex) {
stargazer(regressor)
}
print("Most significant five features from full ")
if (printSummary) {
summary(regressor3)
}
if (printLatex) {
stargazer(regressor3)
}
}
#####################################################################################
if (wh_tech == 2) { # stepwise regression model using MASS package
step.model <- stepAIC(regressor, direction = "both",trace = FALSE)
if (printSummary) {
summary(step.model)
}
if (printLatex) {
stargazer(step.model)
}
}
#####################################################################################
if (wh_tech == 3) { # Random Forest Method
# Note that the "Random" part of this technique means that the same results
# as in Table 5 will never be reproduced. However, the same trends should
# always be there, particularly for the most important variables.
regressor_RF <- randomForest(formula = formula_anal,data = dataset,importance=TRUE)
output = caret::varImp(regressor_RF, conditional = TRUE) # conditional=True, adjusts for correlations between predictors
output
}
#####################################################################################
if (wh_tech == 4) { # xgboost method
# as with the Random Forest, you get slightly different results
# each time you run this. Got to love statistics!
regressor_xg_1=train(formula1,data = dataset,method = "xgbTree", trControl = trainControl("cv", number = 10),scale=T)
#regressor_xg_2=train(formula2,data = dataset,method = "xgbTree", trControl = trainControl("cv", number = 10),scale=T)
output1 = caret::varImp(regressor_xg_1)
output1
#output2 = caret::varImp(regressor_xg_2)
#output2
}
#####################################################################################
if (wh_tech == 5) { # MARS
regressor_MARS <- earth(formula_anal, data=dataset) # build model
ev <- evimp (regressor_MARS) # estimate variable importance
plot (ev)
}
#####################################################################################
if (wh_tech == 6) { # Step-wise Regression Method
base.mod <- lm(Target ~ 1 , data = dataset) # base intercept only model
all.mod <- lm(formula_anal , data = dataset) # full model with all predictors
stepMod <- step(base.mod, scope = list(lower = base.mod, upper = all.mod), direction = "both", trace = 1, steps = 1000) # perform step-wise algorithm
#shortlistedVars <- names(unlist(stepMod[[1]])) # get the shortlisted variable.
#shortlistedVars <- shortlistedVars[!shortlistedVars %in% "(Intercept)"] # remove intercept
#shortlistedVars
}
#####################################################################################
if (wh_tech == 7) { # DALEX Method
regressor_DALEX <- randomForest(formula_anal,data = dataset, importance=TRUE) # fit the random forest with default parameter
explained_rf <- explain(regressor_DALEX, data = dataset, y=dataset$Target) # Variable importance with DALEX
varimps = variable_importance(explained_rf, type='raw') # Get the variable importances
print(varimps)
plot(varimps)
}
#####################################################################################
#####################################################################################
# PLOTS
#####################################################################################
#####################################################################################
if (whPlot == 1) {
# dev.new()
#formula = ~Target + min.np. + mean.np. + max.np. + mean.bt. + max.bt. + min.by.+ max.by.
formula3 = ~Target + mean.bt. + std.bt. + cv.bt. + mean.np. + max.np.
#formula = formula1
#formula = formula3
# note that the syntax for formula3 here is different than that
# required for the regressor analysis. This is, apparently a
# bug/feature, perhaps related to some legacy syntax.
scatterplotMatrix(formula3, data=dataset ,
#reg.line="" , smoother="", #col=my_colors ,
#smoother.args=list(col="grey") , cex=1.5 ,
#pch=c(15,16,17) ,
main="Scatterplot Matrix for significant explanatory variables")
}
#####################################################################################