-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCourseWork_Script_T2.R
179 lines (137 loc) · 4.8 KB
/
CourseWork_Script_T2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
### MTH1004 CW Report 2
## SETUP
load("MTH1004T2CW.RData")
library(tidyverse)
### REPORT 1
## PART A
head(Rainfall)
mu = mean(Rainfall$Excess)
sigma = sd(Rainfall$Excess)
sum(is.na(Rainfall$Excess)) #check for unusable rows
n = length(Rainfall$Excess)
ggplot(Rainfall) +
geom_point(aes(x = 1:145, y = Excess)) +
labs(x = "Day", y = "Excess Rainfall (mm)")
# considering exponential model via method of moments
lambda = 1/mu
# considering gamma model via method of moments
alpha = mu^2/sigma^2
beta = mu/sigma^2
# visualizing both distributions superimposed on sample data
ggplot(Rainfall) +
geom_histogram(aes(x = Excess, y = ..density..), bins = 40) +
stat_function(geom="line", fun=dexp, args = list(rate=lambda),
linewidth = 1) +
stat_function(geom="line", fun=dgamma, args = list(shape=alpha, rate=beta),
linewidth = 1, colour = "red") +
lims(y=c(0, 0.05)) +
labs(x="Excess Rainfall (mm)", y="Density")
# q-q plots to determine which of exponential and gamma models fit better
ggplot(Rainfall, aes(sample = Excess)) +
stat_qq(distribution = qexp, dparams = list(rate=lambda)) +
stat_qq(distribution = qgamma, dparams = list(shape=alpha, rate=beta),
colour="red") +
geom_abline(intercept=0, slope=1) +
labs(x="Model quantiles", y="Sample quantiles (mm)")
# standard error by simulation for gamma dist params as it is the better fit
a = numeric(10000)
b = numeric(10000)
for(i in 1:10000) {
y = rgamma(n, alpha, beta)
a[i] = mean(y)^2/sd(y)^2
b[i] = mean(y)/sd(y)^2
}
st_error_a = sd(a)
st_error_b = sd(b)
ggplot() +
geom_histogram(aes(x=a), bins=30) +
geom_vline(xintercept=alpha, colour="red") +
labs(x="Alpha", y="Count in Simulation")
ggplot() +
geom_histogram(aes(x=b), bins=30) +
geom_vline(xintercept=beta, colour="red") +
labs(x="Beta", y="Count in Simulation")
## PART B
# considering given model via method of moments
scale = (0.5*mu)*(1+((mu^2)/sigma^2))
shape = 0.5-((mu^2)/(2*(sigma^2)))
# visualizing distribution superimposed on sample data
ggplot(Rainfall) +
geom_histogram(aes(x = Excess, y = ..density..), bins = 40) +
stat_function(geom="line", fun=dmodel, args = list(scale=scale, shape=shape),
linewidth = 1, colour="blue") +
lims(y=c(0, 0.055)) +
labs(x="Excess Rainfall (mm)", y="Density")
# observing q-q plot with the given model compared with previous tries
ggplot(Rainfall, aes(sample=Excess)) +
stat_qq(distribution = qexp, dparams = list(rate=lambda)) +
stat_qq(distribution = qgamma, dparams = list(shape=alpha, rate=beta),
colour="red") +
stat_qq(distribution = qmodel, dparams = list(scale=scale, shape=shape),
colour="blue") +
geom_abline(intercept=0, slope=1) +
labs(x="Model quantiles", y="Sample quantiles (mm)")
# finding standard error by simulation for the given model
s = numeric(10000)
g = numeric(10000)
for(i in 1:10000) {
y = rmodel(n, scale=scale, shape=shape)
s[i] = (0.5*mean(y))*(1+((mean(y)^2)/sd(y)^2))
g[i] = 0.5-((mean(y)^2)/(2*(sd(y)^2)))
}
st_error_s = sd(s)
st_error_g = sd(g)
ggplot() +
geom_histogram(aes(x=s), bins=40) +
geom_vline(xintercept=scale, colour="red") +
labs(x="Scale", y="Count in Simulation")
ggplot() +
geom_histogram(aes(x=g), bins=40) +
geom_vline(xintercept=shape, colour="red") +
labs(x="Shape", y="Count in Simulation")
## PART C
# finding m-year return level x with the given model
m = 10
p = 0.01
prob = 1/(365*m*p)
x_minus_25 = qgamma(1-prob, shape=alpha, rate=beta)
x = x_minus_25 + 25
### REPORT 2
## PART A
head(Antibiotic)
sum(is.na(Antibiotic$Outcome)) #check for unusable rows
n = length(Antibiotic$Outcome)
ggplot(Antibiotic) +
geom_bar(aes(x=Outcome, y=..prop..)) +
labs(x = "Outcome", y = "Proportion")
# estimating parameter p for binomial dist via method of moments
p = sum(Antibiotic$Outcome==1)/n
# due to large n and large p, this dist has the features:
# (when approximating to normal)
muBin = n*p
sigmaBin = sqrt(n*p*(1-p))
# finding approx. 95% confidence interval for the mean by standardizing
# this binomial dist to normal
interval = muBin + qnorm(c(0.025, 0.975))*sigmaBin/sqrt(n)
# this constitutes the following proportions of the sample
proportion_as_percent = interval/n * 100
## PART B
# repeating the above analysis per hospital individually
hospAdata = Antibiotic %>%
filter(Hospital == "A") %>%
select(Outcome)
nA = length(hospAdata$Outcome)
pA = sum(hospAdata$Outcome==1)/nA
muBinA = nA*pA
sigmaBinA = nA*pA*(1-pA)
intervalA = muBinA + qnorm(c(0.025, 0.975))*sigmaBinA/sqrt(nA)
prop_as_percentA = intervalA/nA * 100
hostBdata = Antibiotic %>%
filter(Hospital == "B") %>%
select(Outcome)
nB = length(hostBdata$Outcome)
pB = sum(hostBdata$Outcome==1)/nB
muBinB = nB*pB
sigmaBinB = nB*pB*(1-pB)
intervalB = muBinB + qnorm(c(0.025, 0.975))*sigmaBinB/sqrt(nB)
prop_as_percentB = intervalB/nB * 100