-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSMSCG_Phytoplankton_AbundanceMetricsComparisons.R
290 lines (239 loc) · 9.19 KB
/
SMSCG_Phytoplankton_AbundanceMetricsComparisons.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#SMSCG
#phytoplantkon
#comparisons among data types:
#Organisms/mL
#Biovolume (microns^3)/mL
#Carbon/mL
#Extracted Chlor-a
#Sonde Fluorescence
#perhaps calculate and compare means by region and week of the year
#this would be a rough first step
#sample sizes underlying the means will likely be quite different
#and phytoplankton can probably vary a lot among days within a week
#load packages
library(tidyverse) #suite of packages for data science
library(readxl) #reading in data from excel spreadsheets
library(ggplot2) #making plots
library(lubridate) #formatting dates
# 1. Read in the Data----------------------------------------------
# Dataset is on SharePoint site for the SMSCG action
# Define path on SharePoint site for data
sharepoint_path <- normalizePath(
file.path(
Sys.getenv("USERPROFILE"),
"California Department of Water Resources/SMSCG - Summer Action - Data"
)
)
#read in phytoplankton community composition summary data
#includes organisms/mL and biovolume/mL
phyto_com<-read_csv(file = paste0(sharepoint_path,"./Phytoplankton/SMSCG_phytoplankton_sample_summary_2020.csv"))
#looks like columns are formatted correctly, even the date and time
#read in the sonde data, including fluorescence used to estimate primary productivity
sonde<-read_csv(file = paste0(sharepoint_path,"./2020 data for USBR/des_data.csv"))
#columns mostly formatted correctly but need to format the date-time column
#read in extracted chlorophyll-a concentrations
chlora<-read_excel(path = paste0(sharepoint_path,"./2020 data for USBR/DWR SuisunMarsh Chl All Stations WY2020.xlsx"))
#glimpse(chlora)
#looks like columns are formatted correctly even the date column; oddly there's no time data
#lots of columns I won't need, so should pare down the data set
#format the extracted chlorophyll-a data set-----------------
#look at list of station names
#unique(chlora$QST_NAME)
#unique(chlora$QST_FULL_STATION_NAME)
#looks like they are probably all unique names
#but many are long, so maybe keep the station number too
#look at station number
#unique(chlora$QST_NUMBER)
#same number of stations as with station name
#so confirms that the station names are all truly unique
#look at reporting limit
#unique(chlora$RES_REPORTING_LIMIT)
#0.5 1.0 2.5
#there are three values so keep this column
#look at sample depth
#unique(chlora$SAM_DEPTH)
#all collected 1 m from surface
#so don't need to keep this column
#look at sample type column
#unique(chlora$SPP_DESCRIPTION)
#"Normal Sample" "Duplicate Sample"
#makes sense
#look at sample description
#unique(chlora$SAM_DESCRIPTION)
#"Chlorophyll A Grab Sample"
#all the same so don't need this column
#look at units column
#unique(chlora$UNS_NAME)
#"ug/L"
#all the same so can drop this column
#reduce the data set to just the needed columns
#QST_NAME
#QST_NUMBER
#SAM_COLLECTION_DATE
#ANL_ANALYTE_NAME
#RES_RESULT
#RES_REPORTING_LIMIT
#UNS_NAME
#SPP_DESCRIPTION
chlora_sub <- chlora %>%
#simplify column names
rename(station_name = QST_NAME
,station_code = QST_NUMBER
,date = SAM_COLLECTION_DATE
,analyte = ANL_ANALYTE_NAME
,result = RES_RESULT
,report_limit = RES_REPORTING_LIMIT
,sample_type = SPP_DESCRIPTION) %>%
#subset and reorder columns to just those needed
select(station_name
,station_code
,date
,analyte
,result
,report_limit
,sample_type)
#glimpse(chlora_sub)
#are there values below the reporting limit in this data set?
#chlora_sub$diff<-chlora_sub$result-chlora_sub$report_limit
#replim<-chlora_sub %>%
# filter(diff<0)
#there is one case where result is zero
#so it looks like values below the reporting limit are very rare
#and those values are included as zeros
#create data frame that pairs stations with regions
#RV = Sacramento River, ME = East Suisun Marsh, MW = West Suisun Marsh, GZ = Grizzly Bay
#not 100% sure about region assignments
#was pretty difficult to match names in this data set to those in the study plan (Table 3)
stations_rand<-unique(chlora_sub[,c("station_name","station_code")])
stations<-stations_rand[order(stations_rand$station_code),]
stations$region<-c(rep("GZ",2), "RV","MW",rep("ME",2), rep("MW",3),"ME" )
#could get GPS coordinates from CDEC
#use join to add region names to main extracted chlorophyll data set
chlora_reg<-left_join(chlora_sub,stations) %>%
#remove two high outliers for MW (over 150 mg/L)
#go back and look into these later
filter(result < 150)
#glimpse(chlora_reg)
#boxplot of chlorophyll by region
chlora_reg$region <- factor(chlora_reg$region, levels=c('GZ','MW','ME','RV'))
(plot_rg_chlora_bx<-ggplot(data=chlora_reg, aes(x = region, y = result)) +
geom_boxplot()+
labs(x="Region", y="Chlorophyll-a (mg/L)")
)
#ggsave(file = paste0(sharepoint_path,"./Plots/ChlorophyllByRegion_Boxplot.png"),type ="cairo-png",width=8, height=5,units="in",dpi=300)
#add a week of the year column based on date
chlora_reg$week <- week(chlora_reg$date)
#summarize by region and week
chlora_samp_mean<-chlora_reg %>%
group_by(region,week,analyte) %>%
summarize(samp_mean = mean(result))
#note that both normal and duplicate samples are included in these averages
#break chlorophyll and Pheophytin into separate columns
lab_pigment<-chlora_samp_mean %>%
pivot_wider(names_from = analyte, values_from = samp_mean)
#glimpse(lab_pigment)
#format the sonde data------------------------
#look at analyte column
#unique(sonde$analyte_name)
#four analytes: "Specific Conductance", "Temperature", "Turbidity", "Fluorescence"
#format the sonde data
fluor<-sonde %>%
#filter to just the fluorescence data
filter(analyte_name == "Fluorescence") %>%
#format the date-time column
mutate(date_time = mdy_hm(time))
#look at station column
#unique(fluor$cdec_code)
#six stations: "NSL" "BDL" "HUN" "RVB" "MAL" "GZL"
#look at unit name
#unique(fluor$unit_name)
#"FU"
#look at qaqc flags
#unique(fluor$qaqc_flag_id)
#"Approved" "Provisional"
#keep this column for now because some data are provisional
#how many provisional values for each station?
prov_count<-fluor %>%
#filter(qaqc_flag_id == "Provisional") %>%
group_by(cdec_code,qaqc_flag_id) %>%
summarize(count = n())
#lots of provisional data across all stations
#all data for GZL and RVB are provisional
#create data frame that matches CDEC names and regions
cdec<-data.frame(
cdec_code = c("NSL", "BDL", "HUN", "RVB", "MAL", "GZL"),
region = c("ME", "ME","MW","RV", "RV","GZ")
)
#glimpse(cdec)
#use join to add region names to main fluorescence data set
fluor_reg<-left_join(fluor,cdec)
glimpse(fluor_reg)
#add a column for week of the year by extracting it from date
fluor_reg <- fluor_reg %>%
mutate(week = week(date_time))
fluor_week_sum <- fluor_reg %>%
#calculate mean fluorescence by week and region
group_by(region, week) %>%
summarize(fluorescence = mean(value,na.rm=T))
#need the na.rm=T because some of BDL data missing
#look closer at BDL during weeks 27-31
#bdl <- fluor_reg %>%
# filter(cdec_code == "BDL" & week > 26 & week < 32)
#all these data are NAs; perhaps the probe malfunctioned
#but it could be some sort of data download error or something too
#boxplot of fluorescence by region
fluor_reg$region <- factor(fluor_reg$region, levels=c('GZ','MW','ME','RV'))
(plot_rg_fluor_bx<-ggplot(data=fluor_reg, aes(x = region, y = value)) +
geom_boxplot()+
labs(x="Region", y="Fluorescence")
)
#ggsave(file = paste0(sharepoint_path,"./Plots/FluorescenceByRegion_Boxplot.png"),type ="cairo-png",width=8, height=5,units="in",dpi=300)
#summarize the phytoplankton community data by region and week---------
phyto_com_sum <- phyto_com %>%
mutate(week = week(date)) %>%
group_by(region, week) %>%
summarize(tot_den = mean(tot_den),
tot_bvol = mean(tot_bvol)
)
#combine the three data set and plot correlations by region-------
#create list of data sets to combine
data_list <- list(phyto_com_sum, lab_pigment, fluor_week_sum)
#join data sets by region and week
all_data <- data_list %>%
reduce(full_join, by = c("region","week")) %>%
#rename the lab pigment columns
rename(chlorophyll = "Chlorophyll a",
pheophytin = "Pheophytin a"
)
#plot phytoplankton density vs phytoplankton biovolume
(plot_den_bvol <-ggplot(all_data, aes(x=tot_den, y=tot_bvol))+
geom_point() +
facet_wrap(~region, nrow=2)
)
#overall, seems like there is some relationship
#hard to make out because of outlier in RV (high biovolume value relative to density)
#no data for GZ, which makes sense for these data
#plot chlorophyll vs pheophytin
(plot_chl_pheo <-ggplot(all_data, aes(x=chlorophyll, y=pheophytin))+
geom_point() +
facet_wrap(~region, nrow=2)
)
#possibly a relationship but hard to tell because of outlier in MW
#plot chlorophyll vs fluorescence
(plot_chl_fluor <-ggplot(all_data, aes(x=chlorophyll, y=fluorescence))+
geom_point() +
facet_wrap(~region, nrow=2)
)
#not very compelling but look at MW outlier
#plot chlorophyll vs total biovolume
(plot_chl_bvol <-ggplot(all_data, aes(x=chlorophyll, y=tot_bvol))+
geom_point() +
facet_wrap(~region, nrow=2)
)
#again, the MW outlier
#plot fluorescence vs total biovolume
(plot_fluor_bvol <-ggplot(all_data, aes(x=fluorescence, y=tot_bvol))+
geom_point() +
facet_wrap(~region, nrow=2)
)
#again RV outlier