-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSMSCG_Phytoplankton_DataVisualization.R
976 lines (814 loc) · 36.5 KB
/
SMSCG_Phytoplankton_DataVisualization.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
#SMSCG
#phytoplantkon data
#data visualizations
#responses
#total organisms per ml
#total cells per ml
#total biovolume per ml
#To do list
#see if all the phyla are up to date; Tiffany's file is probably out of date
#make some plots to look at cells per mL which is a response I added to the source file
#compare DFW and EMP samples when they are collected close to same time
#maybe do nmds by region
#load packages
library(tidyverse) #variety of data science tools
library(lubridate) #format datesx
library(scales) #log scale axes in ggplot
#library(diathor) #diatom trait data
# 1. Read in the Data----------------------------------------------
#read in the aggregated and formatted phytoplankton data
phyto_vis<-read_csv("EDI/data_output/SMSCG_phytoplankton_formatted_2020-2021.csv")
#looks like column types are all correct, even date and time
#format data---------------
#EMP stations: Subset to just the five relevant to SMSCG
#D22, D4, NZ032, NZS42, D7
#create data frame that groups names of stations that represent the same general location and adds the three broad regions
#RV = Sacramento River, ME = East Suisun Marsh, MW = West Suisun Marsh, GB = Grizzly Bay
#Note: for 2022, changed from GZB to MtMouth
station_key <- data.frame(
station = c("706", "D22","704","801", "802", "D4","609","MON","610","605", "606","NZ032","NZS42","GZB","602","D7"),
station_comb = c(rep("706", 2), "704",rep("801", 3),"609","MON","610", "605", rep("606",2),"NZS42","GZB",rep("602",2)),
region = c(rep("RV",6), rep("ME",3), rep("MW",4),rep("GB",3))
)
#glimpse(station_key)
#join the sample data and the new station/region data
#by using inner_join, only the SMSCG relevant stations are kept
phyto_gates<- inner_join(phyto_vis,station_key) %>%
#add a column for month by extracting month from date
mutate(month = month(date)
,year = year(date)
) %>%
#remove the June samples from EMP because we don't have any other June samples
filter(month!=6)
#check for NAs
check_na <- phyto_gates[rowSums(is.na(phyto_gates)) > 0,]
#the only NAs are for the time for one sample
#export 2020 data as csv for publishing on EDI
edi <- phyto_gates %>%
filter(year=="2020") %>%
rename(biovolume_per_ml = biovolume_per_ml_new) %>%
select(
station
,region
,date
,time
,kingdom
,phylum
,class
,phyto_form
,genus
,taxon
,organisms_per_ml
,biovolume_per_ml
) %>%
#arrange(date,time)
glimpse()
#write_csv(edi,"EDI/data_output/SMSCG_phytoplankton_formatted_2020_EDI_bvol_corr.csv")
#how many genera didn't match up with higher taxonomy?
#sum(is.na(phyto_gates$class))
#there are now zero rows that aren't matched with higher level taxonomy
#look at distinct genera and taxa
#taxon_na<-phyto_gates %>%
# filter(is.na(class)) %>%
# distinct(genus, taxon) %>%
# arrange(genus,taxon)
#5 genera and 6 taxa that weren't in higher taxonomy spreadsheet
#need to add them
#look at number of samples per station
s_samp_count<-phyto_gates %>%
distinct(station_comb, date) %>%
group_by(station_comb) %>%
summarize(count = n(), .groups = 'drop')
#count total number of samples
sum(s_samp_count$count) #138
#look at number of samples per region
r_samp_count<-phyto_gates %>%
distinct(region, date) %>%
group_by(region) %>%
summarize(count = n(), .groups = 'drop')
#sample size varies a lot among regions; river is most; bay is fewest
#look at number of samples per region and month
rm_samp_count<-phyto_gates %>%
distinct(region,station_comb, month,date) %>%
group_by(region, month) %>%
summarize(count = n(), .groups = 'drop')
range(rm_samp_count$count)
#generally 8-14 samples per month x region
#look at number of samples per region and month and year
ry_samp_count<-phyto_gates %>%
distinct(region,station_comb, year, month,date) %>%
group_by(region, year,month) %>%
summarize(count = n(), .groups = 'drop') %>%
arrange(year,region)
range(ry_samp_count$count)
#look closer at Grizzly Bay samlpes
gzb <- phyto_gates %>%
filter(region=="GB")
#try to get diatom trait data using the diathor package--------------
#NOTE: so far, not having much luck; at least some of the examples worked
#first make a df with just the diatoms
#and format it into a matrix like this package needs
#diatoms_only<-phyto_gates %>%
# filter(phylum == "Ochrophyta") %>%
#for diaThorAll function, there must be a column called "species"
# rename(species=taxon) %>%
#create combined station-date column
# unite(station_date, station, date) %>%
#reduce to just needed columns
# select(species
# ,station_date
# ,organisms_per_ml
# ) %>%
#convert from long to wide
# pivot_wider(names_from = station_date, values_from = organisms_per_ml) %>%
#convert all NAs to zeros
# replace(is.na(.), 0) %>%
#make species a factor and everything else integers
#NOTE that abundances are actually densities instead of counts but fine for now
# mutate(across(c(species),as.factor)
# ,across(c('STN 609_2020-08-12':'NZS42_2020-10-13'),as.integer)
# ) %>%
# glimpse()
#data("diat_sampleData")
#glimpse(diat_sampleData)
#species is a factor and all other columns are integers
#Run diaThorAll to get all the outputs from the sample data with the default settings, and store
#the results into the “results” object, to also retain the output within R
#NOTE: this function runs a whole pipeline of functions and takes a while
#results <- diaThorAll(diat_sampleData) #If the sample data was used
#I got their example to work
#loadedData <- diat_loadData(species_df=diat_sampleData,resultsPath=sharepoint_path) # load data with the diat_loadData() function
#results <- diat_ips(loadedData )
#this isn't working still
#loadedData <- diat_loadData(species_df=diatoms_only,resultsPath=sharepoint_path)
#results <- diat_ips(loadedData )
#plot time series of density and biovolume by station-----------
#summarize density and biovolume data by sample (station x date combo)
#so sum all the taxon specific densities and biovolumes within a sample
s_phyto_sum<-phyto_gates %>%
group_by(region, station_comb, station, year, month, date, time) %>%
summarize(
tot_den = sum(organisms_per_ml)
,tot_bvol_old = sum(biovolume_per_ml_old)
,tot_bvol_new = sum(biovolume_per_ml_new)
,.groups = 'drop'
) %>%
#make month and year factors
mutate_at(vars(year,month), factor)
#write_csv(s_phyto_sum,file = paste0(sharepoint_path,"./Plots/SMSCG_phytoplankton_sample_summary_2020.csv"))
#reorder region factor levels for plotting
s_phyto_sum$region <- factor(s_phyto_sum$region, levels=c('GB','MW','ME','RV'))
#plot total phytoplankton density by station
(plot_st_tot_den <-ggplot(s_phyto_sum, aes(x=date, y=tot_den, group=year))+ #specified what to plot on the x and y axes
geom_line() +
geom_point() +
facet_wrap(~station_comb, nrow=2)
)
#plot total phytoplankton biovolume OLD by station
(plot_st_tot_bvol <-ggplot(s_phyto_sum, aes(x=date, y=tot_bvol_old, group=year))+
geom_line() +
geom_point() +
facet_wrap(~station_comb, nrow=2)
)
#plot total phytoplankton biovolume NEW by station
(plot_st_tot_bvol <-ggplot(s_phyto_sum, aes(x=date, y=tot_bvol_new, group=year))+
geom_line() +
geom_point() +
facet_wrap(~station_comb, nrow=2)
)
#plot time series of density and biovolume by region and month-----------
#function to calculate standard error
se <- function(x) sd(x)/sqrt(length(x))
#so calculate means for densities and biovolumes
#across stations within regions and across dates within months
r_phyto_sum<-s_phyto_sum %>%
group_by(region, month) %>%
summarize(
tot_den_avg = mean(tot_den),
tot_den_se = se(tot_den),
tot_bvol_old_avg = mean(tot_bvol_old),
tot_bvol_old_se = se(tot_bvol_old),
tot_bvol_new_avg = mean(tot_bvol_new),
tot_bvol_new_se = se(tot_bvol_new)
,.groups = 'drop'
)
#glimpse(r_phyto_sum)
#reorder region factor levels for plotting
r_phyto_sum$region <- factor(r_phyto_sum$region, levels=c('GB','MW','ME','RV'))
#plot mean total phytoplankton density by region
(plot_rg_tot_den <-ggplot(r_phyto_sum, aes(x=month, y=tot_den_avg))+
geom_line() +
geom_point() +
geom_errorbar(aes(ymin=tot_den_avg-tot_den_se, ymax=tot_den_avg+tot_den_se), width = 0.2) +
facet_wrap(~region)
)
#plot mean total phytoplankton biovolume OLD by region
(plot_rg_tot_bvol_old <-ggplot(r_phyto_sum, aes(x=month, y=tot_bvol_old_avg))+
geom_line() +
geom_point() +
geom_errorbar(aes(ymin=tot_bvol_old_avg-tot_bvol_old_se, ymax=tot_bvol_old_avg+tot_bvol_old_se), width = 0.2) +
facet_wrap(~region)+
labs(x="Month", y="Total Biovolume (cubic microns per mL)")
)
#plot mean total phytoplankton biovolume NEW by region
(plot_rg_tot_bvol_new <-ggplot(r_phyto_sum, aes(x=month, y=tot_bvol_new_avg))+
geom_line() +
geom_point() +
geom_errorbar(aes(ymin=tot_bvol_new_avg-tot_bvol_new_se, ymax=tot_bvol_new_avg+tot_bvol_new_se), width = 0.2) +
facet_wrap(~region)+
labs(x="Month", y="Total Biovolume (cubic microns per mL)")
)
#ggsave(file = paste0(sharepoint_path,"./Plots/SMSCG_Phyto_LinePlot_TotalBiovolume_RegionMonth.png"),type ="cairo-png",width=8, height=5,units="in",dpi=300)
#boxplots of total density and biovolume by region-------
(plot_rg_tot_den_bx<-ggplot(data=s_phyto_sum, aes(x = region, y = tot_den)) +
geom_boxplot()+
geom_jitter() #adds all points to plot, not just outliers
)
#OLD
(plot_rg_tot_bvol_old_bx<-ggplot(data=s_phyto_sum, aes(x = region, y = tot_bvol_old)) +
geom_boxplot()+
#geom_jitter()+ #adds all points to plot, not just outliers; need to remove default outlier points to avoid duplication
labs(x = "Region", y = "Phytoplankton Biovolume (cubic microns per mL)")+
scale_x_discrete(labels = c('Grizzly Bay','West Suisun Marsh','East Suisun Marsh','Lower Sacramento'))
)
#NEW
(plot_rg_tot_bvol_new_bx<-ggplot(data=s_phyto_sum, aes(x = region, y = tot_bvol_new)) +
geom_boxplot()+
#geom_jitter()+ #adds all points to plot, not just outliers; need to remove default outlier points to avoid duplication
labs(x = "Region", y = "Phytoplankton Biovolume (cubic microns per mL)")+
scale_x_discrete(labels = c('Grizzly Bay','West Suisun Marsh','East Suisun Marsh','Lower Sacramento'))
)
#ggsave(file = paste0(sharepoint_path,"./Plots/SMSCG_Phyto_BoxPlot_TotalBiovolume_Region_Report.png"),type ="cairo-png",width=8, height=5,units="in",dpi=300)
#plot number of genera by station and region-------
#determine how many species typically within each genera
#if generally one, then easier to just use genus for plots/analysis
ssp_count<-phyto_gates %>%
distinct(genus, taxon) %>%
group_by(genus) %>%
summarize(count = n(), .groups = 'drop')
#use histogram to visualize distribution of species per genera
(plot_spp_per_genus_hist<-ggplot(data=ssp_count, aes(x = count)) +
geom_histogram()
)
#most genera contain 1-2 species but some contain up to 15
#look closer at genera with many species
genera_rich<-phyto_gates %>%
distinct(genus, taxon) %>%
filter(genus=="Nitzschia" | genus =="Navicula") %>%
arrange(genus)
#not just unique because of variants of same name (though there are variants of genus sp.)
#actually quite a few different species within these genera present
#create df that counts genera by sample
phyto_genera_sum<-phyto_gates %>%
distinct(region, station_comb,date, month,genus) %>%
group_by(region, station_comb,date, month) %>%
summarize(genera = n(), .groups = 'drop')
#use histogram to visualize distribution of genera per sample
(plot_genera_per_sample_hist<-ggplot(data=phyto_genera_sum, aes(x = genera)) +
geom_histogram()
)
#generally 1-7 but some up to 15 genera per sample
#plot time series of number of genera by station
(plot_st_genera <-ggplot(phyto_genera_sum, aes(x=date, y=genera))+
geom_line() +
geom_point() +
facet_wrap(~station_comb)
)
#number of genera per region
phyto_genera_sum_rg<-phyto_gates %>%
distinct(region,date, month,genus) %>%
group_by(region,date, month) %>%
summarize(genera = n(), .groups = 'drop')
#plot time series of number of genera by region
(plot_st_genera <-ggplot(phyto_genera_sum_rg, aes(x=date, y=genera))+
geom_line() +
geom_point() +
facet_wrap(~region)
)
#stacked bar plot showing density and biovolume by phyla and station-------
#summarize density and biovolume data by sample and phylum
#so sum densities and biovolumes within a sample by phylum
s_phyto_phylum_sum<-phyto_gates %>%
group_by(region, station_comb, station, date, year,month,phylum) %>%
summarize(
tot_den = sum(organisms_per_ml)
,tot_bvol_old = sum(biovolume_per_ml_old)
,tot_bvol_new = sum(biovolume_per_ml_new)
,.groups = 'drop'
)
#stacked bar plot time series of density by phylum and station
(plot_st_den_per_phylum <-ggplot(s_phyto_phylum_sum
, aes(x=date, y= tot_den, fill = phylum))+
geom_bar(position = "stack", stat = "identity") +
ylab("Phytoplankton Density (Organisms / mL)") + xlab("Date") +
#scale_colour_discrete(drop=T, limits = levels(phylum$phylum))+
#scale_fill_manual(name = "Phylum",
# values=phylum_colors,
# breaks=phylum_names,
# labels = phylum_names)+
facet_grid(~station_comb
#,labeller = labeller(island = island_label)
))
#cyanobacteria dominates the community numerically at all stations and times
#figure out why there is "NA" category for phylum
#stacked bar plot time series of biovolume OLD by phylum and station
(plot_st_bvol_old_per_phylum <-ggplot(s_phyto_phylum_sum, aes(x=date, y= tot_bvol_old, fill = phylum))+
geom_bar(position = "stack", stat = "identity") +
ylab("Phytoplankton Biovolume") + xlab("Date") +
#scale_colour_discrete(drop=T, limits = levels(phylum$phylum))+
#scale_fill_manual(name = "Phylum",
# values=phylum_colors,
# breaks=phylum_names,
# labels = phylum_names)+
facet_grid(~station_comb
#,labeller = labeller(island = island_label)
))
#mostly cyanobactera as well as diatoms and allies
#stacked bar plot time series of biovolume NEW by phylum and station
(plot_st_bvol_new_per_phylum <-ggplot(s_phyto_phylum_sum, aes(x=date, y= tot_bvol_new, fill = phylum))+
geom_bar(position = "stack", stat = "identity") +
ylab("Phytoplankton Biovolume") + xlab("Date") +
#scale_colour_discrete(drop=T, limits = levels(phylum$phylum))+
#scale_fill_manual(name = "Phylum",
# values=phylum_colors,
# breaks=phylum_names,
# labels = phylum_names)+
facet_grid(~station_comb
#,labeller = labeller(island = island_label)
))
#stacked bar plot showing phyla by region and month-------
#summarize density and biovolume data by region, month and phylum
#so calculate mean densities and biovolumes within a region and date by phylum
r_phyto_phylum_sum<-s_phyto_phylum_sum %>%
group_by(region, month, phylum) %>%
summarize(
tot_den_avg = mean(tot_den),
tot_den_se = se(tot_den),
tot_bvol_old_avg = mean(tot_bvol_old),
tot_bvol_old_se = se(tot_bvol_old),
tot_bvol_new_avg = mean(tot_bvol_new),
tot_bvol_new_se = se(tot_bvol_new),
.groups = 'drop'
)
#NOTE: calculating the means probably isn't this simple
#probably need to create data frame with all combos of samples x taxa
#then include zeros for cases in which no individuals of a taxon where found
#to make sure those are part of the mean calculation
#the main thing is to get the number of samples in calculation correct (ie, denominator)
#stacked bar plot time series of density by phylum and station
(plot_rg_den_per_phylum <-ggplot(r_phyto_phylum_sum
, aes(x=month, y= tot_den_avg, fill = phylum))+
geom_bar(position = "stack", stat = "identity") +
ylab("Phytoplankton Density (Organisms / mL)") + xlab("Date") +
#scale_colour_discrete(drop=T, limits = levels(phylum$phylum))+
#scale_fill_manual(name = "Phylum",
# values=phylum_colors,
# breaks=phylum_names,
# labels = phylum_names)+
facet_grid(~region
#,labeller = labeller(island = island_label)
))
#cyanobacteria dominates the community numerically at all stations and times
#figure out why there is "NA" category for phylum
#stacked bar plot time series of biovolume OLD by phylum and station
(plot_rg_bvol_old_per_phylum <-ggplot(r_phyto_phylum_sum
, aes(x=month, y= tot_bvol_old_avg, fill = phylum))+
geom_bar(position = "stack", stat = "identity") +
ylab("Phytoplankton Biovolume") + xlab("Month") +
#scale_colour_discrete(drop=T, limits = levels(phylum$phylum))+
#scale_fill_manual(name = "Phylum",
# values=phylum_colors,
# breaks=phylum_names,
# labels = phylum_names)+
facet_grid(~region
#,labeller = labeller(island = island_label)
))
#mostly cyanobactera as well as diatoms and allies
#stacked bar plot time series of biovolume NEW by phylum and station
(plot_rg_bvol_new_per_phylum <-ggplot(r_phyto_phylum_sum
, aes(x=month, y= tot_bvol_new_avg, fill = phylum))+
geom_bar(position = "stack", stat = "identity") +
ylab("Phytoplankton Biovolume") + xlab("Month") +
#scale_colour_discrete(drop=T, limits = levels(phylum$phylum))+
#scale_fill_manual(name = "Phylum",
# values=phylum_colors,
# breaks=phylum_names,
# labels = phylum_names)+
facet_grid(~region
#,labeller = labeller(island = island_label)
))
#ggsave(file = paste0(sharepoint_path,"./Plots/SMSCG_Phyto_StackedBar_TotalBiovolume_AllPhyto_Region.png"),type ="cairo-png",width=8, height=5,units="in",dpi=300)
#stacked bar plot showing phyla by region-------
#summarize density and biovolume data by region only
r_phyto_phylum_sum_rg<-phyto_gates %>%
#sum densities and biovolumes within a sample by phylum
group_by(region, station_comb, date, month,phylum) %>%
summarize(
tot_den = sum(organisms_per_ml)
,tot_bvol_old = sum(biovolume_per_ml_old)
,tot_bvol_new = sum(biovolume_per_ml_new)
, .groups = 'drop') %>%
#calculate mean densities and biovolumes within regions by phylum
group_by(region, phylum) %>%
summarize(
tot_den_avg = mean(tot_den),
tot_bvol_old_avg = mean(tot_bvol_old),
tot_bvol_new_avg = mean(tot_bvol_new)
, .groups = 'drop'
)
#reorder region factor levels for plotting
r_phyto_phylum_sum_rg$region <- factor(r_phyto_phylum_sum_rg$region, levels=c('GB','MW','ME','RV'))
#stacked bar plot time series of diatom biovolume OLD by region, phylum
ggplot(r_phyto_phylum_sum_rg, aes(x=region, y= tot_bvol_old_avg, fill = phylum))+
geom_bar(position = "stack", stat = "identity") +
ylab("Biovolume (cubic microns per mL)") + xlab("Region")
#scale_colour_discrete(drop=T, limits = levels(phylum$phylum))+
#scale_fill_manual(name = "Phylum",
# values=phylum_colors,
# breaks=phylum_names,
# labels = phylum_names)
#stacked bar plot time series of diatom biovolume NEW by region, phylum
ggplot(r_phyto_phylum_sum_rg, aes(x=region, y= tot_bvol_new_avg, fill = phylum))+
geom_bar(position = "stack", stat = "identity") +
ylab("Biovolume (cubic microns per mL)") + xlab("Region")
#scale_colour_discrete(drop=T, limits = levels(phylum$phylum))+
#scale_fill_manual(name = "Phylum",
# values=phylum_colors,
# breaks=phylum_names,
# labels = phylum_names)
#ggsave(file = paste0(sharepoint_path,"./Plots/SMSCG_Phyto_StackedBar_AllPhyto_Region.png"),type ="cairo-png",width=6, height=5,units="in",dpi=300)
#stacked barplots for diatoms-------------
#summarize diatom density and biovolume data by sample (station x date combo)
#so sum all the taxon specific densities and biovolumes within a sample
s_diatom_sum<-phyto_gates %>%
#subset data to just the phylum with diatoms
filter(phylum == "Ochrophyta") %>%
group_by(region, station_comb, station,year, month, date, time) %>%
summarize(
d_tot_den = sum(organisms_per_ml)
,d_tot_bvol_old = sum(biovolume_per_ml_old)
,d_tot_bvol_new = sum(biovolume_per_ml_new)
, .groups = 'drop'
) %>%
#make month a factor
mutate_at(vars(month,year), factor)
#there are 7 samples without diatoms
#combine diatom data with total phyto data
#join diatom data set and all phyto data set
glimpse(s_phyto_sum)
glimpse(s_diatom_sum)
s_pd_sum <- left_join(s_phyto_sum,s_diatom_sum)
#replace NAs with zeros for density and biovolume
s_pd_sum_z <- s_pd_sum %>%
replace_na(list(d_tot_den = 0, d_tot_bvol_old = 0, d_tot_bvol_new = 0))
#summarize diatom density and biovolume data by class, region, and month
diatom_sum<-phyto_gates %>%
#subset data to just the phylum with diatoms
filter(phylum == "Ochrophyta") %>%
#sum densities and biovolumes within a sample by class
group_by(region, station_comb, date, month,class) %>%
summarize(
tot_den = sum(organisms_per_ml)
,tot_bvol_old = sum(biovolume_per_ml_old)
,tot_bvol_new = sum(biovolume_per_ml_new)
, .groups = 'drop') %>%
#calculate mean densities and biovolumes within regions and months by class
group_by(region, month, class) %>%
summarize(
tot_den_avg = mean(tot_den),
tot_den_se = se(tot_den),
tot_bvol_old_avg = mean(tot_bvol_old),
tot_bvol_old_se = se(tot_bvol_old),
tot_bvol_new_avg = mean(tot_bvol_new),
tot_bvol_new_se = se(tot_bvol_new)
, .groups = 'drop'
)
#reorder region factor levels for plotting
diatom_sum$region <- factor(diatom_sum$region, levels=c('GB','MW','ME','RV'))
#stacked bar plot time series of diatom biovolume OLD by region, month, class
ggplot(diatom_sum, aes(x=month, y= tot_bvol_old_avg, fill = class))+
geom_bar(position = "stack", stat = "identity") +
ylab("Phytoplankton Biovolume") + xlab("Date") +
#scale_colour_discrete(drop=T, limits = levels(phylum$phylum))+
#scale_fill_manual(name = "Phylum",
# values=phylum_colors,
# breaks=phylum_names,
# labels = phylum_names)+
facet_grid(~region
#,labeller = labeller(island = island_label)
)
#stacked bar plot time series of diatom biovolume NEW by region, month, class
ggplot(diatom_sum, aes(x=month, y= tot_bvol_new_avg, fill = class))+
geom_bar(position = "stack", stat = "identity") +
ylab("Phytoplankton Biovolume") + xlab("Date") +
#scale_colour_discrete(drop=T, limits = levels(phylum$phylum))+
#scale_fill_manual(name = "Phylum",
# values=phylum_colors,
# breaks=phylum_names,
# labels = phylum_names)+
facet_grid(~region
#,labeller = labeller(island = island_label)
)
#ggsave(file = paste0(sharepoint_path,"./Plots/SMSCG_Phyto_StackedBar_Diatom_RegionMonth.png"),type ="cairo-png",width=9, height=5,units="in",dpi=300)
#similar stacked bar plot but averaged over months too
#facilitate comparisons among regions
#summarize density and biovolume data by region only
diatom_sum_rg<-phyto_gates %>%
#subset data to just the phylum with diatoms
filter(phylum == "Ochrophyta") %>%
#sum densities and biovolumes within a sample by class
group_by(region, station_comb, date, month,class) %>%
summarize(
tot_den = sum(organisms_per_ml)
,tot_bvol_old = sum(biovolume_per_ml_old)
,tot_bvol_new = sum(biovolume_per_ml_new)
, .groups = 'drop') %>%
#calculate mean densities and biovolumes within regions by class
group_by(region, class) %>%
summarize(
tot_den_avg = mean(tot_den),
tot_den_se = se(tot_den),
tot_bvol_old_avg = mean(tot_bvol_old),
tot_bvol_old_se = se(tot_bvol_old),
tot_bvol_new_avg = mean(tot_bvol_new),
tot_bvol_new_se = se(tot_bvol_new)
, .groups = 'drop'
)
#reorder region factor levels for plotting
diatom_sum_rg$region <- factor(diatom_sum_rg$region, levels=c('GB','MW','ME','RV'))
#stacked bar plot time series of diatom biovolume OLD by region, class
ggplot(diatom_sum_rg, aes(x=region, y= tot_bvol_old_avg, fill = class))+
geom_bar(position = "stack", stat = "identity") +
ylab("Biovolume (cubic microns per mL)") + xlab("Region")
#scale_colour_discrete(drop=T, limits = levels(phylum$phylum))+
#scale_fill_manual(name = "Phylum",
# values=phylum_colors,
# breaks=phylum_names,
# labels = phylum_names)+
#stacked bar plot time series of diatom biovolume NEW by region, class
ggplot(diatom_sum_rg, aes(x=region, y= tot_bvol_new_avg, fill = class))+
geom_bar(position = "stack", stat = "identity") +
ylab("Biovolume (cubic microns per mL)") + xlab("Region")
#scale_colour_discrete(drop=T, limits = levels(phylum$phylum))+
#scale_fill_manual(name = "Phylum",
# values=phylum_colors,
# breaks=phylum_names,
# labels = phylum_names)+
#ggsave(file = paste0(sharepoint_path,"./Plots/SMSCG_Phyto_StackedBar_Diatom_Region.png"),type ="cairo-png",width=6, height=5,units="in",dpi=300)
#boxplot of total phyto biovolume and diatom biovolume by region and month---------
#first reshape data frame
#use version with zeros for samples without diatoms (instead of NAs)
s_pd_sum_l<- s_pd_sum_z %>%
#reduce data frame to just needed columns
select("region","station_comb","year","month","date","tot_bvol_old","tot_bvol_new","d_tot_bvol_old","d_tot_bvol_new") %>%
#convert wide to long
pivot_longer(c("tot_bvol_old","tot_bvol_new","d_tot_bvol_old","d_tot_bvol_new"), names_to = "type", values_to = "tot_bvol") %>%
#converts some columns from character to factor
mutate_at(vars(region,type), factor) %>%
#reorder factor levels for plotting
mutate(region = factor(region, levels=c('RV','ME','MW','GB'))
,type = factor(type, levels=c("tot_bvol_old","tot_bvol_new","d_tot_bvol_old","d_tot_bvol_new"))
#create new columns with better names for labeling facets
,type2 = recode(
type, "tot_bvol_old" = "All Phytoplankton OLD","tot_bvol_new" = "All Phytoplankton NEW","d_tot_bvol_old" = "Diatoms OLD","d_tot_bvol_new" = "Diatoms NEW")
,region2 = recode(
region, "RV" = "Lower Sac", "ME" = "E Suisun Marsh", "MW" = "W Suisun Marsh", "GB"="Grizzly Bay")
,month2 = recode(
month, "7"="July" ,"8"="August", "9"="September", "10"="October")
) %>%
#convert from cubic microns per mL to cubic mm per mL
mutate(tot_bvol2 = tot_bvol/1000000000)
glimpse(s_pd_sum_l)
#plot: col: phyto and diatoms, row: region, x-axis: month
(plot_rm_pd_bvol_bx<-ggplot(data=s_pd_sum_l
, aes(x = month
, y = tot_bvol2
#, y = log(tot_bvol2)
)) +
geom_boxplot(fill="darkolivegreen4")+
#ylim(0,0.05)+ #drops a high end outlier for all phyto, july, sac river
facet_grid(region2~type2)+
labs(x = "Month"
, y = bquote("Biovolume"~(mm^3~mL^-1) )
#, y = bquote("Biovolume"~LN(mm^3~mL^-1) )
)+
scale_x_discrete(labels = c("July","August", "September", "October"))
)
#log transforming data means we lose 7 data points for the diatoms plot because they are zeros
#could switch month and region to better emphasize region comparisons but either month nor region differ
#create separate panels for 2020 and 2021
#this is necessary in part because Grizzly Bay samples are new in 2021
#also just use the new and correct calculations
#2020
s_pd_sum_l_n20 <- s_pd_sum_l %>%
#just keep the new calculations
filter(grepl("new",type) & date < "2021-01-01") %>%
#create new columns with better names for labeling facets
mutate(type2 = recode(
type, "tot_bvol_new" = "2020 All Phytoplankton","d_tot_bvol_new" = "2020 Diatoms"
) )
unique(s_pd_sum_l_n20$type) #tot_bvol_new d_tot_bvol_new
range(s_pd_sum_l_n20$date) #"2020-07-14" "2020-10-22"
#2021
s_pd_sum_l_n21 <- s_pd_sum_l %>%
#just keep the new calculations
filter(grepl("new",type) & date > "2021-01-01") %>%
mutate(type2 = recode(
type, "tot_bvol_new" = "2021 All Phytoplankton","d_tot_bvol_new" = "2021 Diatoms"
))
unique(s_pd_sum_l_n21$type) #tot_bvol_new d_tot_bvol_new
range(s_pd_sum_l_n21$date) #"2020-07-14" "2020-10-22"
#2020 plot: col: phyto and diatoms, row: region, x-axis: month
group.colors20 <- c('2020 All Phytoplankton' = "darkolivegreen", "2020 Diatoms"= "darkolivegreen2")
ggplot(data=s_pd_sum_l_n20, aes(x = region2, y = tot_bvol2, fill=type2
#, y = log(tot_bvol2)
)) +
geom_boxplot(
#fill="darkolivegreen4"
)+
#ylim(0,0.05)+ #drops a high end outlier for all phyto, july, sac river
facet_grid(month2~type2)+
labs(x = "Region"
, y = bquote("Biovolume"~(mm^3~mL^-1) )
#, y = bquote("Biovolume"~LN(mm^3~mL^-1) )
)+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
annotation_logticks(sides="l")+
scale_fill_manual(values=group.colors20)+
theme(legend.position="none"
, axis.text.x = element_text(angle = 90, hjust = 1)
)
#ggsave(file = "Data/phytoplankton/SMSCG_Phyto_Boxplot_Phyto&Diatom_Region&Month_2020.png",type ="cairo-png",scale=0.9,width=8.5, height=6,units="in",dpi=300)
#2021 plot: col: phyto and diatoms, row: region, x-axis: month
group.colors21 <- c('2021 All Phytoplankton' = "darkolivegreen", "2021 Diatoms"= "darkolivegreen2")
ggplot(data=s_pd_sum_l_n21, aes(x = region2, y = tot_bvol2, fill=type2
#, y = log(tot_bvol2)
)) +
geom_boxplot()+
#ylim(0,0.05)+ #drops a high end outlier for all phyto, july, sac river
facet_grid(month2~type2)+
labs(x = "Region"
, y = bquote("Biovolume"~(mm^3~mL^-1) )
#, y = bquote("Biovolume"~LN(mm^3~mL^-1) )
)+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
annotation_logticks(sides="l")+
scale_fill_manual(values=group.colors21)+
theme(legend.position="none"
, axis.text.x = element_text(angle = 90, hjust = 1)
)
#ggsave(file = "Data/phytoplankton/SMSCG_Phyto_Boxplot_Phyto&Diatom_Region&Month_2021.png",type ="cairo-png",scale=0.9,width=8.5, height=8,units="in",dpi=300)
#generate effect sizes for total phyto and diatoms across regions and months
#needed for USBR report
effsz<-s_pd_sum_l %>%
filter(region!="GB" & grepl("new",type)) %>%
group_by(type,year) %>%
summarize(
bvol_mean = mean(tot_bvol2)
,bvol_sd = sd(tot_bvol2)
, .groups = 'drop')
#total phyto: compare 2020 vs 2021
effsz$bvol_mean[1]/effsz$bvol_mean[2] #2.122569
#diatoms: compare 2020 vs 2021
effsz$bvol_mean[3]/effsz$bvol_mean[4] #1.654778
#proportion of diatom biovolume in samples
#2020
effsz$bvol_mean[3]/effsz$bvol_mean[1] #0.5478529
#2021
effsz$bvol_mean[4]/effsz$bvol_mean[2] #0.7027262
#compare regions
effszr<-s_pd_sum_l %>%
filter(region!="GB" & grepl("new",type)) %>%
group_by(type,region) %>%
summarize(
bvol_mean = mean(tot_bvol2)
,bvol_sd = sd(tot_bvol2)
, .groups = 'drop')
#diatoms: compare RV vs. MW
effszr$bvol_mean[6]/effszr$bvol_mean[4] #2.240234
#Statistics: total phyto biovolume--------
#glimpse(s_phyto_sum)
#create subset without Grizzly Bay samples
#samples sizes are too small for this region in 2021 and it is missing from 2020
s_phyto_sum_g <- s_phyto_sum %>%
filter(region!="GB")
#predictors to include: region, month, year
#could include station as a random effect. there are only 9 though
#build model
tbmod_new = glm(tot_bvol_new ~ region * month* year, data = s_phyto_sum_g)
#model checking plots
#plot(tbmod_new)
#plots aren't great
#residuals vs fitted: spread gets larger with higher values
#Q-Q plot: points stray pretty far off diagonal at upper end
#scale-location: line drifts up with higher values
#residuals vs leverage: looks OK
#redo analysis with log transformed response
tblmod_new_lg = glm(log(tot_bvol_new) ~ region * month* year, data = s_phyto_sum_g)
#model checking plots
#plot(tblmod_new_lg)
#looks like log transformation did the trick
#could also try a different error distribution like gamma
#look at model results
summary(tblmod_new_lg)
drop1(tblmod_new_lg, test="Chi")
#three-way interaction term isn't significant; p=0.2615
tblmod_new_lg2 = glm(log(tot_bvol_new) ~ region + month + year + region:month + region:year, data = s_phyto_sum_g)
#look at model results
summary(tblmod_new_lg2)
drop1(tblmod_new_lg2, test="Chi")
#neither two-way interaction is significant
#redo model with log transformed response and without interaction term
tblmod_new_lg3 = glm(log(tot_bvol_new) ~ region + month + year, data = s_phyto_sum_g)
#plot(tblmod_new_lg3)
#look at model results
summary(tblmod_new_lg3)
drop1(tblmod_new_lg3, test="Chi")
#neither region (p=0.6708) nor month (p=0.5262) are significant but year is (p=1.387e-05)
#Statistics: total diatom biovolume--------
#predictors to include: region, month, maybe salinity
#could include station as a random effect. there are only 9 though
#Note: there are 7 zeros that are currently excluded from the data set used in these models
#s_pd_sum: these 27 samples are NAs
#s_pd_sum_z: these 27 samples are zeros
zeros <-filter(s_pd_sum_z, d_tot_bvol_new == 0)
#RV = 13, ME = 6, MW = 8
#create subset without Grizzly Bay samples
#samples sizes are too small for this region in 2021 and it is missing from 2020
s_diatom_sum_g <- s_diatom_sum %>%
filter(region!="GB") %>%
mutate(year=factor(year)
,region=factor(region)) %>%
glimpse()
#predictors to include: region, month, year
#could include station as a random effect. there are only 9 though
#build model
tbmod_new = glm(d_tot_bvol_new ~ region * month* year, data = s_diatom_sum_g)
#model checking plots
#plot(tbmod_new)
#plots aren't great
#residuals vs fitted: spread gets larger with higher values
#Q-Q plot: points stray pretty far off diagonal at upper end
#scale-location: line drifts up with higher values
#residuals vs leverage: looks OK
#redo analysis with log transformed response
tblmod_new_lg = glm(log(d_tot_bvol_new) ~ region * month* year, data = s_diatom_sum_g)
#model checking plots
#plot(tblmod_new_lg)
#looks like log transformation did the trick
#could also try a different error distribution like gamma
#look at model results
summary(tblmod_new_lg)
drop1(tblmod_new_lg, test="Chi")
#three-way interaction term isn't significant; p=0.5824
tblmod_new_lg2 = glm(log(d_tot_bvol_new) ~ region + month + year + region:month + region:year, data = s_diatom_sum_g)
#look at model results
summary(tblmod_new_lg2)
drop1(tblmod_new_lg2, test="Chi")
#neither two-way interaction is significant, though region:month is somewhat close (p=0.09659)
#redo model with log transformed response and without interaction term
tblmod_new_lg3 = glm(log(d_tot_bvol_new) ~ region + month + year, data = s_diatom_sum_g)
#plot(tblmod_new_lg3)
#look at model results
summary(tblmod_new_lg3)
drop1(tblmod_new_lg3, test="Chi")
#region (p=0.03468)
#month (p=0.65924)
#year is (p=3.835e-05)
#pairwise comparisons among the three regions
tblmod_new_lg4 = aov(log(d_tot_bvol_new) ~ region + month + year, data = s_diatom_sum_g)
TukeyHSD(tblmod_new_lg4,"region")
names(s_diatom_sum_g)
#look at Grizzly Bay stations----------
#s_samp_count shows sample sizes
#602 = 13, GZB = 2
#look closer at Grizzly bay samples
gzb_sum <- gzb %>%
distinct(station_comb, station, date) %>%
arrange(date,station_comb,station)
#create a subset of 2021 GB samples with density and biovolume summed by sample
gb_sum <- s_phyto_sum %>%
filter(region=="GB" & year=="2021")
#plot time series of biovolume and density by station
ggplot(gb_sum, aes(x=date, y=tot_den, group=station,col=station))+ #specified what to plot on the x and y axes
geom_line() +
geom_point()
#plot total phytoplankton biovolume NEW by station
ggplot(gb_sum, aes(x=date, y=tot_bvol_new, group=station,col=station))+
geom_line() +
geom_point()
#next look closer at taxonomic composition
gb_phylum_sum <- s_phyto_phylum_sum %>%
filter(region=="GB" & year=="2021")
#stacked bar plot time series of density by phylum and station
ggplot(gb_phylum_sum, aes(x=station, y= tot_den,fill = phylum))+
geom_bar(position = "stack", stat = "identity") +
ylab("Phytoplankton Density (Organisms / mL)") + xlab("Station")+
facet_grid(~date)
#stacked bar plot time series of biovolume NEW by phylum and station
ggplot(gb_phylum_sum, aes(x=station, y= tot_bvol_new,fill = phylum))+
geom_bar(position = "stack", stat = "identity") +
ylab("Phytoplankton Biovolume (cubic microns per mL)") + xlab("Station")+
facet_grid(~date)