Alessandro Filazzola, Octavia Mahdiyan, Kevin Blagrave, Carolyn Ewins, Arnab Shuvo, Tanzil Sadid, Luke Moslenko, Mohammad Arshad, Derek Gray, Roberto Quinlan, Catherine O’Reilly & Sapna Sharma
Part of the York University Aquatic Research Group
Measures of chlorophyll represent the primary productivity in freshwater lakes that is often used by managers as proxy for water quality. However, the abundance of chlorophyll in lakes is dependent on many interacting factors such as spatial heterogeneity (within and among lakes), climate, and anthropogenic disturbance. Aquatic research examining water chemistry frequently include measures of chlorophyll values and this data is readily available in published manuscripts or online repositories. Thus, there is an opportunity to synthesize a global database on the water chemistry of lakes with focus on chlorophyll. The purpose of this project is to generate a database of freshwater lakes across the globe and through time. We intend to conduct a systematic review examining over 3000 published manuscripts that examined lake chlorophyll and supplement this data with online repositories such as KnB, Dryad, and Pangaea. Using the geospatial coordinates of the lakes we can relate measures of chlorophyll to climate patterns, anthropogenic disturbances, and other changes over time. This database will be used by researchers to improve our understanding of how chlorophyll responds to global change and assist aquatic managers responsible for maintaining water quality in lakes.
A systematic literature search was conducted using Web of Science for all articles between 2000 and 2018. This time frame was chosen because it captures the majority of the literature that measured chlorophyll values. Papers were excluded that were not primary articles and were not relevant (e.g. virology, sociology, physics, etc)
Data was be extracted from papers or supplemental materials. Authors will be contacted for their available data if not available online. Additional datasets will be obtained used online repostiories (e.g. Dryad, KnB, Github) and more general searches online.
chlorophyll
AND lake*
#libraries
library(tidyverse)
library(ggmap)
library(dataone)
## get standard error function
se <- function(x) sqrt(var(x)/length(x))
options(scipen=6) ## lower threshold for scientific notation
## Select KNB repo for download and database identifiers
cn <- CNode("PROD")
mn <- getMNode(cn, "urn:node:KNB")
id <- "doi:10.5063/F1RV0M1S"
uuid <- "urn:uuid:317170a4-fbe5-4b20-8d32-bdf896b2b75e"
## Get Meta data
library(XML)
metadata <- rawToChar(getObject(mn, id))
doc = xmlRoot(xmlTreeParse(metadata, asText=TRUE, trim = TRUE, ignoreBlanks = TRUE))
tf <- tempfile()
saveXML(doc, tf)
## [1] "C:\\Users\\alexf\\AppData\\Local\\Temp\\RtmpY9anko\\file15f07aed77c0"
file.show(tf)
## Get Raw data
databasePath <- getPackage(mn, id=uuid, dirPath=".", unzip=T)
## List files
list.files(paste0(databasePath,"/data/"))
## [1] "A_global_database_of_chlorophyll_and_water.xml"
## [2] "ChlaData.csv"
## [3] "doi_10.5063_F1RV0M1S-METADATA.pdf"
## [4] "methodsData.csv"
## [5] "MS_citations.csv"
## [6] "Repo_citations.csv"
## [7] "resourceMap_urn_uuid_317170a4-fbe5-4b20-8d32-bdf896b2b75e.rdf.xml"
data <- read.csv("data//ChlaData.csv")
## plot out global distribution of lake chl
mp <- NULL
mapWorld <- borders("world", colour="gray50", fill="gray50") # create a layer of borders
mp <- ggplot() + mapWorld
mp <- mp+ geom_point(data=data , aes(x=Lon, y=Lat), size=1, alpha=0.3, color="#176117") + xlab("Longitude") + ylab("Latitude")
mp
## get standard error function
se <- function(x) sqrt(var(x)/length(x))
## Plot distribution of Chl values
ggplot(data, aes(x=log(ChlValues))) + geom_density(fill="#9ACC7C") + xlab("Log transformed Chlorophyll a (mg/L)") + ylab("Frequency of observered values") +
annotate("text", x=-1, y=0.3, label=paste("minimum = 0 mg/L"), size=5) + ## min
annotate("text", x=-1, y=0.27, label=paste("maximum = ",max(data$ChlValues), "mg/L"), size=5) + ## max
annotate("text", x=-1, y=0.24, label=paste("median ± SE = ",round(median(data$ChlValues),3), "±", formatC(round(se(data$ChlValues),4), digits=4,format="f") , "mg/L"), size=5) +theme(axis.text=element_text(size=12),
axis.title=element_text(size=14,face="bold"))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + xlim(-10,3)
# + geom_vline(xintercept = log(2.5/1000), lty=2) + geom_vline(xintercept = log(0.001), lty=2)
method <- read.csv("data//methodsDataV2.csv", stringsAsFactors = F)
## merge methods file, chlorophyll, and survey information into one file
methodID <- merge(data, method, by="StudyID", all.x=T)
## method for calculating Chl
chlsurvey <- methodID %>% group_by(Survey.Type) %>% summarize(chl=mean(ChlValues), error=se(ChlValues)*1.96, n=length(ChlValues)) %>% filter(!Survey.Type %in% c("not described","undescribed","")) %>% filter(!is.na(Survey.Type))
plot1 <- ggplot(chlsurvey, aes(x=Survey.Type, y=chl))+ geom_bar(stat="identity", fill="#9ACC7C", color="black") + geom_errorbar(aes(x=Survey.Type, ymin=chl-error, ymax=chl+error), lwd=1, width=0.2) + xlab("") + ylab("Chlorophyll a (mg / L)")+theme(axis.text=element_text(size=12),
axis.title=element_text(size=14,face="bold"))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
scale_y_continuous(expand = c(0, 0), limits = c(0,0.08)) + annotate(geom ="text", x=0.5, y=0.075, label="a", size=10)
## water survey location
chllocation <- methodID %>% group_by(Depth.qual) %>% summarize(chl=mean(ChlValues), error=se(ChlValues)*1.96, n=length(ChlValues)) %>% filter(!Depth.qual %in% c("not described","undescribed")) %>% filter(!is.na(Depth.qual))
plot2 <- ggplot(chllocation, aes(x=Depth.qual, y=chl))+ geom_bar(stat="identity", fill="#9ACC7C", color="black") + geom_errorbar(aes(x=Depth.qual, ymin=chl-error, ymax=chl+error), lwd=1, width=0.5) + xlab("") + ylab("")+theme(axis.text=element_text(size=12),
axis.title=element_text(size=14,face="bold"))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
scale_y_continuous(expand = c(0, 0), limits = c(0,0.04))+ annotate(geom ="text", x=0.5, y=0.075, label="b", size=10)
library(gridExtra)
grid.arrange(plot1, plot2, ncol=2)
## compare methods
method %>% group_by(Depth.qual) %>% summarize(n=length(Depth.quant))
## # A tibble: 4 x 2
## Depth.qual n
## <chr> <int>
## 1 integrated 68
## 2 specific depth 22
## 3 surface 118
## 4 undescribed 77
chl <- data %>% mutate(Chl.ug = ChlValues*1000) %>% mutate(trophic=cut(Chl.ug, breaks=c(-Inf, 7, 9, 26, 75, Inf), labels=c("Oligotrophic","Oligo-mesotrophic","Mesotrophic","Eutrophic","Hyper-eutrophic")))
## breaks taken from NWT Government Worksheet https://www.enr.gov.nt.ca/sites/enr/files/chlorophyll.pdf
chl.n <- chl %>% group_by(trophic) %>% summarize(n=length(Chl.ug))
## plot out lake frequency
ggplot(chl.n, aes(x=trophic, y=n)) + geom_bar(stat="identity")
## Unique Lake IDs
data[,"UniqueLakeLocation"] <- paste0(data$Lat, data$Lon, sep=";")
data2 <-data
data2 <- data2[!duplicated(data2$UniqueLakeLocation),]
## number of unique locations
nrow(data2)
## [1] 15302
## Number of Countries surveyed
library(maps)
data[,"CountryOfficial"] <- map.where(database="world", data$Lon, data$Lat)
## separate subcountry identifiers
country <- gsub("?:.*", "", data$CountryOfficial)
length(unique(country))
## [1] 72
## water chem values
data %>% filter(!is.na(TN)) %>% summarize(low=min(TN), high=max(TN), avg=mean(TN), n=length(TN), prop=n/nrow(data)) ## amount of TN values
## low high avg n prop
## 1 0 20.574 0.9079891 39457 0.1729296
data %>% filter(!is.na(TP)) %>% summarize(low=min(TP), high=max(TP), avg=mean(TP), n=length(TP), prop=n/nrow(data))## amount of TP values
## low high avg n prop
## 1 0 3.6 0.04215493 111872 0.4903054
data %>% filter(!is.na(DO)) %>% summarize(low=min(DO), high=max(DO), avg=mean(DO), n=length(DO), prop=n/nrow(data))## amount of DO values
## low high avg n prop
## 1 1.3286 67.7 9.835084 761 0.003335262
data %>% filter(!is.na(DOC)) %>% summarize(low=min(DOC), high=max(DOC), avg=mean(DOC), n=length(DOC), prop=n/nrow(data))## amount of DOC values
## low high avg n prop
## 1 0.0001 1 0.007775983 10517 0.04609323
data %>% filter(!is.na(pH)) %>% summarize(low=min(pH), high=max(pH), avg=mean(pH), n=length(pH), prop=n/nrow(data)) ## amount of pH
## low high avg n prop
## 1 5.5 10.5 7.989574 12936 0.05669507
data %>% filter(!is.na(ChlValues)) %>% summarize(low=min(ChlValues), high=max(ChlValues), avg=mean(ChlValues), n=length(ChlValues), prop=n/nrow(data))
## low high avg n prop
## 1 0 4.33 0.01749021 228168 1
## lake char values
data %>% filter(!is.na(Depth.max)) %>% summarize(low=min(Depth.max), high=max(Depth.max), avg=mean(Depth.max), n=length(Depth.max), prop = n/nrow(data)) ## max lake depth
## low high avg n prop
## 1 0 310 15.64502 188205 0.8248527
data %>% filter(!is.na(Depth.mean)) %>% summarize(low=min(Depth.mean), high=max(Depth.mean), avg=mean(Depth.mean), n=length(Depth.mean), prop = n/nrow(data))## mean lake depth
## low high avg n prop
## 1 0.2 154 6.999963 72786 0.3190018
data %>% filter(!is.na(Secchi)) %>% summarize(low=min(Secchi), high=max(Secchi), avg=mean(Secchi), n=length(Secchi), prop = n/nrow(data)) ## Secchi depth
## low high avg n prop
## 1 0 61.7 2.758682 195782 0.8580607
data %>% filter(!is.na(SurfaceArea)) %>% summarize(low=min(SurfaceArea), high=max(SurfaceArea), avg=mean(SurfaceArea), n=length(SurfaceArea), prop=n/nrow(data)) ## surface area
## low high avg n prop
## 1 0 32056.74 25.11228 211975 0.9290304
data %>% filter(!is.na(LakeVolume)) %>% summarize(low=min(LakeVolume), high=max(LakeVolume), avg=mean(LakeVolume), n=length(LakeVolume), prop=n/nrow(data)) ## surface area
## low high avg n prop
## 1 111700 9400000000 878218246 104 0.0004558045
chlaLong <- data %>% gather(variable, value, 9:19)
chlaLong[chlaLong$variable=="pH","value"] <- 10^chlaLong[chlaLong$variable=="pH","value"]
ggplot(chlaLong, aes(y=log10(value))) + geom_boxplot() + facet_grid(.~variable, scales="free") +theme(axis.text=element_text(size=12),
axis.title=element_text(size=14,face="bold"))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
## Warning: Removed 1437916 rows containing non-finite values (stat_boxplot).
## Select only repo datasets
data$StudyID <- as.character(data$StudyID)
repoData <- data[grep("repo*", data$StudyID),]
## Number of lakes
repoData %>% group_by(StudyID) %>% summarize(n=length(unique(UniqueLakeName)))
## # A tibble: 14 x 2
## StudyID n
## <chr> <int>
## 1 repo1 39
## 2 repo10 2
## 3 repo11 24
## 4 repo12 49
## 5 repo13 111
## 6 repo14 8218
## 7 repo2 1
## 8 repo3 215
## 9 repo4 332
## 10 repo5 2168
## 11 repo6 96
## 12 repo7 1059
## 13 repo8 7
## 14 repo9 1
## Number of observations
repoData %>% group_by(StudyID) %>% summarize(n=length(uniqueID))
## # A tibble: 14 x 2
## StudyID n
## <chr> <int>
## 1 repo1 1231
## 2 repo10 8
## 3 repo11 24
## 4 repo12 52
## 5 repo13 112
## 6 repo14 209732
## 7 repo2 222
## 8 repo3 219
## 9 repo4 345
## 10 repo5 9568
## 11 repo6 103
## 12 repo7 1162
## 13 repo8 102
## 14 repo9 476
## Timeframe
repoData %>% group_by(StudyID) %>% mutate(Year = as.numeric(as.character(Year))) %>% summarize(first = min(Year), last = max(Year))
## # A tibble: 14 x 3
## StudyID first last
## <chr> <dbl> <dbl>
## 1 repo1 1969 2017
## 2 repo10 2008 2011
## 3 repo11 2000 2000
## 4 repo12 1998 2000
## 5 repo13 1999 2004
## 6 repo14 1933 2013
## 7 repo2 1975 2018
## 8 repo3 1975 1985
## 9 repo4 2015 2015
## 10 repo5 1967 2019
## 11 repo6 1974 2010
## 12 repo7 2007 2007
## 13 repo8 1993 2018
## 14 repo9 1977 2016
method <- read.csv("data//methodsDataV2.csv", stringsAsFactors = F)
method %>% group_by(Survey.Type) %>% summarize(n=length(Survey.Type))
## # A tibble: 3 x 2
## Survey.Type n
## <chr> <int>
## 1 in situ 238
## 2 satellite/modelled 10
## 3 undescribed 37
method %>% group_by(DepthDetails) %>% summarize(n=length(DepthDetails))
## # A tibble: 8 x 2
## DepthDetails n
## <chr> <int>
## 1 "epilimnion " 4
## 2 "hypolimnion" 2
## 3 "integrated water sample" 34
## 4 "metadata" 4
## 5 "range" 99
## 6 "sampling station" 1
## 7 "surface water sample " 43
## 8 "undescribed" 98
max(method$DepthDeep, na.rm=T) ## Lake Baikal
## [1] 250
method %>% group_by(DetectionLimits) %>% summarize(n=length(DetectionLimits))
## # A tibble: 8 x 2
## DetectionLimits n
## <dbl> <int>
## 1 0.00001 40
## 2 0.00005 1
## 3 0.0001 161
## 4 0.0005 1
## 5 0.001 35
## 6 0.01 18
## 7 0.1 18
## 8 NA 11