Locations

Lakes

lakes <- fread('../data/metadata/lakes.csv')

Weather Stations

stations <- fread('../data/weather/wu_stations.csv')

Sample Metadata

metadata_legend <- data.table::fread('../data/metadata/metadata_legend.csv')

2018

metadata_2018 <- data.table::fread('../data/metadata/2018.csv')

Download

2019

metadata_2019 <- data.table::fread('../data/metadata/2019.csv')

Download

2020

metadata_2020 <- data.table::fread('../data/metadata/2020.csv')

Download

All

metadata <- rbind(metadata_2018, metadata_2019)
set(metadata, j = "Date", value = as.Date(metadata$Date, "%m/%d/%Y"))
set(metadata, j = "Year", value = factor(metadata$Year, levels = sort(unique(metadata$Year))))
set(metadata, j = "Week", value = factor(metadata$Week, levels = sort(unique(metadata$Week))))
setorder(metadata, Location, Year, Week)

Risk Levels

metadata[, Risk := "No"]
data.table::set(metadata, which(metadata$Microcystin >= 0.1), "Risk", "Low")
data.table::set(metadata, which(metadata$Microcystin >= 1), "Risk", "Moderate")
data.table::set(metadata, which(metadata$Microcystin >= 8), "Risk", "High")
lake_risk_colors <- schuylR::create_palette(4, 'viridis')
lake_risk <- rep(lake_risk_colors[1], length(unique(metadata$Location)))
lake_risk[metadata[, sum(Risk %in% "Low")>5, by = Location]$V1] <- lake_risk_colors[2]
lake_risk[metadata[, sum(Risk %in% "Moderate")>4, by = Location]$V1] <- lake_risk_colors[3]
lake_risk[metadata[, sum(Risk %in% "High")>2, by = Location]$V1] <- lake_risk_colors[4]

Read Counts

ASV Table

read_counts <- construct_ASVtable('../data/16S_processing/finalized_reads')
read_counts <- read_counts[, c(TRUE, colSums(read_counts[,-1]) > 5000), with = FALSE]
data.table::setnames(read_counts, colnames(read_counts), gsub('_S.*', '', colnames(read_counts)))

Download

Classifications

classifications <- dada2::assignTaxonomy(read_counts[[1]], '../data/16S_processing/databases/rdp_train_set_18.fa.gz')
classifications <- dada2::assignTaxonomy(read_counts[[1]], '../data/16S_processing/databases/silva_nr99_v138.1_train_set.fa.gz')

Download

Weather

weather <- readRDS('../data/weather/weather.RDS')

Stations

station_coords <- fread('../data/weather/station_locations.csv')
station_dist <- dist(data.frame(station_coords[,c("Latitude", "Longitude")],
                                row.names = station_coords$Station))
station_dist <- as.data.table(as.matrix(station_dist))
neighbors <- station_dist[, lapply(.SD, order)]
for(col in colnames(neighbors)){set(neighbors, j = col, value = colnames(neighbors)[neighbors[[col]]])}

Impute Missing Days

k=2
set(weather, weather[,.I[low_temp > (mean(low_temp, na.rm=T)*1.25)], by=Date]$V1,
    c("low_temp", "avg_temp"), NA)
set(weather, weather[,.I[low_temp < (mean(low_temp, na.rm=T)*0.75)], by=Date]$V1,
    c("low_temp", "avg_temp"), NA)
set(weather, weather[,.I[high_temp > (mean(high_temp, na.rm=T)*1.25)], by=Date]$V1,
    c("high_temp", "avg_temp"), NA)
set(weather, weather[,.I[high_temp < (mean(high_temp, na.rm=T)*0.75)], by=Date]$V1,
    c("high_temp", "avg_temp"), NA)
numeric_cols <- colnames(weather)[sapply(weather, is.numeric)]
for(i in weather[,.I[!complete.cases(.SD)], .SDcols=numeric_cols]){
  na_cols <- numeric_cols[!(weather[i,lapply(.SD, complete.cases), .SDcols=numeric_cols])]
  station <- weather[i]$Station
  i_neighbors <- neighbors[,station, with=FALSE]
  i_neighbors <- i_neighbors[get(station) %in% weather[Date == weather[i]$Date]$Station]
  set(weather, as.integer(i), na_cols, 
      unique(weather[Date == weather[i]$Date][Station %in% unlist(i_neighbors[2:(k+1)])][,na_cols,with=F])[,lapply(.SD, mean, na.rm=TRUE), .SDcols = na_cols])
}
setorder(weather, Location, -Date)

Weekly Averages

weather <- cbind(Year = factor(levels = unique(metadata$Year)), weather)
weather <- cbind(Week = factor(levels = unique(metadata$Week)), weather)
for(date in rev(sort(unique(metadata$Date)))){
  set(weather, weather[,.I[Date <= date]], 'Week', value = unique(metadata[Date == date]$Week))
  set(weather, weather[,.I[Date <= date]], 'Year', value = unique(metadata[Date == date]$Year))
}
colsToMean <- colnames(weather)[-c(1:5,20)]
colsToSum <- colnames(weather)[c(20)]  
scols <- list(colsToMean, colsToSum)
funs <- rep(c('mean', 'sum'), lengths(scols))
jexp <- paste0('list(', paste0(unlist(scols), "=", funs, '(', unlist(scols), ')', collapse = ', '), ')')
weather <- weather[, eval(parse(text = jexp)), by = c('Week', 'Year', 'Location')]

Conversion to C

# temp_F_to_C <- function(x, sig.fig=1){round((x-32)*(5/9),sig.fig)}
# 
# set(weather, j = "avg_temp", value = temp_F_to_C(weather$avg_temp))
# set(weather, j = "high_temp", value = temp_F_to_C(weather$high_temp))
# set(weather, j = "low_temp", value = temp_F_to_C(weather$low_temp))
# set(weather, j = "avg_dew", value = temp_F_to_C(weather$avg_dew))
# set(weather, j = "high_dew", value = temp_F_to_C(weather$high_dew))
# set(weather, j = "low_dew", value = temp_F_to_C(weather$low_dew))
# set(weather, j = "avg_wind", value = round(weather$avg_wind/2.237,2))
# set(weather, j = "high_wind", value = round(weather$high_wind/2.237,2))
# set(weather, j = "low_wind", value = round(weather$low_wind/2.237,2))
# set(weather, j = "avg_gust", value = round(weather$avg_gust/2.237,2))
# set(weather, j = "high_gust", value = round(weather$high_gust/2.237,2))
# set(weather, j = "precip", value = round(weather$precip*2.54,3))

Cyanobacteria

cyanobacteria <- dada2::assignSpecies(classifications[Class == "Cyanobacteria"][[1]], refFasta = '../data/16S_processing/databases/rdp_species_assignment_18.fa.gz', verbose = T)

Phyloseq Object

seqs_2018 <- readRDS('../data/16S_processing/dada2/seqtab_2018.RDS')
taxa_silva_2018 <- readRDS('../data/16S_processing/dada2/taxa_silva_species_2018.RDS')
taxa_rdp_2018 <- readRDS('../data/16S_processing/dada2/taxa_rdp_species_2018.RDS')

seqs_2019 <- readRDS('../data/16S_processing/dada2/seqtab_2019.RDS')
taxa_silva_2019 <- readRDS('../data/16S_processing/dada2/taxa_silva_species_2019.RDS')
taxa_rdp_2019 <- readRDS('../data/16S_processing/dada2/taxa_rdp_species_2019.RDS')

sample_data <- data.frame(unique(merge(metadata, weather,
                          by = c("Location", "Week", "Year"),
                          all.x = TRUE)), row.naΓmes = 4)

ps_2018 <- phyloseq::phyloseq(phyloseq::otu_table(seqs_2018, taxa_are_rows=FALSE), 
                         phyloseq::tax_table(taxa_silva_2018),
                         phyloseq::sample_data(data.frame(sample_data, row.names = "File")))
ps_2019 <- phyloseq::phyloseq(phyloseq::otu_table(seqs_2019, taxa_are_rows=FALSE), 
                         phyloseq::tax_table(taxa_silva_2019),
                         phyloseq::sample_data(data.frame(sample_data, row.names = "File")))

lake_po <- phyloseq::merge_phyloseq(ps_2018, ps_2019)
lake_po <- phyloseq::subset_samples(lake_po, sample_sums(lake_po) >= 10000)
low_taxa <- taxa_sums(conglomerate_taxa(lake_po, "Phylum", FALSE))
low_taxa <- names(low_taxa[low_taxa < 1000])
lake_po <- taxa_prune(lake_po, low_taxa, "Phylum", na.rm = TRUE)
lake_po <- taxa_prune(lake_po, c("Chloroplast", "Mitochondria"))
lake_po <- taxa_filter(lake_po)
lake_po
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 41651 taxa and 1060 samples ]
## sample_data() Sample Data:       [ 1060 samples by 38 sample variables ]
## tax_table()   Taxonomy Table:    [ 41651 taxa by 7 taxonomic ranks ]

Download

Maps

iowa <- readRDS('../data/maps/iowa_terrain_map.RDS')
iowa_bw <- readRDS('../data/maps/iowa_terrain_map_bw.RDS')

High and Low Risk Lakes

sam <- dcast(data.table(as(lake_po@sam_data, 'data.frame')), Location ~ Risk, fun.aggregate = length)
setkey(sam, High)
high_risk_lakes <- tail(sam$Location, 8)
low_risk_lakes <-head(sam$Location, 8)

lakes[, Risk := "grey"]
set(lakes, sapply(gsub(" Beach","",high_risk_lakes), grep, lakes$Lake), "Risk", "red")
set(lakes, sapply(gsub(" Beach","",low_risk_lakes), grep, lakes$Lake), "Risk", "blue")

lake_hl <- taxa_filter(lake_po,
                       treatment = "Location", 
                       subset = c(high_risk_lakes, low_risk_lakes))
lake_hl <- merge_treatments(lake_hl, c("Week", "Year"))
sam <- cbind(lake_hl@sam_data, Lake_Risk = "Low")
sam[sam$Location %in% high_risk_lakes,]$Lake_Risk <- "High"
lake_hl@sam_data <- sample_data(sam)
sam <- data.table(sam)
replace_DT_NA(sam)



Schuyler Smith
Ph.D. Student - Bioinformatics and Computational Biology
Iowa State University. Ames, IA.