Mock-Community Analysis

Theoretical ARG Counts

mock_samples <- as.character(metadata[grep('Mock', metadata$`ID`),][`Passed QC` == TRUE][["Sample"]])
for_blast <- data.table::fread('../data/mock/forward_primers.blast')
  for_blast <- merge(for_blast, classifications, by.x = "V1", by.y = "Target")
  for_blast <- for_blast[Gene != "16S"]
  for_blast <- for_blast[V3 == 100]
  for_blast <- for_blast[V4 >= 18]
rev_blast <- data.table::fread('../data/mock/reverse_primers.blast')
  rev_blast <- merge(rev_blast, classifications, by.x = "V1", by.y = "Target")
  rev_blast <- rev_blast[Gene != "16S"]
  rev_blast <- rev_blast[V3 == 100]
  rev_blast <- rev_blast[V4 >= 18]
mock_blast <- for_blast[paste0( for_blast[[1]], for_blast[[2]]) %in% paste0(rev_blast[[1]], rev_blast[[2]]),]

strains[, Abundance := Genome_copies_per_microgram * micrograms]
mock_genome <- vector()
i=1
for(genome in mock_blast$V2){
    genome <- mock_genome_names[sapply(strsplit(mock_genome_names, ' '), FUN=function(x){any(x %in% genome)})]
    genome <- strains[apply(strains,1,FUN=function(x){any(x %in% unlist(strsplit(genome, " ")))})]$Strain
    mock_genome <- c(mock_genome, genome)
} 
set(mock_blast, j = "V2", value = mock_genome)
setorder(mock_blast, V2, V9)

mock_expected_counts <- mock_blast[,c('V2', 'V1', 'V9', 'Family', 'Class')]
setnames(mock_expected_counts, c('V2','V1'), c('Microbe', 'Target'))
mock_expected_counts <- merge(mock_expected_counts, strains[,c(1:5,14)], by.x = "Microbe", by.y = "Strain")
mock_expected_counts <- mock_expected_counts[, .(count = sum(Abundance)), by = c( 'Class', "Family", 'Target', "Microbe")]
mock_expected_counts[, count := round(count/sum(count), 4)]
mock_expected_counts[, Sample := 'Theoretical']
setcolorder(mock_expected_counts, c("Sample", "Microbe", "Family", "Class", 'Target', "count"))

Profile

graph_data <- mock_expected_counts[, c("Sample", "Class", "count")]
graph_data <- graph_data[, lapply(.SD, sum, na.rm=TRUE), by=Class, .SDcols=c("count")]
set(graph_data, j = "Class", value = factor(graph_data$Class, levels = sort(graph_data$Class)))
setorder(graph_data, -Class)
graph_data[, ypos := cumsum(count)- 0.5*count]
mock_profile <- ggplot(graph_data, aes(x="", y=count, fill=Class)) +
  geom_bar(stat="identity", width=1) +
  coord_polar("y", start=0) +
  theme_void() + 
  scale_fill_manual(values = schuylR::create_palette(length(unique(graph_data[['Class']])))) +
  geom_text(aes(y = ypos, label = Class), color = "black", size=4, nudge_x = 0.3)

ARG Abundances

Primer Success

mock_expected_ARGs <- c(unique(sort(mock_expected_counts$Target)), 'Ref')
mock_observed_counts <- read_counts[, c(1:3, grep('Mock', colnames(read_counts))), with = FALSE]
mock_observed_counts <- mock_observed_counts[rowSums(mock_observed_counts[,-c(1:3)]) > 1,]
mock_successful_primers <- mock_expected_ARGs[mock_expected_ARGs %in% mock_observed_counts$Target]
unexpected_primers <- mock_observed_counts$Target[!(mock_observed_counts$Target %in% mock_expected_ARGs)]
setnames(mock_observed_counts, colnames(mock_observed_counts)[-c(1:3)], 
         as.character(metadata$Sample[match(colnames(mock_observed_counts)[-c(1:3)], metadata$ID)]))

ARG-Heatmap

classification <- 'Family'
expected_genes <- mock_expected_counts[,c('Sample', classification, 'count'), with=FALSE]
expected_genes[, Abundance := count * 20000]
expected_genes <- dcast(expected_genes, Family ~ Sample, value.var = 'Abundance', fun.aggregate = sum)

graph_data <- mock_observed_counts[get(classification) %in% c(mock_expected_counts[[classification]], "Ref")]
graph_data <- merge(expected_genes, graph_data[,-c(1:3)[!(colnames(graph_data)[1:3] %in% classification)], 
                                               with=FALSE], by = classification, all = TRUE)
graph_data <- schuylR::replace_NA(graph_data)
graph_data <- rarefy(data=graph_data, min(colSums(graph_data[,-1])))
graph_data <- melt(graph_data, id.vars = classification, variable.name = 'Sample', value.name = 'Abundance')
set(graph_data, j = 'Sample', value = factor(graph_data[['Sample']], 
                                               levels = c('Theoretical', mock_samples)))
graph_data <- graph_data[, lapply(.SD, sum, na.rm=TRUE), by = c('Sample', classification), .SDcols=c("Abundance")]

set(graph_data, j = classification, value = factor(as.character(graph_data[[classification]]), 
      levels = rev(c(sort(unique(graph_data[get(classification) != 'Ref'][[classification]])),'Ref'))))
set(graph_data, which(graph_data[['Abundance']] < 1), j='Abundance', value=NA)
set(graph_data, j = "Abundance", value = log(graph_data$Abundance, base = 2))
ARG_counts <- ggplot(graph_data, aes_string("Sample", classification, fill = "Abundance")) + 
  geom_tile(color = "white", size = 0.25) +
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 20, hjust = 1, size = 10), 
        axis.text.y = element_text(hjust = 0.95, size = 10), 
        axis.title.x = element_text(size = 10, face = "bold"), 
        axis.title.y = element_text(size = 10, face = "bold"),
        axis.ticks.x = element_blank(), 
        legend.title = element_text(size = 8, face = "bold"), 
        legend.text = element_text(size = 8),
        legend.spacing.x = unit(0.005, "npc"), 
        legend.key.size = unit(6, "mm"), 
        legend.background = element_rect(fill = (alpha = 0), color = "black", size = 0.25), 
        panel.background = element_rect(color = "black",  size = 1.4), 
        strip.text.x = element_text(size = 10, face = "bold"), 
        strip.background = element_rect(colour = "black", size = 1.4)) + 
  scale_x_discrete(expand = expansion(mult = 0, add = 0.53)) +  
  ggraph::scale_fill_viridis(direction = -1, limits = c(min(graph_data$Abundance, na.rm = T), max(graph_data$Abundance, na.rm = T))) +
  labs(x = "Sample")

DARTE-QM found 55 (98.2%) of the 56 primers in the Mock-Community pangenomes that were targeted.

Primer-Heatmap

DARTE-QM found 80 (94.1%) of the 85 primers in the Mock-Community pangenomes that were targeted.

classification <- 'Target'
expected_genes <- mock_expected_counts[,c('Sample', classification, 'count'), with=FALSE]
expected_genes[, Abundance := count * 20000]
expected_genes <- dcast(expected_genes, Target ~ Sample, value.var = 'Abundance', fun.aggregate = sum)

graph_data <- mock_observed_counts[get(classification) %in% c(mock_expected_counts[[classification]], "Ref")]
graph_data <- merge(expected_genes, graph_data[,-c(1:3)[!(colnames(graph_data)[1:3] %in% classification)], 
                                               with=FALSE], by = classification, all = TRUE)
graph_data <- schuylR::replace_NA(graph_data)
graph_data <- rarefy(data=graph_data, min(colSums(graph_data[,-1])))
graph_data <- melt(graph_data, id.vars = classification, variable.name = 'Sample', value.name = 'Abundance')
set(graph_data, j = 'Sample', value = factor(graph_data[['Sample']], 
                                               levels = c('Theoretical', mock_samples)))
primer_graph_data <- graph_data[, lapply(.SD, sum, na.rm=TRUE), by = c('Sample', classification), .SDcols=c("Abundance")]
setorder(primer_graph_data, Abundance)
set(primer_graph_data, j = classification, value = factor(as.character(primer_graph_data[[classification]]), 
                                                   levels = unique(as.character(primer_graph_data[[classification]]))))
set(primer_graph_data, which(primer_graph_data[['Abundance']] < 1), j='Abundance', value=NA)

primer_graph_data$Target <- gsub('_.?_DARTE-QM', '', primer_graph_data$Target)
set(primer_graph_data, j = 'Target', value = factor(primer_graph_data$Target, levels = rev(gsub('_.?_DARTE-QM', '', mock_expected_ARGs))))

Primer_counts <- ggplot(primer_graph_data, aes_string("Sample", classification, fill = "Abundance")) + 
  geom_tile(color = "white", size = 0.25) +
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 20, hjust = 1, size = 10), 
        axis.text.y = element_text(hjust = 0.95, size = 10), 
        axis.title.x = element_text(size = 10, face = "bold"), 
        axis.title.y = element_text(size = 10, face = "bold"),
        axis.ticks.x = element_blank(), 
        legend.title = element_text(size = 8, face = "bold"), 
        legend.text = element_text(size = 8),
        legend.spacing.x = unit(0.005, "npc"), 
        legend.key.size = unit(6, "mm"), 
        legend.background = element_rect(fill = (alpha = 0), color = "black", size = 0.25), 
        panel.background = element_rect(color = "black",  size = 1.4), 
        strip.text.x = element_text(size = 10, face = "bold"), 
        strip.background = element_rect(colour = "black", size = 1.4)) + 
  scale_x_discrete(expand = expansion(mult = 0, add = 0.53)) +  
  ggraph::scale_fill_viridis(limits = c(min(graph_data$Abundance, na.rm = T), max(graph_data$Abundance, na.rm = T))) +
  labs(x = "Sample")

ARG-Family-Heatmap

classification <- 'Class'
expected_genes <- mock_expected_counts[,c('Sample', classification, 'count'), with=FALSE]
expected_genes[, Abundance := count * 20000]
expected_genes <- dcast(expected_genes, Class ~ Sample, value.var = 'Abundance', fun.aggregate = sum)

graph_data <- mock_observed_counts[get(classification) %in% c(mock_expected_counts[[classification]], "Ref")]
graph_data <- merge(expected_genes, graph_data[,-c(1:3)[!(colnames(graph_data)[1:3] %in% classification)], 
                                               with=FALSE], by = classification, all = TRUE)
graph_data <- schuylR::replace_NA(graph_data)
graph_data <- rarefy(data=graph_data, min(colSums(graph_data[,-1])))
graph_data <- melt(graph_data, id.vars = classification, variable.name = 'Sample', value.name = 'Abundance')
set(graph_data, j = 'Sample', value = factor(graph_data[['Sample']], 
                                               levels = c('Theoretical', mock_samples)))
graph_data <- graph_data[, lapply(.SD, sum, na.rm=TRUE), by = c('Sample', classification), .SDcols=c("Abundance")]

setorder(graph_data, Abundance)
set(graph_data, j = classification, value = factor(as.character(graph_data[[classification]]), levels = unique(as.character(graph_data[[classification]]))))
set(graph_data, which(graph_data[['Abundance']] < 1), j='Abundance', value=NA)

setorder(graph_data, Sample)
Class_counts <- ggplot(graph_data, aes_string("Sample", classification, fill = "Abundance")) + 
  geom_tile(color = "white", size = 0.25) +
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 20, hjust = 1, size = 10), 
        axis.text.y = element_text(hjust = 0.95, size = 10), 
        axis.title.x = element_text(size = 10, face = "bold"), 
        axis.title.y = element_text(size = 10, face = "bold"),
        axis.ticks.x = element_blank(), 
        legend.title = element_text(size = 8, face = "bold"), 
        legend.text = element_text(size = 8),
        legend.spacing.x = unit(0.005, "npc"), 
        legend.key.size = unit(6, "mm"), 
        legend.background = element_rect(fill = (alpha = 0), color = "black", size = 0.25), 
        panel.background = element_rect(color = "black",  size = 1.4), 
        strip.text.x = element_text(size = 10, face = "bold"), 
        strip.background = element_rect(colour = "black", size = 1.4)) + 
  scale_x_discrete(expand = expansion(mult = 0, add = 0.53)) +  
  ggraph::scale_fill_viridis(limits = c(min(graph_data$Abundance, na.rm = T), max(graph_data$Abundance, na.rm = T))) +
  labs(x = "Sample")

Distance Matrix

dist_matrix <- primer_graph_data
dist_matrix <- dcast(dist_matrix[Target %in% dist_matrix[, sum(Abundance), by = 'Target'][V1 > 0]$Target],
           Target ~ Sample, value.var = 'Abundance')
replace_DT_NA(dist_matrix)
dist_matrix <- as.matrix(round(vegdist(t(dist_matrix[,-c(1)])), 3))
dist_matrix_plot <- dist_matrix
dist_matrix_plot[upper.tri(dist_matrix)] <- NA
dist_matrix_plot <- data.table(melt(dist_matrix_plot))
set(dist_matrix_plot, j = "Var1", value = factor(dist_matrix_plot$Var1, levels = rev(unique(dist_matrix_plot$Var1))))
dist_matrix_plot <- ggplot(data = dist_matrix_plot, aes(x=Var2, y=Var1, fill=value)) + 
  geom_tile() + 
  viridis::scale_fill_viridis(limits=c(0,1), na.value="white") + 
  theme(
    axis.text.x = element_text(hjust = 0.95, angle = 20),
    panel.background = element_rect(fill = "white", color = "white")
  ) + 
  labs(x = "", y = "", fill = "BC-Distance") +
  geom_text(aes(Var2, Var1, label = round(value,2)), color = "white", size = 3)

Assessing the ARGs found in the pangenome of the mock-community, DARTE-QM showed high precision with the mean distance between all observed-samples found to be 0.1, whereas the accuracy compared to our theoretical distribution was weaker, with a mean distance of 0.52.

Synthetic Oligonucleotide ‘Ref’

graph_data <- melt(Ref_counts, id.vars = c('Family','Class', 'Target'))
setnames(graph_data, colnames(graph_data[,-c(1:3)]), c("Sample", "Ref"))
graph_data[, Ref_Level := as.numeric(gsub('-','.',
              gsub('Mock-','', gsub('spike-.?','', metadata$ID[match(as.character(graph_data$Sample), metadata$Sample)]))))]
setkey(graph_data, 'Ref_Level')
graph_data[, Color := schuylR::create_palette(length(unique(graph_data[['Ref_Level']])))[as.factor(graph_data[['Ref_Level']])]]

The added levels of the Ref-sequence were detected with a linear relation to their laboratory concentrations with an \(R^{2}\) = 0.91.

Linear Regression

Ref_points <-  ggplot(graph_data, aes_string(x = 'Ref_Level', y = 'Ref')) + 
  geom_smooth(formula = y~x, method = "lm", se = FALSE, aes(x = Ref_Level, y = Ref), color = 'black' ) + 
  geom_point(stat = "identity", size = 4, alpha = 1, position = position_dodge(width = 0), color = graph_data$Color) + 
  theme_bw() + 
  theme(axis.text.x = element_text(hjust = 1, size = 10), 
    axis.text.y = element_text(hjust = 0.95, size = 10), 
    axis.title.x = element_text(size = 12, face = "bold"), 
    axis.title.y = element_text(size = 12, face = "bold"), 
    axis.ticks.x = element_blank(), 
    legend.title = element_text(size = 10, face = "bold"), 
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.005, "npc"), 
    legend.key.size = unit(4, "mm"), 
    legend.background = element_rect(fill = (alpha = 0)), 
    panel.background = element_rect(color = "black", size = 1.5, fill = "white"), 
    panel.spacing = unit(0.015, "npc"), 
    strip.text.x = element_text(size = 10, face = "bold", color = "black"), 
    strip.background = element_rect(colour = "black", size = 1.4, fill = "white")) + 
  labs(x = "Ref Level", y = "Ref Sequences Detected at >= 98% Identity")

Bar Chart

Ref_bars <-  ggplot(graph_data,
                      aes_string(x = 'Sample', y = 'Ref')) + 
  geom_bar(stat = "identity", size = 0.45, alpha = 1, position = "dodge",
           fill = graph_data$Color) + 
  theme_light() +
  theme(
    axis.line.x = element_line(
      colour = 'black',
      size = 1,
      linetype = 'solid'
    ),
    axis.line.y = element_line(
      colour = 'black',
      size = 1,
      linetype = 'solid'
    ),
    axis.text.x = element_text(
      size = 10,
      vjust = 1,
      hjust = 1,
      angle = 30
    ),
    axis.title.x = element_text(size = 12, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 12, face = "bold"),
    legend.background = element_rect(fill = (alpha = 0)),
    legend.key.size = unit(4, "mm"),
    legend.spacing.x = unit(0.005, 'npc'),
    strip.text.x = element_text(size = 10, face = 'bold', color = 'black'),
    strip.background = element_rect(colour = 'black', size = 1.4, fill = 'white')
  ) +
  scale_y_continuous(expand = expansion(mult = c(0.0025, 0.01))) +
  ylab("Read Count of Ref Sequence") + 
  xlab("")

Table

Unexpected ARGs

ARGs

classification = "Family"
graph_data <- mock_observed_counts[!(get(classification) %in% mock_expected_counts[[classification]])]

graph_data <- melt(graph_data, id.vars = colnames(graph_data)[1:3], variable.name = 'Sample')
graph_data <- graph_data[, sum(value), by = c('Sample', classification)]
set(graph_data, j = 'Sample', value = factor(graph_data[['Sample']], levels = mock_samples))

graph_data[[classification]] <- factor(graph_data[[classification]], 
                                       levels = c('Ref', rev(unique(graph_data[[classification]][!(graph_data[[classification]]=='Ref')]))))
set(graph_data, which(graph_data[['V1']] < 1), j='V1', value=NA)

unexpected_graph <- ggplot(graph_data, aes_string("Sample", classification, fill = "V1")) + 
  geom_tile(color = "white", size = 0.25) +
  guides(colour = guide_legend(ncol = ceiling(length(
    unique(graph_data[[classification]])
  ) / 25))) +
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 10), 
        axis.text.y = element_text(hjust = 0.95, size = 10), 
        axis.title.x = element_text(size = 10, face = "bold"), 
        axis.title.y = element_text(size = 10, face = "bold"), 
        axis.ticks.x = element_blank(), legend.title = element_text(size = 8, face = "bold"), 
        legend.text = element_text(size = 8), 
        legend.spacing.x = unit(0.005, "npc"), 
        legend.key.size = unit(6, "mm"), 
        legend.background = element_rect(fill = (alpha = 0), color = "black", size = 0.25), 
        panel.background = element_rect(color = "black", size = 1.4), 
        strip.text.x = element_text(size = 10, face = "bold"), 
        strip.background = element_rect(colour = "black", size = 1.4)) + 
  scale_x_discrete(expand = expansion(mult = 0, add = 0.53)) + 
  labs(x = "Sample", fill = "Abundance") + 
  ggraph::scale_fill_viridis(limits = c(0.0001, max(graph_data$V1, na.rm = T)))

Primers

DARTE-QM also detected 65 primers that were not found in the sequenced genomes of the mock-community.

classification = "Target"
graph_data <- mock_observed_counts[Target %in% unexpected_primers]
graph_data <- melt(graph_data, id.vars = colnames(graph_data)[1:3], variable.name = 'Sample')
graph_data <- graph_data[, sum(value), by = c('Sample', classification)]
set(graph_data, j = 'Sample', value = factor(graph_data[['Sample']], levels = mock_samples))

graph_data[[classification]] <- factor(graph_data[[classification]], 
                                       levels = c('Ref', rev(unique(graph_data[[classification]][!(graph_data[[classification]]=='Ref')]))))
set(graph_data, which(graph_data[['V1']] < 1), j='V1', value=NA)

unexpected_graph <- ggplot(graph_data, aes_string("Sample", classification, fill = "V1")) + 
  geom_tile(color = "white", size = 0.25) +
  guides(colour = guide_legend(ncol = ceiling(length(
    unique(graph_data[[classification]])
  ) / 25))) +
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 10), 
        axis.text.y = element_text(hjust = 0.95, size = 10), 
        axis.title.x = element_text(size = 10, face = "bold"), 
        axis.title.y = element_text(size = 10, face = "bold"), 
        axis.ticks.x = element_blank(), 
        legend.title = element_text(size = 8, face = "bold"), 
        legend.text = element_text(size = 8), 
        legend.spacing.x = unit(0.005, "npc"), 
        legend.key.size = unit(6, "mm"), 
        legend.background = element_rect(fill = (alpha = 0), color = "black", size = 0.25), 
        panel.background = element_rect(color = "black", size = 1.4), 
        strip.text.x = element_text(size = 10, face = "bold"), 
        strip.background = element_rect(colour = "black", size = 1.4)) + 
  scale_x_discrete(expand = expansion(mult = 0, add = 0.53)) + 
  labs(x = "Sample", fill = "Abundance") + 
  ggraph::scale_fill_viridis(limits = c(0.0001, max(graph_data$V1, na.rm = T)))

Schuyler Smith
Ph.D. Student - Bioinformatics and Computational Biology
Iowa State University. Ames, IA.