BLAST_file_path = "../blast/"
if (length(BLAST_file_path) == 1 && dir.exists(BLAST_file_path)) {
BLAST_file_path <- gsub("/$", "", BLAST_file_path)
BLAST_file <- dir(BLAST_file_path, full.names = TRUE)
} else {
BLAST_file <- BLAST_file_path
}
BLAST_file <- sort(normalizePath(BLAST_file))
primer_success <- data.table::data.table(sample = character(),
target = character(),
n_reads = numeric(),
artifacts = numeric(),
analyzed = numeric(),
true_pos = numeric(),
false_pos = numeric(),
false_neg = numeric(),
true_neg = numeric())
for(file in BLAST_file){
sample_name <- gsub('Q3-','',basename(file))
files <- dir(file, full.names = TRUE)
files <- sort(normalizePath(files))
unknown <- files[grep('unknown', files)]
unknown <- data.table::fread(unknown, header = FALSE, sep = '\t',
col.names = c('query', 'reference', 'perc_id', 'length',
'mismatch', 'gap', 'qstart', 'qend',
'rstart', 'rend', 'e', 'bit'))
unknown <- unknown[length >= 90]
setorder(unknown, 'query', 'perc_id')
unknown <- merge(classifications, unknown, by.x = "Target", by.y = "reference")
unknown <- unique(unknown, by=c("query", "Target"))
unknown <- unknown[unknown[, .I[length == max(length, na.rm=T)], by = c('query', 'Target')][[3]]]
unknown <- unknown[unknown[, .I[perc_id == max(perc_id, na.rm=T)], by = c('query', 'Target')][[3]]]
unknown <- unknown[, .(count = .N), by = Target]
for(primer_file in files){
if(file.info(primer_file)$size > 0){
blast <- data.table::fread(primer_file, header = FALSE, sep = '\t')
#'query', 'reference', 'perc_id', 'length', 'mismatch', 'gap',
#'qstart', 'qend', 'rstart', 'rend', 'e', 'bit'
setorder(blast, 'V1', 'V3')
blast <- blast[V3 > 95]
blast <- merge(classifications, blast, by.x = "Target", by.y = "V2")
n_reads <- length(unique(blast[['V1']]))
blast <- blast[V4 >= 90]
analyzed <- length(unique(blast[['V1']]))
n_artifacts <- n_reads - analyzed
primer_target <- gsub('____.?', '', basename(sapply(strsplit(primer_file, "\\."), "[[", 1)))
false_neg <- unknown[Target == primer_target][[2]]
successes <- 0
if(nrow(blast) > 0){
blast <- blast[blast[, .I[V4 == max(V4, na.rm=T)], by = c('V1', 'Target')][[3]]]
blast <- unique(blast, by=c("V1", "Target"))
blast <- blast[, .(count = .N), by = Target]
successes <- blast[Target == primer_target][[2]]
}
if(length(successes) < 1) successes <- 0
if(length(false_neg) < 1) false_neg <- 0
primer_success <- rbind(primer_success,
data.table::data.table(
sample = sample_name,
target = primer_target,
n_reads = n_reads,
artifacts = n_artifacts,
analyzed = analyzed,
true_pos = successes,
false_pos = (analyzed - successes),
true_neg = 0,
false_neg = false_neg))
}
}
set(primer_success, primer_success[, .I[sample == sample_name]], 'true_neg', sum())
print(sample_name)
}
primer_success[,true_neg := sum(true_pos)-true_pos, by = sample]
primer_success <- merge(classifications, primer_success, by.x = 'Target', by.y = 'target')
primer_success <- primer_success[n_reads != 'NA']
primer_success <- primer_success[ARG_Family != '16S']
saveRDS(primer_success, '../data/primers/primer_success.RDS')
data.table::set(primer_success, which(primer_success[["success_rate"]] == 0), "success_rate", NA)
primer_success[, experiment := sapply(strsplit(sample, "-"), "[[", 1)]
read_counts <- data.table::dcast(primer_success, Target ~ sample, value.var=c("true_pos"), fun = sum)
read_counts <- read_counts[c(which(rowSums(read_counts[,-1]) > 0)),]
read_counts <- merge(classifications, read_counts, by = 'Target')
saveRDS(read_counts, '../data/primers/read_counts_from_primers.RDS')
samples <- primer_success[, sum(true_pos, na.rm = T), by = sample][V1 > 1000]$sample
primer_success <- primer_success[sample %in% samples]
primers <- primer_success[, lapply(.SD, sum, na.rm=TRUE), by=Target, .SDcols=c("n_reads", "artifacts", "analyzed", "true_pos", "false_pos") ]
setorder(primers, -n_reads)
sample_success <- primer_success[, lapply(.SD, sum, na.rm=TRUE), by=sample, .SDcols=c("n_reads", "artifacts", "analyzed", "true_pos", "false_pos") ]
setorder(sample_success, sample)
data.table::set(primer_success, which(primer_success[["success_rate"]] == 0), "success_rate", NA)
primer_success[, experiment := sapply(strsplit(sample, "-"), "[[", 1)]
read_counts <- data.table::dcast(primer_success, Target ~ sample, value.var=c("true_pos"), fun = sum)
read_counts <- read_counts[c(which(rowSums(read_counts[,-1]) > 0)),]
read_counts <- merge(classifications, read_counts, by = 'Target')
primer_success[, experiment := NULL]
setorder(primer_success, -n_reads)
primer_success[, sensitivity := round(true_pos/(true_pos + false_neg)*100, 1)]
primer_success[, specificity := round(true_neg/(true_neg + false_pos)*100, 1)]
primer_success[, accuracy := round(true_pos/(true_pos + false_neg)*100, 1)]
Mean sensitivity of each primer, weighted by abundance per sample, was found to be 99.56
Mean specificity of each primer, weighted by abundance per sample, was found to be 99.92
Mean accuracy of each primer, weighted by abundance per sample, was found to be 99.56
primers <- primer_success[, lapply(.SD, sum, na.rm=TRUE), by=Target, .SDcols=c("n_reads", "artifacts", "analyzed", "true_pos", "false_pos", "false_neg", "true_neg") ]
primers[, percent_artifacts := artifacts/n_reads*100]
primers[, percent_tp := true_pos/n_reads*100]
primers[, percent_fp := false_pos/n_reads*100]
primers[, percent_fn := false_neg/n_reads*100]
The number of reads produced per sample and the percentage of those reads that were PCR-artifacts had a negative correlation with an R-squared value = 0.73
Schuyler Smith
Ph.D. Student - Bioinformatics and Computational Biology
Iowa State University. Ames, IA.