FASTQ File Loading
The only required files for this section of the reproducible analysis is the raw FASTQ files available from the data repository. Please download the required FASTQ files first and place them into a single folder for analysis. The 50 isolate samples will be analyzed separately from the 27 environmental samples.
knitr::opts_chunk$set(echo = TRUE)
#SET CORRECT WORKING DIRECTORY CONTAINING FASTQ FILES
library(dada2, quietly=TRUE); packageVersion("dada2")
## [1] '1.22.0'
library(Biostrings, quietly=TRUE)
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
library(ShortRead, quietly=TRUE)
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(DECIPHER, quietly=TRUE)
library(ggmsa, quietly=TRUE)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
## ggmsa v1.0.0 Document: http://yulab-smu.top/ggmsa/
##
## If you use ggmsa in published research, please cite: DOI: 10.18129/B9.bioc.ggmsa
library(ggplot2, quietly=TRUE)
library(reshape2, quietly=TRUE)
#Set correct working directory here first before proceding with the following analysis steps! Set to directory containing the isolate FASTQ files.
path <- "~/Desktop/Isolate_FASTQs"
fn <- list.files(path, pattern=".fq$", full.names=TRUE)
sapply(fn, function(f) length(getSequences(f)))
FWD <- "AGAGTTTGATCMTGGC" # Loop 16S forward primer
REV <- "TACCTTGTTACGACTT" # Loop 16S reverse primer
nop <- file.path(path, "nop", basename(fn))
out <- removePrimers(fn, nop, FWD, rc(REV), verbose=TRUE)
filt <- file.path(path, "filtered", basename(fn))
track <- filterAndTrim(nop, filt, maxEE=0.12, minLen=1400, maxLen=1600, verbose=TRUE)
err <- learnErrors(filt, multi=TRUE, verbose=0)
dd <- dada(filt, err, OMEGA_A=1e-10, DETECT_SINGLETONS=TRUE, multi=TRUE, verbose=0)
sta <- makeSequenceTable(dd)
st <- removeBimeraDenovo(sta, minFoldParentOverAbundance=4.5, multi=TRUE, verbose=TRUE)
tax <- assignTaxonomy(st, "silva_nr99_v138_train_set.fa.gz", minBoot=80, multi=TRUE)
sq.salm <- getSequences(tax)[tax[,"Genus"] %in% "Salmonella"]
unname(colSums(st[,sq.ec]))
uncolname <- function(x) { y <- x; colnames(y) <- NULL; y }
View(uncolname(st[,sq.salm]))
We are expecting only Salmonella in these bacterial isolates so in the final count table we can sort by species focused only on Salmonella. Next we can focus on a specific sample and determine the ASV ratio.
#Backup file load if not running the above section, contains the sq.salm object generated above.
load(file="Seroplacer_Sqsalm.Rdata")
uncolname <- function(x) { y <- x; colnames(y) <- NULL; y }
SE_Table<-uncolname(st[,sq.salm])
SE_Subset<-SE_Table[,which(SE_Table[12,]>0)]
SE_Subset2<-SE_Subset[11:15,]
print(SE_Subset2)
## [,1] [,2]
## 3509_sample_NC_WHO_S088_contig_list_trimmed.fq 0 0
## 3509_sample_NC_WHO_S091_contig_list_trimmed.fq 15 48
## 3509_sample_NC_WHO_S096_contig_list_trimmed.fq 0 0
## 3509_sample_NC_WHO_S097_contig_list_trimmed.fq 0 0
## 3509_sample_NC_WHO_S306_contig_list_trimmed.fq 0 0
We see that sample NC_WHO_S091 contains only two ASVs with counts 15 and 48. Knowing that 7 copies exist in the genome, we can assign a ratio of 2:5. 2 copies of the 15 count ASV, and 5 copies of the 48 count ASV. Finally the FASTA file can be saved for storage and input into the Seroplacer R package. We will use the NC_WHO_S019 sample as an example for a serovar prediction.
library(phylotools)
Sequence1<-rep(colnames(st)[12],2)
Sequence2<-rep(colnames(st)[13],5)
Raw_Sequences<-c(Sequence1,Sequence2)
Labels<-c("ASV1","ASV2","ASV3","ASV4","ASV5","ASV6","ASV7")
FastaTable<-cbind(Raw_Sequences,Labels)
colnames(FastaTable)<-c("seq.name","seq.text")
dat2fasta(dat = FastaTable, outfile = "test.fasta")
Finally after generating the ASV fasta file, we can input into the Seroplacer R package and generate a result.
library(Seroplacer)
library(ape)
##
## Attaching package: 'ape'
## The following object is masked from 'package:ShortRead':
##
## zoom
## The following object is masked from 'package:Biostrings':
##
## complement
Test<-read.dna("~/Desktop/test.fasta",format = "fasta",as.matrix = FALSE)
Queries_Aligned_To_MSA<-mafft_wrap(Reference = "Salmonella",Query = Test,options = "--keeplength")
Trimmed<-MSA_Trim(MSA = Queries_Aligned_To_MSA)
Prepared_Sequences<-Data_Preparation(Aligned_Queries = Trimmed,Reference = "Salmonella")
JPlace_Result<-epa_ng_wrap(msa = Prepared_Sequences[[2]],query = Prepared_Sequences[[1]],Species = "Salmonella")
MRCA_Node<-Clade_Hit_Finder_Pendant_Final(JPlace_Object = JPlace_Result,Pendant_Multi = 1.5,Species = "Salmonella")
Full_Placement_Results_Table<-Placement_Results_Output_Full(MRCA = MRCA_Node,Species = "Salmonella")
Full_Placement_Results_Table
## Serovar Fraction Matches in Final Clade Maximum Depth
## 1 Enteritidis 99.6610169 294 0.001386
## 2 Moscow 0.3389831 1 0.000444
## Sum of Branch Lengths
## 1 0.158995
## 2 0.000444
Serovar_Prediction<-Sero_Result(Sero = Full_Placement_Results_Table)
head(Serovar_Prediction)
## [1] "Enteritidis"
#Set correct working directory here first before proceding with the following analysis steps! Set to directory containing the isolate FASTQ files.
path <- "~/Desktop/Environmental_FASTQs"
fn <- list.files(path, pattern=".fq$", full.names=TRUE)
sapply(fn, function(f) length(getSequences(f)))
FWD <- "AGAGTTTGATCMTGGC" # Loop 16S forward primer
REV <- "TACCTTGTTACGACTT" # Loop 16S reverse primer
nop <- file.path(path, "nop", basename(fn))
out <- removePrimers(fn, nop, FWD, rc(REV), verbose=TRUE)
filt <- file.path(path, "filtered", basename(fn))
track <- filterAndTrim(nop, filt, maxEE=0.5, minLen=1400, maxLen=1600, verbose=TRUE)
err <- learnErrors(filt, multi=TRUE, verbose=0)
dd <- dada(filt, err, OMEGA_A=1e-10, DETECT_SINGLETONS=TRUE, multi=TRUE, verbose=0)
sta <- makeSequenceTable(dd)
st <- removeBimeraDenovo(sta, minFoldParentOverAbundance=4.5, multi=TRUE, verbose=TRUE)
save(st,file="stsave.Rdata")
tax <- assignTaxonomy(st, "silva_nr99_v138.1_train_set.fa.gz", minBoot=80, multi=TRUE)
sq.salm <- getSequences(tax)[tax[,"Genus"] %in% "Salmonella"]
sq.ec<- getSequences(tax)[tax[,"Genus"] %in% "Escherichia-Shigella"]
unname(colSums(st[,sq.ec]))
uncolname <- function(x) { y <- x; colnames(y) <- NULL; y }
View(uncolname(st[,sq.salm]))
View(uncolname(st[,sq.ec]))