Seroplacer Reproducible Analysis

Data and Package Loading Section

It is mandatory to load Seroplacer_Core_Files for the analysis section, all analysis can be re-run using these core data files. Some computational steps can take fairly long, so the analysis can be more quickly replicated by using pre-built output files found in Fig1_Files and Fig4_Files.

knitr::opts_chunk$set(echo = TRUE)
setwd("~/Desktop")
library(ggplot2)
library(ggtree)

## ggtree v3.4.4 For help: https://yulab-smu.top/treedata-book/
## 
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
## 
## S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L
## Zhan, T Wu, E Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact
## visualization of richly annotated phylogenetic data. Molecular Biology
## and Evolution. 2021, 38(9):4039-4042. doi: 10.1093/molbev/msab166
## 
## LG Wang, TTY Lam, S Xu, Z Dai, L Zhou, T Feng, P Guo, CW Dunn, BR
## Jones, T Bradley, H Zhu, Y Guan, Y Jiang, G Yu. treeio: an R package
## for phylogenetic tree input and output with richly annotated and
## associated data. Molecular Biology and Evolution. 2020, 37(2):599-603.
## doi: 10.1093/molbev/msz240

library(phangorn)

## Loading required package: ape

## 
## Attaching package: 'ape'

## The following object is masked from 'package:ggtree':
## 
##     rotate

library(ape)
library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ✔ purrr   0.3.4      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks ggtree::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(treeio)

## treeio v1.20.2 For help: https://yulab-smu.top/treedata-book/
## 
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## LG Wang, TTY Lam, S Xu, Z Dai, L Zhou, T Feng, P Guo, CW Dunn, BR
## Jones, T Bradley, H Zhu, Y Guan, Y Jiang, G Yu. treeio: an R package
## for phylogenetic tree input and output with richly annotated and
## associated data. Molecular Biology and Evolution. 2020, 37(2):599-603.
## doi: 10.1093/molbev/msz240
## 
## S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L
## Zhan, T Wu, E Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact
## visualization of richly annotated phylogenetic data. Molecular Biology
## and Evolution. 2021, 38(9):4039-4042. doi: 10.1093/molbev/msab166
## 
## G Yu. Data Integration, Manipulation and Visualization of Phylogenetic
## Trees (1st ed.). Chapman and Hall/CRC. 2022. ISBN: 9781032233574
## 
## Attaching package: 'treeio'
## 
## The following object is masked from 'package:ape':
## 
##     drop.tip

#Set correct working directory to pre-load the required files for analysis. These files are generated in the previous section describing initial database construction.
load("/Users/dogrinev/Desktop/Seroplacer_Core_Files.Rdata")
load("/Users/dogrinev/Desktop/Fig1_Files.Rdata")

#If you are using the pre-generated files to save time for the reproducible analysis, set the appropriate working directory here:
load("/Users/dogrinev/Desktop/Fig4_Files.Rdata")

Figure 2, Heatmap of Average Dissimilarity Between Serovars

First we prepare the data in order to generate the heatmap. We want to cut down and only focus on larger serovars so we trim small serovars first and remove them from analysis. This will clean up the heatmap and also save computational time.

Tree_Assemblies2<-Tree_Assemblies2_Heatmap
FullTestQueries<-FullTestQueries_Heatmap
Full_16s_Data_Aligned<-Full_16s_Data_Heatmap
SerovarTable_SE2<-SerovarTable_Heatmap
vert.tree.SE<-Tree_Heatmap

Cut_Index<-which(Tree_Assemblies2 %in% FullTestQueries)
Full_16s_Data_Cut<-Full_16s_Data_Aligned[Cut_Index]
HeatmapTestQueries<-map_chr(.x = 1:1441,.f = ~Full_16s_Data_Cut[[.]][1])

Next we load all of the required functions for the heatmap calculations.

Super_HAM_Mismatcher<-function(TestSequence,ReferenceSequence)
{
  Row1<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[2],exclude = c("n","N","?"),ignore.case = TRUE))
  Row2<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[3],exclude = c("n","N","?"),ignore.case = TRUE))
  Row3<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[4],exclude = c("n","N","?"),ignore.case = TRUE))
  Row4<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[5],exclude = c("n","N","?"),ignore.case = TRUE))
  Row5<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[6],exclude = c("n","N","?"),ignore.case = TRUE))
  Row6<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[7],exclude = c("n","N","?"),ignore.case = TRUE))
  Row7<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[8],exclude = c("n","N","?"),ignore.case = TRUE))
  Hamming_Table_Result<-rbind(Row1,Row2,Row3,Row4,Row5,Row6,Row7)
  misms <- apply(cmb, 1, function(ii, dd=Hamming_Table_Result) {
    sum(dd[1,ii[[1]]], dd[2,ii[[2]]], dd[3,ii[[3]]],
        dd[4,ii[[4]]], dd[5,ii[[5]]], dd[6,ii[[6]]],
        dd[7,ii[[7]]])
  })
  Best_Order<-as.integer(cmb[which(misms==min(misms))[1],])
  FirstSeq<-paste(TestSequence[1],TestSequence[2],TestSequence[3],TestSequence[4],TestSequence[5],TestSequence[6],TestSequence[7])
  RefCut<-ReferenceSequence[2:8]
  SecondSeq<-paste(RefCut[Best_Order[1]],RefCut[Best_Order[2]],RefCut[Best_Order[3]],RefCut[Best_Order[4]],RefCut[Best_Order[5]],RefCut[Best_Order[6]],RefCut[Best_Order[7]])
  Diffs<-string.diff(a = FirstSeq,b = SecondSeq,exclude = c("n","N","?"),ignore.case = TRUE)
  return(Diffs)
}

Zero_Remover<-function(col)
{
  Zero_Index<-which(col==0)[1]
  newcol<-col[-Zero_Index]
  return(newcol)
}

Sero_Full_Averager<-function(Input)
{
  TestSero<-Input
  Sero_Index<-which(ST_Subset$Serovar==TestSero)
  Sero_Matrix<-map_dfc(.x = Sero_Index,.f = ~unlist(Heatmap_MainList[[.]]))
  Results<-unlist(map(.x = Seros,.f = ~Sero_Averager(Sero = TestSero,Sero2 = .,Sero_Matrix = Sero_Matrix)))
  return(Results)
}

Sero_Averager<-function(Sero,Sero2,Sero_Matrix)
{
  if(Sero==Sero2)
  {
    Sero_Matrix<-Sero_Matrix
    Sero_Index2<-which(ST_Subset$Serovar==Sero2)
    Sero_Matrix2<-Sero_Matrix[Sero_Index2,]
    Sero_Matrix3<-as.matrix(map_dfc(.x = 1:ncol(Sero_Matrix2),.f = ~Zero_Remover(unlist(as.vector(Sero_Matrix2[,.])))))
    Means_col<-unlist(map(.x = 1:ncol(Sero_Matrix3),.f = ~mean(Sero_Matrix3[,.])))
    Final<-mean(Means_col)
    return(Final)
  }
  if(Sero!=Sero2)
  {
    Sero_Matrix<-Sero_Matrix
    Sero_Index2<-which(ST_Subset$Serovar==Sero2)
    Sero_Matrix2<-as.matrix(Sero_Matrix[Sero_Index2,])
    Means_col<-unlist(map(.x = 1:ncol(Sero_Matrix2),.f = ~mean(Sero_Matrix2[,.])))
    Final<-mean(Means_col)
    return(Final)
  }
}

This next section runs the bulk of the calculations. This is the most time consuming step and will not be evaluated as part of this code chunk for time. The generated file will be loaded in the next step, but is calculated with the following lines. The heatmap is generated by calculating the average number of mismatches between each serovar to itself and also all other serovars. Mismatch values are averaged for summarization in this heatmap.

Heatmap_MainList<-as.list(NULL)

for(i in 1:1441)
{
  Index<-HeatmapTestQueries[i]
  TestSequence<-Full_16s_Data_Cut[[i]][2:8]
  Test<-map(.x = 1:1441,.f = ~Super_HAM_Mismatcher(TestSequence = TestSequence,ReferenceSequence = Full_16s_Data_Cut[[.]]))
  Heatmap_MainList[[i]]<-Test
}

After full nucleotide mismatch calculation, we process the data and reformat for generation of a clean heatmap image.

Figure 2 plot generation. We create the heatmap and analyze the mismatch differences across various serovars.

ggplot(Heatmap_DF, aes(Col1, Col2, fill= Heatmap_Vec)) + 
  geom_tile()+
  scale_fill_gradient2(low = "black",mid = "paleturquoise3",high = "paleturquoise1",midpoint = 110,name="Average Mismatches")+
  theme(axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.x = element_text(angle = 90,vjust = 0.5, hjust = 1))

#ggsave(filename = "Figure2.pdf",device = "pdf",width = 5,height = 5,dpi = 600)

Figure 3 Generation - Typhimurium Section

In this section we prepare the image which we will use to visualize the clustering of clades within our tree. For both, we will trim extra assemblies for cleaner visualization, in this first section we focus on the Typhimurium clade.

Note: The image is too large to visualize nicely within this HTML file, in order to properly see the image output, it is reccomended to save the final file and view in image or PDF software. Use the commented ggsave line after figure generate to save an appropriately sized version of the final figure.

#Locating Typhimurium Clade

#First we use a Typhimurium entry to find the MRCA of the clade, we will have to check which is the best for visualization
which(vert.tree.SE_Final$tip.label=="GCA_000213635.1")

## [1] 250

Ancestors(x = vert.tree.SE_Final,node = 250)

##  [1] 1884 1874 1873 1856 1855 1845 1844 1843 1842 1841 1840 1839 1838 1837 1815
## [16] 1698 1697 1696 1695 1694 1688 1678 1677 1676 1668 1667 1666 1665 1664 1624
## [31] 1623 1619

Descendants(x = vert.tree.SE_Final,node = 1688)

## [[1]]
##   [1]  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80
##  [19]  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98
##  [37]  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
##  [55] 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
##  [73] 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
##  [91] 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
## [109] 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
## [127] 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
## [145] 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
## [163] 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
## [181] 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
## [199] 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
## [217] 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
## [235] 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
## [253] 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
## [271] 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350
## [289] 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368
## [307] 369 370 371 372 373 374 375 376 377 378

Tree_Subset<-unlist(Descendants(x = vert.tree.SE_Final,node = 1688))
Vec<-1:1618
Tree_Subset2<-Vec[-Tree_Subset]
vert.tree.cut<-ape:::drop.tip(phy = vert.tree.SE_Final,tip = Tree_Subset2)

SerovarTable_Cut<-SerovarTable_Final[which(SerovarTable_Final$Serovar=="Typhimurium/Monophasic"),]

p <- ggtree(vert.tree.cut,ladderize = TRUE) + 
  xlim(.0000001, 1)

p %<+% SerovarTable_Cut + 
  geom_tiplab(aes(fill = factor(Serovar),show.legend=FALSE),
              color = "black", # color for label font
              geom = "label",  # labels not text
              label.padding = unit(0.10, "lines"), # amount of padding around the labels
              label.size = 0.01)

#ggsave(filename = "Figure3A.pdf",device = "pdf",width = 20,height = 60,dpi = 600,limitsize = FALSE)

Figure 3 Generation - Kentucky Section

In this section we prepare the image which we will use to visualize the clustering of clades within our tree. In this section, we focus on the Kentucky clade. Because Kentucky shows a two-clade split structure, for visualization we chose to perform some additional trimming where we remove extra assemblies which are in between the two clades. This is purely for the sake of visualization and in the full tree, there are more assemblies in between the two original Kentucky clades.

#Locating Kentucky Clade
which(vert.tree.SE_Final$tip.label=="GCA_018340485.1")

## [1] 623

Ancestors(x = vert.tree.SE_Final,node = 623)

##  [1] 2241 2237 2236 2235 2234 2233 2232 2231 2229 2228 2227 2078 2077 1666 1665
## [16] 1664 1624 1623 1619

Descendants(x = vert.tree.SE_Final,node = 2231)

## [[1]]
##   [1] 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624
##  [19] 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642
##  [37] 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660
##  [55] 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678
##  [73] 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696
##  [91] 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714
## [109] 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732
## [127] 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750
## [145] 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768
## [163] 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786
## [181] 787 788 789 790 791 792 793 794 795

#Node 2231 appears to be a good starting point for locating the Kentucky clade.

#Next we trim out tree parts to cut down to visualizing Kentucky only.
Tree_Subset<-unlist(Descendants(x = vert.tree.SE_Final,node = 2231))
Vec<-1:1618
Tree_Subset2<-Vec[-Tree_Subset]
vert.tree.cut<-ape:::drop.tip(phy = vert.tree.SE_Final,tip = Tree_Subset2,trim.internal = FALSE)

#Now we look for assemblies in other parts of the tree to help with cutting. We use these assemblies to help cut in between the two separated Kentucky clades, for visual clarity. 
Ancestors(x = vert.tree.cut,node = which(vert.tree.cut$tip.label=="GCA_006165285.1"))

##  [1] 658 657 656 653 652 651 650 649 648 647 646 645 644 643 642 601 525 524 523
## [20] 522 521 507 506 441 440 439 431 430 429

Ancestors(x = vert.tree.cut,node = which(vert.tree.cut$tip.label=="GCA_002234755.2"))

##  [1] 703 696 642 601 525 524 523 522 521 507 506 441 440 439 431 430 429

Cut<-unlist(Descendants(x = vert.tree.cut,node = 642))
vert.tree.cut2<-ape:::drop.tip(phy = vert.tree.cut,tip = Cut)

#This section cuts the assemblies from the top of the tree which are above the upper Kentucky clade. 
Ancestors(x = vert.tree.cut2,node = which(vert.tree.cut$tip.label=="GCA_019264865.1"))

##  [1] 531 530 529 528 527 526 525 524 523 509 508 495 462 461 460 459 458 457 443
## [20] 442 377 376 375 367 366 365

Ancestors(x = vert.tree.cut2,node = which(vert.tree.cut$tip.label=="GCA_004194635.5"))

##  [1] 506 496 495 462 461 460 459 458 457 443 442 377 376 375 367 366 365

Cut2<-unlist(Descendants(x = vert.tree.cut2,node = 495))
vert.tree.cut3<-ape:::drop.tip(phy = vert.tree.cut2,tip = Cut2)


#Plotting Kentucky Serovar - Note: Final version is cropped externally to focus only on the Kentucky versions of the tree, cropping was done directly from this output figure. 
SerovarTable_Cut<-SerovarTable_Final[which(SerovarTable_Final$Serovar=="Kentucky"),]

p <- ggtree(vert.tree.cut3,ladderize = TRUE) + 
  xlim(.0000001, 1)

p %<+% SerovarTable_Cut + 
  geom_tiplab(aes(fill = factor(Serovar),show.legend=FALSE),
              color = "black", # color for label font
              geom = "label",  # labels not text
              label.padding = unit(0.10, "lines"), # amount of padding around the labels
              label.size = 0.01)

#ggsave(filename = "Figure3B.pdf",device = "pdf",width = 20,height = 60,dpi = 600,limitsize = FALSE)

Salmonella Data Analysis Section

First we load all of the required functions for performing serovar placement analysis. This section differs slightly from the original Seroplacer function beecause we are performing a step where we remove the test query. Each test query is selected from the tree, removed, and then placed back. These functions incorporate these extra steps, since the base Seroplacer algorithm does not involve removing any assemblies from the tree, and a placement would be made into the full tree.

Run<-FullTestQueries[1:1464]

#Actual core placement function which calculates mismatch optimization values.
Super_HAM_Placer2<-function(TestSequence,ReferenceSequence)
{
  Row1<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[2],exclude = c("n","N","?"),ignore.case = TRUE))
  Row2<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[3],exclude = c("n","N","?"),ignore.case = TRUE))
  Row3<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[4],exclude = c("n","N","?"),ignore.case = TRUE))
  Row4<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[5],exclude = c("n","N","?"),ignore.case = TRUE))
  Row5<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[6],exclude = c("n","N","?"),ignore.case = TRUE))
  Row6<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[7],exclude = c("n","N","?"),ignore.case = TRUE))
  Row7<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[8],exclude = c("n","N","?"),ignore.case = TRUE))
  Hamming_Table_Result<-rbind(Row1,Row2,Row3,Row4,Row5,Row6,Row7)
  misms <- apply(cmb, 1, function(ii, dd=Hamming_Table_Result) {
    sum(dd[1,ii[[1]]], dd[2,ii[[2]]], dd[3,ii[[3]]],
        dd[4,ii[[4]]], dd[5,ii[[5]]], dd[6,ii[[6]]],
        dd[7,ii[[7]]])
  })
  Best_Order<-as.integer(cmb[which(misms==min(misms))[1],])
  return(Best_Order)
}

#String matching function which supports Super_Ham_Placer2.
string.diff<-function(a,b,exclude=c("n","N","?"),ignore.case=TRUE)
{
  a<-toupper(a)
  b<-toupper(b)
  diff.a<-unlist(strsplit(a,split=""))
  diff.b<-unlist(strsplit(b,split=""))
  diff.d<-rbind(diff.a,diff.b)
  for(ex.loop in 1:length(exclude))
  {
    diff.d<-diff.d[,!(diff.d[1,]==exclude[ex.loop]|diff.d[2,]==exclude[ex.loop])]
  }
  differences<-sum(diff.d[1,]!=diff.d[2,])
  return(differences)
}

#Concatenation function which supports Super_HAM_Placer2.
Concatenate_By_Order<-function(Order,RefSequence)
{
  Sequence<-RefSequence[2:8]
  Concatenated_Output<-paste(Sequence[Order[1]],Sequence[Order[2]],Sequence[Order[3]],Sequence[Order[4]],Sequence[Order[5]],Sequence[Order[6]],Sequence[Order[7]],sep = "")
  return(Concatenated_Output)
}

#Required Package Loading
library(tidyverse)
library(dada2)
library(phylotools)

#Loop to execute placement function. 
#IMPORTANT NOTE: Set an appropriate working directory first because this loop will generate new directories to store placement data. These directories will be filled with the EPA-NG placement data and output files need to be organized correctly for downstream analysis.
for(x in Run)
{
  system(command = paste("mkdir ~/Placement_Testing_SE/",x,sep = ""))
  setwd(paste("~/Placement_Testing_SE/",x,sep = ""))
  Remove_Index<-as.numeric(which(Tree_Assemblies_Final==x))
  TestSequence<-Full_16s_Data_Aligned_Final[[which(Tree_Assemblies_Final==x)]][2:8]
  Full_16s_Data_Aligned2<-Full_16s_Data_Aligned_Final[-Remove_Index]
  Concat_Orders<-map(.x = 1:1617,.f = ~Super_HAM_Placer2(TestSequence = TestSequence,ReferenceSequence = Full_16s_Data_Aligned2[[.]]))
  Concatenated_References<-map(.x = 1:1617,.f = ~Concatenate_By_Order(Order = Concat_Orders[[.]][1:7],RefSequence = Full_16s_Data_Aligned2[[.]]))
  Prep_For_FASTA<-as.data.frame(unlist(Concatenated_References))
  GFFs_Names<-Tree_Assemblies_Final[-Remove_Index]
  Prep_For_FASTA2<-cbind(GFFs_Names,Prep_For_FASTA)
  colnames(Prep_For_FASTA2)<-c("seq.name","seq.text")
  dat2fasta(dat = Prep_For_FASTA2,outfile = paste("~/Placement_Testing_SE/",x,"/concatref.fasta",sep = ""))
  #References are ready from above
  #Now prepare query fasta
  QTestSequence<-Full_16s_Data_Aligned_Final[[which(Tree_Assemblies_Final==x)]][2:8]
  Query_Prep<-as.data.frame(paste(QTestSequence[1],QTestSequence[2],QTestSequence[3],QTestSequence[4],QTestSequence[5],QTestSequence[6],QTestSequence[7],sep = ""))
  Query_Prep2<-cbind(x,Query_Prep)
  colnames(Query_Prep2)<-c("seq.name","seq.text")
  dat2fasta(dat = Query_Prep2,outfile = paste("~/Placement_Testing_SE/",x,"/query.fasta",sep = ""))
  #Next prepare tree with test sequence plucked out
  vert.tree.trimmed<-ape:::drop.tip(phy = vert.tree.SE_Final,tip = x)
  write.tree(phy = vert.tree.trimmed,file = paste("~/Placement_Testing_SE/",x,"/RAXML.tree.trimmed",sep = ""))
  #add command to run epa-ng here
  system(command = paste("~/epa-ng-master/bin/epa-ng --ref-msa ~/Placement_Testing_SE/",x,"/concatref.fasta --tree ~/Placement_Testing_SE/",x,"/RAXML.tree.trimmed --query ~/Placement_Testing_SE/",x,"/query.fasta --model ~/Placement_Testing_SE/info.raxml.bestModel --filter-max 100",sep = ""))
  system(command = paste("rm concatref.fasta"))
  system(command = paste("rm RAXML.tree.trimmed"))
  #end loop
}

Function Loading Section, used for next analysis.

Placement_Node_Selector<-function(Distal_Node,Distal_Length,Tree,Root,Dist_Table)
{
  if(Distal_Node == Root)
  {
    return(Distal_Node)
  }
  else
  {
    Proximal_Node<-parent(.data = Tree,.node = Distal_Node)
    Total_Distance<-Dist_Table[Distal_Node,Proximal_Node]
    Proximal_Length<-Total_Distance-Distal_Length
    if(Distal_Length >= Proximal_Length)
    {
      return(Proximal_Node)
    }
    else
    {
      return(Distal_Node)
    }
  }
}

Depth_Calculator<-function(Serovar,Dists,Serovar_Names)
{
  Cut_Dists<-as.numeric(Dists[which(Serovar_Names %in% Serovar)])
  Depth<-max(Cut_Dists)
  Sum_Branches<-sum(Cut_Dists)
  Output<-c(Depth,Sum_Branches)
  return(Output)
}

Placement_Calculator<-function(SeroName,Placement_Results_DF)
{
  DF_Subset<-Placement_Results_DF[which(Placement_Results_DF$Placement_Actuals_Vec==SeroName),]
  Total_Sero<-length(rownames(DF_Subset))
  Number_Correct<-length(which(DF_Subset$Placement_Actuals_Vec == DF_Subset$Placement_Finals_Vec))
  Number_Ind<-length(which(DF_Subset$Placement_Finals_Vec=="Indeterminate"))
  Number_Wrong<-Total_Sero-Number_Correct-Number_Ind
  Percent_Correct<-Number_Correct/Total_Sero*100
  Percent_Ind<-Number_Ind/Total_Sero*100
  Percent_Wrong<-Number_Wrong/Total_Sero*100
  Results<-data.frame(SeroName,Total_Sero,Percent_Correct,Percent_Ind,Percent_Wrong)
  return(Results)
}

Clade_Hit_Finder<-function(Original,Pendant_Multi)
{
  x<-Original
  setwd(paste("~/Desktop/Database_Rebuild/Placement_Testing_SE/",x,sep=""))
  JPlace<-read.jplace("epa_result.jplace")
  vert.tree.trimmed<-ape:::drop.tip(phy = vert.tree.SE_Final,tip = x)
  vert.tree<-vert.tree.trimmed
  PlacedEdges<-JPlace@placements$node
  Node_Distances<-dist.nodes(x = vert.tree)
  PlacedEdges_List<-map(.x = 1:length(PlacedEdges),.f = ~Placement_Node_Selector(Distal_Node = PlacedEdges[.],Distal_Length = JPlace@placements$distal_length[.],Tree = vert.tree,Root = rootnode(vert.tree),Dist_Table = Node_Distances))
  PlacedEdges_Corrected<-unlist(PlacedEdges_List)
  All_Descendants<-map(.x = PlacedEdges_Corrected,.f = ~Descendants(x = vert.tree,node = .))
  All_Descendants_Vec<-unlist(All_Descendants)
  TipNumbersOI_Filtered<-unique(All_Descendants_Vec)
  TipNamesOI_Filtered<-vert.tree$tip.label[TipNumbersOI_Filtered]
  TipNumbersOI_OriginalTree<-which(vert.tree.SE_Final$tip.label %in% TipNamesOI_Filtered)
  Edge.Table<-cbind(vert.tree.SE_Final$edge,vert.tree.SE_Final$edge.length)
  Ancestor_List<-map(.x = TipNumbersOI_OriginalTree,.f = ~unlist(Ancestors(vert.tree.SE_Final,node = .)))
  Sum_List<-as.list(NULL)
  for(j in 1:length(Ancestor_List[[1]]))
  {
    Test_Vec<-map_int(.x = Ancestor_List,.f = ~match(Ancestor_List[[1]][j],.))
    Sum_List[[j]]<-Test_Vec
  }
  Result_Vec<-map_int(.x = Sum_List,.f = ~sum(.))
  Result_Vec_NoNA<-Result_Vec[!is.na(Result_Vec)]
  Target_Value<-min(Result_Vec_NoNA)
  Closest_Parent_Node<-Ancestor_List[[1]][which(Result_Vec==Target_Value)]
  Root_Node<-rootnode(vert.tree.SE_Final)
  if(Closest_Parent_Node == Root_Node)
  {
    MRCA<-Closest_Parent_Node
    return(MRCA)
  }
  if(Closest_Parent_Node != Root_Node)
  {
    Potential_Ancestors<-Ancestors(x = vert.tree.SE_Final,node = Closest_Parent_Node)
    Potential_Ancestors_Final<-c(Closest_Parent_Node,Potential_Ancestors)
    Edge_Scores<-Edge.Table[which(Edge.Table[,2] %in% Potential_Ancestors_Final),]
    if(class(Edge_Scores)[1]=="numeric")
    {
      Edge_Scores<-t(as.data.frame(Edge_Scores))
      Pendant<-max(JPlace@placements$pendant_length)
      Branch_Sums<-map(.x = 1:length(Edge_Scores[,3]),.f = ~sum(Edge_Scores[.:length(Edge_Scores[,3]),3]))
      Branch_Sums_Vec<-unlist(Branch_Sums)
      MRCA<-Edge_Scores[which(abs(Branch_Sums_Vec-Pendant*Pendant_Multi)==min(abs(Branch_Sums_Vec-Pendant*Pendant_Multi))),2]
      return(MRCA)
    }
    if(class(Edge_Scores)[1]!="numeric")
    {
      Pendant<-max(JPlace@placements$pendant_length)
      Branch_Sums<-map(.x = 1:length(Edge_Scores[,3]),.f = ~sum(Edge_Scores[.:length(Edge_Scores[,3]),3]))
      Branch_Sums_Vec<-unlist(Branch_Sums)
      MRCA<-Edge_Scores[which(abs(Branch_Sums_Vec-Pendant*Pendant_Multi)==min(abs(Branch_Sums_Vec-Pendant*Pendant_Multi))),2]
      return(MRCA)
    }
  }
}

Placement_Method_Evaluator<-function(Number)
{
  PlacementMappingData_MRCAs_Pendant_New<-map(.x = FullTestQueries,.f = ~Clade_Hit_Finder(Original = .x,Pendant_Multi = Number))
  Placement_Accuracy<-map(.x = 1:1464,.f = ~Clade_Accuracy(MRCA = PlacementMappingData_MRCAs_Pendant_New[[.]],Original = FullTestQueries[.]))
  Placement_Accuracy_DF<-as.data.frame(matrix(unlist(Placement_Accuracy), ncol = 2, byrow = TRUE))
  Placement_Summary<-mean(Placement_Accuracy_DF$V2)
  Placement_Checker<-map(.x = 1:1464,.f = ~Clade_Checker(MRCA = PlacementMappingData_MRCAs_Pendant_New[[.]],Original = FullTestQueries[.]))
  Placement_Checker_DF<-as.data.frame(matrix(unlist(Placement_Checker), ncol = 2, byrow = TRUE))
  Originals_Found<-length(which(Placement_Checker_DF$V1==6))/1464
  Result<-c(Number,Placement_Summary,Originals_Found)
  return(Result)
}

Clade_Accuracy<-function(MRCA,Original)
{
  Descendant_List<-Descendants(x = vert.tree.SE_Final,node = MRCA)
  Descendants_Vec<-unlist(Descendant_List)
  Descendant_Assemblies<-vert.tree.SE_Final$tip.label[Descendants_Vec]
  Original_Serovar<-SerovarTable_Final[which(SerovarTable_Final$Assembly %in% Original),2]
  Descendant_Serovars<-SerovarTable_Final[which(SerovarTable_Final$Assembly %in% Descendant_Assemblies),2]
  Descendant_Serovars_NoNA<-Descendant_Serovars[which(Descendant_Serovars!="")]
  Correct_Seros<-which(Descendant_Serovars_NoNA %in% Original_Serovar)
  Fraction_Total_Seros<-length(Correct_Seros)/length(Descendant_Serovars_NoNA)
  Fraction_Total_Seros[is.na(Fraction_Total_Seros)]<-0
  Node_Distances<-dist.nodes(vert.tree.SE_Final)
  Depth_Range<-Node_Distances[MRCA,Descendants_Vec]
  Depth<-max(Depth_Range)
  Final_Result<-c(-log10(Depth),Fraction_Total_Seros)
  return(Final_Result)
}

Clade_Checker<-function(MRCA,Original)
{
  Descendant_List<-Descendants(x = vert.tree.SE_Final,node = MRCA)
  Original_Node<-which(vert.tree.SE_Final$tip.label==Original)
  
  if(Original_Node %in% Descendant_List[[1]] == TRUE)
  {
    Output<-c(6,length(Descendant_List[[1]]))
    return(Output)
  }
  
  if(Original_Node %in% Descendant_List[[1]] == FALSE)
  {
    Node_Distances<-dist.nodes(x = vert.tree.SE_Final)
    Descendant_Distances<-Node_Distances[Original_Node,Descendant_List[[1]]]
    Output<-c(-log10(min(Descendant_Distances)),length(Descendant_List[[1]]))
    return(Output)
  }
}

Determine MRCAs - These are calculated from the placement JSON files generated previously.

Original_Node_Distances<-dist.nodes(vert.tree.SE_Final)
Placement_MRCAs<-as.list(NULL)

for(i in 1:1464)
{
  x<-FullTestQueries[i]
  setwd(paste("~/Desktop/Database_Rebuild/Placement_Testing_SE/",x,sep=""))
  JPlace<-read.jplace("epa_result.jplace")
  vert.tree.trimmed<-ape:::drop.tip(phy = vert.tree.SE_Final,tip = x)
  vert.tree<-vert.tree.trimmed
  PlacedEdges<-JPlace@placements$node
  Node_Distances<-dist.nodes(x = vert.tree)
  PlacedEdges_List<-map(.x = 1:length(PlacedEdges),.f = ~Placement_Node_Selector(Distal_Node = PlacedEdges[.],Distal_Length = JPlace@placements$distal_length[.],Tree = vert.tree,Root = rootnode(vert.tree),Dist_Table = Node_Distances))
  PlacedEdges_Corrected<-unlist(PlacedEdges_List)
  All_Descendants<-map(.x = PlacedEdges_Corrected,.f = ~Descendants(x = vert.tree,node = .))
  All_Descendants_Vec<-unlist(All_Descendants)
  TipNumbersOI_Filtered<-unique(All_Descendants_Vec)
  Edge.Table<-cbind(vert.tree$edge,vert.tree$edge.length)
  Ancestor_List<-map(.x = TipNumbersOI_Filtered,.f = ~unlist(Ancestors(vert.tree,node = .)))
  Sum_List<-as.list(NULL)
  for(j in 1:length(Ancestor_List[[1]]))
  {
    Test_Vec<-map_int(.x = Ancestor_List,.f = ~match(Ancestor_List[[1]][j],.))
    Sum_List[[j]]<-Test_Vec
  }
  Result_Vec<-map_int(.x = Sum_List,.f = ~sum(.))
  Result_Vec_NoNA<-Result_Vec[!is.na(Result_Vec)]
  Target_Value<-min(Result_Vec_NoNA)
  Closest_Parent_Node<-Ancestor_List[[1]][which(Result_Vec==Target_Value)]
  Root_Node<-rootnode(vert.tree)
  if(Closest_Parent_Node == Root_Node)
  {
    MRCA<-Closest_Parent_Node
    Placement_MRCAs[[i]]<-MRCA
  }
  if(Closest_Parent_Node != Root_Node)
  {
    Pendant_Multi<-1.5
    Potential_Ancestors<-Ancestors(x = vert.tree,node = Closest_Parent_Node)
    Potential_Ancestors_Final<-c(Closest_Parent_Node,Potential_Ancestors)
    Edge_Scores<-Edge.Table[which(Edge.Table[,2] %in% Potential_Ancestors_Final),]
    if(class(Edge_Scores)[1]=="numeric")
    {
      Edge_Scores<-t(as.data.frame(Edge_Scores))
      Pendant<-max(JPlace@placements$pendant_length)
      Branch_Sums<-map(.x = 1:length(Edge_Scores[,3]),.f = ~sum(Edge_Scores[.:length(Edge_Scores[,3]),3]))
      Branch_Sums_Vec<-unlist(Branch_Sums)
      MRCA<-Edge_Scores[which(abs(Branch_Sums_Vec-Pendant*Pendant_Multi)==min(abs(Branch_Sums_Vec-Pendant*Pendant_Multi))),2]
      Placement_MRCAs[[i]]<-MRCA
    }
    if(class(Edge_Scores)[1]!="numeric")
    {
      Pendant<-max(JPlace@placements$pendant_length)
      Branch_Sums<-map(.x = 1:length(Edge_Scores[,3]),.f = ~sum(Edge_Scores[.:length(Edge_Scores[,3]),3]))
      Branch_Sums_Vec<-unlist(Branch_Sums)
      MRCA<-Edge_Scores[which(abs(Branch_Sums_Vec-Pendant*Pendant_Multi)==min(abs(Branch_Sums_Vec-Pendant*Pendant_Multi))),2]
      Placement_MRCAs[[i]]<-MRCA
    }
  }
}

Calculation of our predictions and the actual results. Placement_Predictions contains the predictions made by the placement algorithm with relevant data, and Placement_Finals contains the final output serovar prediction. Placement_Actuals contains the correct serovar and is used to determine if a correct prediction was made.

load("/Users/dogrinev/Desktop/Fig4_Files.Rdata")

if(!identical(x = length(Placement_MRCAs),y = length(FullTestQueries))) stop("File error. Check input data.")

Original_Node_Distances<-dist.nodes(vert.tree.SE_Final)

Placement_Predictions<-as.list(NULL)
Placement_Finals<-as.list(NULL)

for(i in 1:1464)
{
  x<-FullTestQueries[i]
  vert.tree.trimmed<-ape:::drop.tip(phy = vert.tree.SE_Final,tip = x)
  MRCA<-Placement_MRCAs[[i]]
  Tree<-vert.tree.trimmed
  Descendant_List<-Descendants(x = Tree,node = MRCA)
  Descendants_Vec<-unlist(Descendant_List)
  Descendant_Assemblies<-Tree$tip.label[Descendants_Vec]
  GTD_Sero_Predict_Clean2<-SerovarTable_Final
  Descendant_Serovars<-GTD_Sero_Predict_Clean2[which(GTD_Sero_Predict_Clean2$Assembly %in% Descendant_Assemblies),2]
  Descendant_Serovars_NoNA<-Descendant_Serovars[which(Descendant_Serovars!="")]
  Serovar_Report<-names(table(Descendant_Serovars_NoNA))
  Sero_Percentages<-unlist(map(.x = Serovar_Report,.f = ~length(which(Descendant_Serovars %in% .))/length(Descendant_Serovars)))
  Sero_Numbers<-unlist(map(.x = Serovar_Report,.f = ~length(which(Descendant_Serovars %in% .))))
  Node_Distances<-dist.nodes(Tree)
  Distances<-Node_Distances[MRCA,Descendants_Vec]
  Depth_Results<-map(.x = Serovar_Report,.f = ~Depth_Calculator(Serovar = .,Dists = Distances,Serovar_Names = Descendant_Serovars))
  Depth_Results_Cols<-data.frame(t(matrix(unlist(Depth_Results),nrow=2)))
  Sero_Table<-as.data.frame(cbind(Serovar_Report,Sero_Percentages,Sero_Numbers,Depth_Results_Cols))
  colnames(Sero_Table)<-c("Serovar","Fraction","Matches in Final Clade","Maximum Depth","Sum of Branch Lengths")
  Sero_Table_Sorted <- Sero_Table[order(Sero_Table$Fraction,decreasing = TRUE),]
  Sero_Table_Sorted$Fraction<-as.numeric(Sero_Table_Sorted$Fraction)
  Sero_Table_Sorted$Fraction<-Sero_Table_Sorted$Fraction*100
  if(sum(Sero_Table_Sorted[,3]) > 500)
  {
    Placement_Predictions[[i]]<-Sero_Table_Sorted
    Placement_Finals[[i]]<-"Indeterminate"
  }
  if(sum(Sero_Table_Sorted[,3]) < 500)
  {
    Placement_Predictions[[i]]<-Sero_Table_Sorted
    if(Sero_Table_Sorted[1,2] >= 50)
    {
      Placement_Finals[[i]]<-Sero_Table_Sorted[1,1]
    }
    if(Sero_Table_Sorted[1,2] < 50)
    {
      Placement_Finals[[i]]<-"Indeterminate"
    }
  }
}

Placement_Actuals<-as.list(NULL)
for(i in 1:1464)
{
  Query<-FullTestQueries[i]
  QuerySero<-SerovarTable_Final[which(SerovarTable_Final$Assembly==Query),2]
  Placement_Actuals[[i]]<-QuerySero
}

Placement results evaluator section, the computational step here is quite slow so this code chunk will not be evaluated. If desired, it can be re-run using the following lines.

Pendant_Range<-c(0.001,0.025,0.05,0.01,0.25,0.5,0.75,1,1.25,1.3,1.35,1.45,1.5,1.55,1.6,1.65,1.7,1.75,2,3,4,6,8)
Evaluator_Results<-map(.x = Pendant_Range,.f = ~Placement_Method_Evaluator(.))

Figure 4 - Summary Statistics of Serovar Placement Algorithm Performance on Salmonella Test Data

The following lines generate the plo from the evaluator results to visualize overall placement accuracy.

Placement_Actuals_Vec<-unlist(Placement_Actuals)
Placement_Finals_Vec<-unlist(Placement_Finals)
Placement_Results_DF<-as.data.frame(cbind(Placement_Actuals_Vec,Placement_Finals_Vec))

if(!identical(x = as.numeric(length(Evaluator_Results)),y = 23)) stop("File error. Check input data.")

load("/Users/dogrinev/Desktop/Fig4_Files.Rdata")
Evaluator_Results_DF<-as.data.frame(matrix(unlist(Evaluator_Results), ncol = 3, byrow = TRUE))
colnames(Evaluator_Results_DF)<-c("Pendant_Multi","Serovar_Percentage","OQ_Percentage")

Evaluator_Results_DF_Trim<-Evaluator_Results_DF[-c(9,11,12,14,16,18),]

ggplot(Evaluator_Results_DF_Trim, aes(x=OQ_Percentage, y=Serovar_Percentage,label=Pendant_Multi)) +
  geom_text(vjust=(-1),size=3)+
  geom_point()+
  ggtitle("Summary Statistics of Salmonella Algorithm Performance")+
  xlab("Fraction of Correct Phylogenetic Origins")+
  ylab("Serovar Accuracy")

#ggsave(filename = "Figure4.png",device = "png",width = 10,height = 8,dpi = 600)

Figure 5 - Salmonella Serovar-Specific Classification Accuracy

Barplot of proportions representing Salmonella placement accuracy results for each specific serovar from the top 52 in our total dataset.

Data_Vec<-rep(1,1464)
Data_Vec[which(Placement_Finals_Vec=="Indeterminate")]<-"Indeterminate"
Data_Vec[which(Placement_Finals_Vec==Placement_Actuals_Vec)]<-"Correct"
Data_Vec[which(Data_Vec==1)]<-"Incorrect"
SE_Serovar_Results_Data_Table<-as.data.frame(cbind(FullTestQueries,Placement_Actuals_Vec,Data_Vec))
colnames(SE_Serovar_Results_Data_Table)<-c("Assemblies","Serovars","Results")

#High quality barplot script:
serosize<-names(sort(table(SerovarTable_Final$Serovar),decreasing = TRUE))
SE_Serovar_Results_Data_Table %>%
  mutate(Serovars=factor(Serovars, levels = rev(serosize)), 
         Results=factor(Results, levels=c("Indeterminate", "Incorrect", "Correct"))) %>% 
  count(Serovars, Results) %>%
  with_groups(c(Serovars), mutate, p=n/sum(n)) %>% 
  ggplot(aes(x=Serovars, y=p, fill=Results)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_y_continuous(breaks = seq(0,1,0.25), labels=c("0", "0.25", "0.5", "0.75", "1")) +
  scale_fill_manual(breaks=c("Correct", "Incorrect", "Indeterminate"), values = c("palegreen2", "indianred1", "gray75")) + 
  theme_bw() +
  theme(axis.text.y = element_text(size = 5.5),
        panel.grid = element_blank(), 
        legend.position = "top") +
  labs(y="Proportion")

#ggsave(filename = "Figure5.pdf",device = "pdf",width = 5,height = 6,dpi = 600)