Seroplacer Reproducible Analysis (E. coli)

Data and Package Loading Section

It is mandatory to load Seroplacer_Core_Files for the analysis section, all analysis can be re-run using these core data files. Some computational steps can take fairly long, so the analysis can be more quickly replicated by using pre-built output files found in Fig1_Files and Fig4_Files.

knitr::opts_chunk$set(echo = TRUE)
setwd("~/Desktop")
library(ggplot2)
library(ggtree)

## ggtree v3.4.4 For help: https://yulab-smu.top/treedata-book/
## 
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
## 
## G Yu. Data Integration, Manipulation and Visualization of Phylogenetic
## Trees (1st ed.). Chapman and Hall/CRC. 2022. ISBN: 9781032233574
## 
## Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods
## for mapping and visualizing associated data on phylogeny using ggtree.
## Molecular Biology and Evolution. 2018, 35(12):3041-3043.
## doi:10.1093/molbev/msy194

library(phangorn)

## Loading required package: ape

## 
## Attaching package: 'ape'

## The following object is masked from 'package:ggtree':
## 
##     rotate

library(ape)
library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ✔ purrr   0.3.4      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks ggtree::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(treeio)

## treeio v1.20.2 For help: https://yulab-smu.top/treedata-book/
## 
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## LG Wang, TTY Lam, S Xu, Z Dai, L Zhou, T Feng, P Guo, CW Dunn, BR
## Jones, T Bradley, H Zhu, Y Guan, Y Jiang, G Yu. treeio: an R package
## for phylogenetic tree input and output with richly annotated and
## associated data. Molecular Biology and Evolution. 2020, 37(2):599-603.
## doi: 10.1093/molbev/msz240
## 
## S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L
## Zhan, T Wu, E Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact
## visualization of richly annotated phylogenetic data. Molecular Biology
## and Evolution. 2021, 38(9):4039-4042. doi: 10.1093/molbev/msab166
## 
## Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods
## for mapping and visualizing associated data on phylogeny using ggtree.
## Molecular Biology and Evolution. 2018, 35(12):3041-3043.
## doi:10.1093/molbev/msy194
## 
## Attaching package: 'treeio'
## 
## The following object is masked from 'package:ape':
## 
##     drop.tip

#Set correct working directory to pre-load the required files for analysis. These files are generated in the previous section describing initial database construction.
load("/Users/dogrinev/Desktop/Seroplacer_Core_Files_EC.Rdata")

E. coli Data Analysis Section

First we load all of the required functions for performing serovar placement analysis. This section differs slightly from the original Seroplacer function beecause we are performing a step where we remove the test query. Each test query is selected from the tree, removed, and then placed back. These functions incorporate these extra steps, since the base Seroplacer algorithm does not involve removing any assemblies from the tree, and a placement would be made into the full tree.

Run<-FullTestQueries[1:1464]

#Actual core placement function which calculates mismatch optimization values.
Super_HAM_Placer2<-function(TestSequence,ReferenceSequence)
{
  Row1<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[2],exclude = c("n","N","?"),ignore.case = TRUE))
  Row2<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[3],exclude = c("n","N","?"),ignore.case = TRUE))
  Row3<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[4],exclude = c("n","N","?"),ignore.case = TRUE))
  Row4<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[5],exclude = c("n","N","?"),ignore.case = TRUE))
  Row5<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[6],exclude = c("n","N","?"),ignore.case = TRUE))
  Row6<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[7],exclude = c("n","N","?"),ignore.case = TRUE))
  Row7<-map_int(.x = 1:7,.f = ~string.diff(a = TestSequence[.],b = ReferenceSequence[8],exclude = c("n","N","?"),ignore.case = TRUE))
  Hamming_Table_Result<-rbind(Row1,Row2,Row3,Row4,Row5,Row6,Row7)
  misms <- apply(cmb, 1, function(ii, dd=Hamming_Table_Result) {
    sum(dd[1,ii[[1]]], dd[2,ii[[2]]], dd[3,ii[[3]]],
        dd[4,ii[[4]]], dd[5,ii[[5]]], dd[6,ii[[6]]],
        dd[7,ii[[7]]])
  })
  Best_Order<-as.integer(cmb[which(misms==min(misms))[1],])
  return(Best_Order)
}

#String matching function which supports Super_Ham_Placer2.
string.diff<-function(a,b,exclude=c("n","N","?"),ignore.case=TRUE)
{
  a<-toupper(a)
  b<-toupper(b)
  diff.a<-unlist(strsplit(a,split=""))
  diff.b<-unlist(strsplit(b,split=""))
  diff.d<-rbind(diff.a,diff.b)
  for(ex.loop in 1:length(exclude))
  {
    diff.d<-diff.d[,!(diff.d[1,]==exclude[ex.loop]|diff.d[2,]==exclude[ex.loop])]
  }
  differences<-sum(diff.d[1,]!=diff.d[2,])
  return(differences)
}

#Concatenation function which supports Super_HAM_Placer2.
Concatenate_By_Order<-function(Order,RefSequence)
{
  Sequence<-RefSequence[2:8]
  Concatenated_Output<-paste(Sequence[Order[1]],Sequence[Order[2]],Sequence[Order[3]],Sequence[Order[4]],Sequence[Order[5]],Sequence[Order[6]],Sequence[Order[7]],sep = "")
  return(Concatenated_Output)
}

#Required Package Loading
library(tidyverse)
library(dada2)
library(phylotools)

#Loop to execute placement function. 
#IMPORTANT NOTE: Set an appropriate working directory first because this loop will generate new directories to store placement data. These directories will be filled with the EPA-NG placement data and output files need to be organized correctly for downstream analysis.
for(x in Run)
{
  system(command = paste("mkdir ~/Placement_Testing_EC/",x,sep = ""))
  setwd(paste("~/Placement_Testing_EC/",x,sep = ""))
  Remove_Index<-as.numeric(which(Tree_Assemblies_Final==x))
  TestSequence<-Full_16s_Data_Aligned_Final[[which(Tree_Assemblies_Final==x)]][2:8]
  Full_16s_Data_Aligned2<-Full_16s_Data_Aligned_Final[-Remove_Index]
  Concat_Orders<-map(.x = 1:2790,.f = ~Super_HAM_Placer2(TestSequence = TestSequence,ReferenceSequence = Full_16s_Data_Aligned2[[.]]))
  Concatenated_References<-map(.x = 1:2790,.f = ~Concatenate_By_Order(Order = Concat_Orders[[.]][1:7],RefSequence = Full_16s_Data_Aligned2[[.]]))
  Prep_For_FASTA<-as.data.frame(unlist(Concatenated_References))
  GFFs_Names<-Tree_Assemblies_Final[-Remove_Index]
  Prep_For_FASTA2<-cbind(GFFs_Names,Prep_For_FASTA)
  colnames(Prep_For_FASTA2)<-c("seq.name","seq.text")
  dat2fasta(dat = Prep_For_FASTA2,outfile = paste("~/Placement_Testing_EC/",x,"/concatref.fasta",sep = ""))
  #References are ready from above
  #Now prepare query fasta
  QTestSequence<-Full_16s_Data_Aligned_Final[[which(Tree_Assemblies_Final==x)]][2:8]
  Query_Prep<-as.data.frame(paste(QTestSequence[1],QTestSequence[2],QTestSequence[3],QTestSequence[4],QTestSequence[5],QTestSequence[6],QTestSequence[7],sep = ""))
  Query_Prep2<-cbind(x,Query_Prep)
  colnames(Query_Prep2)<-c("seq.name","seq.text")
  dat2fasta(dat = Query_Prep2,outfile = paste("~/Placement_Testing_EC/",x,"/query.fasta",sep = ""))
  #Next prepare tree with test sequence plucked out
  vert.tree.trimmed<-ape:::drop.tip(phy = vert.tree.EC_Final,tip = x)
  write.tree(phy = vert.tree.trimmed,file = paste("~/Placement_Testing_EC/",x,"/RAXML.tree.trimmed",sep = ""))
  #add command to run epa-ng here
  system(command = paste("~/epa-ng-master/bin/epa-ng --ref-msa ~/Placement_Testing_EC/",x,"/concatref.fasta --tree ~/Placement_Testing_EC/",x,"/RAXML.tree.trimmed --query ~/Placement_Testing_EC/",x,"/query.fasta --model ~/Placement_Testing_EC/bestModelEC --filter-max 100",sep = ""))
  system(command = paste("rm concatref.fasta"))
  system(command = paste("rm RAXML.tree.trimmed"))
  #end loop
}

Function Loading Section, used for next analysis.

Placement_Node_Selector<-function(Distal_Node,Distal_Length,Tree,Root,Dist_Table)
{
  if(Distal_Node == Root)
  {
    return(Distal_Node)
  }
  else
  {
    Proximal_Node<-parent(.data = Tree,.node = Distal_Node)
    Total_Distance<-Dist_Table[Distal_Node,Proximal_Node]
    Proximal_Length<-Total_Distance-Distal_Length
    if(Distal_Length >= Proximal_Length)
    {
      return(Proximal_Node)
    }
    else
    {
      return(Distal_Node)
    }
  }
}

Depth_Calculator<-function(Serovar,Dists,Serovar_Names)
{
  Cut_Dists<-as.numeric(Dists[which(Serovar_Names %in% Serovar)])
  Depth<-max(Cut_Dists)
  Sum_Branches<-sum(Cut_Dists)
  Output<-c(Depth,Sum_Branches)
  return(Output)
}

Placement_Calculator<-function(SeroName,Placement_Results_DF)
{
  DF_Subset<-Placement_Results_DF[which(Placement_Results_DF$Placement_Actuals_Vec==SeroName),]
  Total_Sero<-length(rownames(DF_Subset))
  Number_Correct<-length(which(DF_Subset$Placement_Actuals_Vec == DF_Subset$Placement_Finals_Vec))
  Number_Ind<-length(which(DF_Subset$Placement_Finals_Vec=="Indeterminate"))
  Number_Wrong<-Total_Sero-Number_Correct-Number_Ind
  Percent_Correct<-Number_Correct/Total_Sero*100
  Percent_Ind<-Number_Ind/Total_Sero*100
  Percent_Wrong<-Number_Wrong/Total_Sero*100
  Results<-data.frame(SeroName,Total_Sero,Percent_Correct,Percent_Ind,Percent_Wrong)
  return(Results)
}

Clade_Hit_Finder<-function(Original,Pendant_Multi)
{
  x<-Original
  setwd(paste("~/Desktop/Database_Rebuild/Placement_Testing_EC/",x,sep=""))
  JPlace<-read.jplace("epa_result.jplace")
  vert.tree.trimmed<-ape:::drop.tip(phy = vert.tree.EC_Final,tip = x)
  vert.tree<-vert.tree.trimmed
  PlacedEdges<-JPlace@placements$node
  Node_Distances<-dist.nodes(x = vert.tree)
  PlacedEdges_List<-map(.x = 1:length(PlacedEdges),.f = ~Placement_Node_Selector(Distal_Node = PlacedEdges[.],Distal_Length = JPlace@placements$distal_length[.],Tree = vert.tree,Root = rootnode(vert.tree),Dist_Table = Node_Distances))
  PlacedEdges_Corrected<-unlist(PlacedEdges_List)
  All_Descendants<-map(.x = PlacedEdges_Corrected,.f = ~Descendants(x = vert.tree,node = .))
  All_Descendants_Vec<-unlist(All_Descendants)
  TipNumbersOI_Filtered<-unique(All_Descendants_Vec)
  TipNamesOI_Filtered<-vert.tree$tip.label[TipNumbersOI_Filtered]
  TipNumbersOI_OriginalTree<-which(vert.tree.EC_Final$tip.label %in% TipNamesOI_Filtered)
  Edge.Table<-cbind(vert.tree.EC_Final$edge,vert.tree.EC_Final$edge.length)
  Ancestor_List<-map(.x = TipNumbersOI_OriginalTree,.f = ~unlist(Ancestors(vert.tree.EC_Final,node = .)))
  Sum_List<-as.list(NULL)
  for(j in 1:length(Ancestor_List[[1]]))
  {
    Test_Vec<-map_int(.x = Ancestor_List,.f = ~match(Ancestor_List[[1]][j],.))
    Sum_List[[j]]<-Test_Vec
  }
  Result_Vec<-map_int(.x = Sum_List,.f = ~sum(.))
  Result_Vec_NoNA<-Result_Vec[!is.na(Result_Vec)]
  Target_Value<-min(Result_Vec_NoNA)
  Closest_Parent_Node<-Ancestor_List[[1]][which(Result_Vec==Target_Value)]
  Root_Node<-rootnode(vert.tree.EC_Final)
  if(Closest_Parent_Node == Root_Node)
  {
    MRCA<-Closest_Parent_Node
    return(MRCA)
  }
  if(Closest_Parent_Node != Root_Node)
  {
    Potential_Ancestors<-Ancestors(x = vert.tree.EC_Final,node = Closest_Parent_Node)
    Potential_Ancestors_Final<-c(Closest_Parent_Node,Potential_Ancestors)
    Edge_Scores<-Edge.Table[which(Edge.Table[,2] %in% Potential_Ancestors_Final),]
    if(class(Edge_Scores)[1]=="numeric")
    {
      Edge_Scores<-t(as.data.frame(Edge_Scores))
      Pendant<-max(JPlace@placements$pendant_length)
      Branch_Sums<-map(.x = 1:length(Edge_Scores[,3]),.f = ~sum(Edge_Scores[.:length(Edge_Scores[,3]),3]))
      Branch_Sums_Vec<-unlist(Branch_Sums)
      MRCA<-Edge_Scores[which(abs(Branch_Sums_Vec-Pendant*Pendant_Multi)==min(abs(Branch_Sums_Vec-Pendant*Pendant_Multi))),2]
      return(MRCA)
    }
    if(class(Edge_Scores)[1]!="numeric")
    {
      Pendant<-max(JPlace@placements$pendant_length)
      Branch_Sums<-map(.x = 1:length(Edge_Scores[,3]),.f = ~sum(Edge_Scores[.:length(Edge_Scores[,3]),3]))
      Branch_Sums_Vec<-unlist(Branch_Sums)
      MRCA<-Edge_Scores[which(abs(Branch_Sums_Vec-Pendant*Pendant_Multi)==min(abs(Branch_Sums_Vec-Pendant*Pendant_Multi))),2]
      return(MRCA)
    }
  }
}

Placement_Method_Evaluator<-function(Number)
{
  PlacementMappingData_MRCAs_Pendant_New<-map(.x = FullTestQueries,.f = ~Clade_Hit_Finder(Original = .x,Pendant_Multi = Number))
  Placement_Accuracy<-map(.x = 1:1819,.f = ~Clade_Accuracy(MRCA = PlacementMappingData_MRCAs_Pendant_New[[.]],Original = FullTestQueries[.]))
  Placement_Accuracy_DF<-as.data.frame(matrix(unlist(Placement_Accuracy), ncol = 2, byrow = TRUE))
  Placement_Summary<-mean(Placement_Accuracy_DF$V2)
  Placement_Checker<-map(.x = 1:1819,.f = ~Clade_Checker(MRCA = PlacementMappingData_MRCAs_Pendant_New[[.]],Original = FullTestQueries[.]))
  Placement_Checker_DF<-as.data.frame(matrix(unlist(Placement_Checker), ncol = 2, byrow = TRUE))
  Originals_Found<-length(which(Placement_Checker_DF$V1==6))/1819
  Result<-c(Number,Placement_Summary,Originals_Found)
  return(Result)
}

Clade_Accuracy<-function(MRCA,Original)
{
  Descendant_List<-Descendants(x = vert.tree.EC_Final,node = MRCA)
  Descendants_Vec<-unlist(Descendant_List)
  Descendant_Assemblies<-vert.tree.EC_Final$tip.label[Descendants_Vec]
  Original_Serovar<-EC_Sero_Table_Final[which(EC_Sero_Table_Final$Assembly %in% Original),2]
  Descendant_Serovars<-EC_Sero_Table_Final[which(EC_Sero_Table_Final$Assembly %in% Descendant_Assemblies),2]
  Descendant_Serovars_NoNA<-Descendant_Serovars[which(Descendant_Serovars!="")]
  Correct_Seros<-which(Descendant_Serovars_NoNA %in% Original_Serovar)
  Fraction_Total_Seros<-length(Correct_Seros)/length(Descendant_Serovars_NoNA)
  Fraction_Total_Seros[is.na(Fraction_Total_Seros)]<-0
  Node_Distances<-dist.nodes(vert.tree.EC_Final)
  Depth_Range<-Node_Distances[MRCA,Descendants_Vec]
  Depth<-max(Depth_Range)
  Final_Result<-c(-log10(Depth),Fraction_Total_Seros)
  return(Final_Result)
}

Clade_Checker<-function(MRCA,Original)
{
  Descendant_List<-Descendants(x = vert.tree.EC_Final,node = MRCA)
  Original_Node<-which(vert.tree.EC_Final$tip.label==Original)
  
  if(Original_Node %in% Descendant_List[[1]] == TRUE)
  {
    Output<-c(6,length(Descendant_List[[1]]))
    return(Output)
  }
  
  if(Original_Node %in% Descendant_List[[1]] == FALSE)
  {
    Node_Distances<-dist.nodes(x = vert.tree.EC_Final)
    Descendant_Distances<-Node_Distances[Original_Node,Descendant_List[[1]]]
    Output<-c(-log10(min(Descendant_Distances)),length(Descendant_List[[1]]))
    return(Output)
  }
}

Determine MRCAs - These are calculated from the placement JSON files generated previously.

Original_Node_Distances<-dist.nodes(vert.tree.EC_Final)
Placement_MRCAs<-as.list(NULL)

for(i in 1:1988)
{
x<-FullTestQueries[i]
setwd(paste("~/Desktop/Down//Placement_Testing_EC/",x,sep=""))
JPlace<-read.jplace("epa_result.jplace")
vert.tree.trimmed<-ape:::drop.tip(phy = vert.tree.EC_Final,tip = x)
vert.tree<-vert.tree.trimmed
PlacedEdges<-JPlace@placements$node
Node_Distances<-dist.nodes(x = vert.tree)
PlacedEdges_List<-map(.x = 1:length(PlacedEdges),.f = ~Placement_Node_Selector(Distal_Node = PlacedEdges[.],Distal_Length = JPlace@placements$distal_length[.],Tree = vert.tree,Root = rootnode(vert.tree),Dist_Table = Node_Distances))
PlacedEdges_Corrected<-unlist(PlacedEdges_List)
All_Descendants<-map(.x = PlacedEdges_Corrected,.f = ~Descendants(x = vert.tree,node = .))
All_Descendants_Vec<-unlist(All_Descendants)
TipNumbersOI_Filtered<-unique(All_Descendants_Vec)
Edge.Table<-cbind(vert.tree$edge,vert.tree$edge.length)
Ancestor_List<-map(.x = TipNumbersOI_Filtered,.f = ~unlist(Ancestors(vert.tree,node = .)))
Sum_List<-as.list(NULL)
for(j in 1:length(Ancestor_List[[1]]))
{
  Test_Vec<-map_int(.x = Ancestor_List,.f = ~match(Ancestor_List[[1]][j],.))
  Sum_List[[j]]<-Test_Vec
}
Result_Vec<-map_int(.x = Sum_List,.f = ~sum(.))
Result_Vec_NoNA<-Result_Vec[!is.na(Result_Vec)]
Target_Value<-min(Result_Vec_NoNA)
Closest_Parent_Node<-Ancestor_List[[1]][which(Result_Vec==Target_Value)]
Root_Node<-rootnode(vert.tree)
if(Closest_Parent_Node == Root_Node)
{
  MRCA<-Closest_Parent_Node
  Placement_MRCAs[[i]]<-MRCA
}
if(Closest_Parent_Node != Root_Node)
{
  Pendant_Multi<-1.5
  Potential_Ancestors<-Ancestors(x = vert.tree,node = Closest_Parent_Node)
  Potential_Ancestors_Final<-c(Closest_Parent_Node,Potential_Ancestors)
  Edge_Scores<-Edge.Table[which(Edge.Table[,2] %in% Potential_Ancestors_Final),]
  if(class(Edge_Scores)[1]=="numeric")
  {
    Edge_Scores<-t(as.data.frame(Edge_Scores))
    Pendant<-max(JPlace@placements$pendant_length)
    Branch_Sums<-map(.x = 1:length(Edge_Scores[,3]),.f = ~sum(Edge_Scores[.:length(Edge_Scores[,3]),3]))
    Branch_Sums_Vec<-unlist(Branch_Sums)
    MRCA<-Edge_Scores[which(abs(Branch_Sums_Vec-Pendant*Pendant_Multi)==min(abs(Branch_Sums_Vec-Pendant*Pendant_Multi))),2]
    Placement_MRCAs[[i]]<-MRCA
  }
  if(class(Edge_Scores)[1]!="numeric")
  {
    Pendant<-max(JPlace@placements$pendant_length)
    Branch_Sums<-map(.x = 1:length(Edge_Scores[,3]),.f = ~sum(Edge_Scores[.:length(Edge_Scores[,3]),3]))
    Branch_Sums_Vec<-unlist(Branch_Sums)
    MRCA<-Edge_Scores[which(abs(Branch_Sums_Vec-Pendant*Pendant_Multi)==min(abs(Branch_Sums_Vec-Pendant*Pendant_Multi))),2]
    Placement_MRCAs[[i]]<-MRCA
  }
}
}

Calculation of our predictions and the actual results. Placement_Predictions contains the predictions made by the placement algorithm with relevant data, and Placement_Finals contains the final output serovar prediction. Placement_Actuals contains the correct serovar and is used to determine if a correct prediction was made.

if(!identical(x = length(Placement_MRCAs),y = length(FullTestQueries))) stop("File error. Check input data.")

Original_Node_Distances<-dist.nodes(vert.tree.EC_Final)

Placement_Predictions<-as.list(NULL)
Placement_Finals<-as.list(NULL)

for(i in 1:1988)
{
  x<-FullTestQueries[i]
  vert.tree.trimmed<-ape:::drop.tip(phy = vert.tree.EC_Final,tip = x)
  MRCA<-Placement_MRCAs[[i]]
  Tree<-vert.tree.trimmed
  Descendant_List<-Descendants(x = Tree,node = MRCA)
  Descendants_Vec<-unlist(Descendant_List)
  Descendant_Assemblies<-Tree$tip.label[Descendants_Vec]
  GTD_Sero_Predict_Clean2<-EC_Sero_Table_Final
  Descendant_Serovars<-GTD_Sero_Predict_Clean2[which(GTD_Sero_Predict_Clean2$Assembly %in% Descendant_Assemblies),2]
  Descendant_Serovars_NoNA<-Descendant_Serovars[which(Descendant_Serovars!="")]
  Serovar_Report<-names(table(Descendant_Serovars_NoNA))
  Sero_Percentages<-unlist(map(.x = Serovar_Report,.f = ~length(which(Descendant_Serovars %in% .))/length(Descendant_Serovars)))
  Sero_Numbers<-unlist(map(.x = Serovar_Report,.f = ~length(which(Descendant_Serovars %in% .))))
  Node_Distances<-dist.nodes(Tree)
  Distances<-Node_Distances[MRCA,Descendants_Vec]
  Depth_Results<-map(.x = Serovar_Report,.f = ~Depth_Calculator(Serovar = .,Dists = Distances,Serovar_Names = Descendant_Serovars))
  Depth_Results_Cols<-data.frame(t(matrix(unlist(Depth_Results),nrow=2)))
  Sero_Table<-as.data.frame(cbind(Serovar_Report,Sero_Percentages,Sero_Numbers,Depth_Results_Cols))
  colnames(Sero_Table)<-c("Serovar","Fraction","Matches in Final Clade","Maximum Depth","Sum of Branch Lengths")
  Sero_Table_Sorted <- Sero_Table[order(Sero_Table$Fraction,decreasing = TRUE),]
  Sero_Table_Sorted$Fraction<-as.numeric(Sero_Table_Sorted$Fraction)
  Sero_Table_Sorted$Fraction<-Sero_Table_Sorted$Fraction*100
  if(sum(Sero_Table_Sorted[,3]) > 500)
  {
    Placement_Predictions[[i]]<-Sero_Table_Sorted
    Placement_Finals[[i]]<-"Indeterminate"
  }
  if(sum(Sero_Table_Sorted[,3]) < 500)
  {
    Placement_Predictions[[i]]<-Sero_Table_Sorted
    if(Sero_Table_Sorted[1,2] >= 50)
    {
      Placement_Finals[[i]]<-Sero_Table_Sorted[1,1]
    }
    if(Sero_Table_Sorted[1,2] < 50)
    {
      Placement_Finals[[i]]<-"Indeterminate"
    }
  }
}

Placement_Actuals<-as.list(NULL)
for(i in 1:1988)
{
  Query<-FullTestQueries[i]
  QuerySero<-EC_Sero_Table_Final[which(EC_Sero_Table_Final$Assembly==Query),2]
  Placement_Actuals[[i]]<-QuerySero
}

Figure 6 - E. coli Serovar-Specific Classification Accuracy

Barplot of proportions representing E. coli placement accuracy results for each specific serovar from the top serovars in our total dataset.

Placement_Actuals_Vec<-unlist(Placement_Actuals)
Placement_Finals_Vec<-unlist(Placement_Finals)
Placement_Results_DF<-as.data.frame(cbind(Placement_Actuals_Vec,Placement_Finals_Vec))

Data_Vec<-rep(1,1988)
Data_Vec[which(Placement_Finals_Vec=="Indeterminate")]<-"Indeterminate"
Data_Vec[which(Placement_Finals_Vec==Placement_Actuals_Vec)]<-"Correct"
Data_Vec[which(Data_Vec==1)]<-"Incorrect"
EC_Serovar_Results_Data_Table<-as.data.frame(cbind(FullTestQueries,Placement_Actuals_Vec,Data_Vec))
colnames(EC_Serovar_Results_Data_Table)<-c("Assemblies","Serovars","Results")
Multi_O<-map_chr(.x = 1:1988,.f = ~str_detect(string = EC_Serovar_Results_Data_Table[.,2],pattern = "/"))
EC_Serovar_Results_Data_Table_Cut<-EC_Serovar_Results_Data_Table[which(Multi_O==FALSE),]

#Nice barplot:
serosize<-names(sort(table(EC_Sero_Table_Final$Serovar),decreasing = TRUE))
EC_Serovar_Results_Data_Table_Cut %>%
  mutate(Serovars=factor(Serovars, levels = rev(serosize)), # make serovars a factor for ordering on plot
         Results=factor(Results, levels=c("Indeterminate", "Incorrect", "Correct"))) %>% # convert to factor for ordering
  count(Serovars, Results) %>%
  with_groups(c(Serovars), mutate, p=n/sum(n)) %>% # calculate proportions
  ggplot(aes(x=Serovars, y=p, fill=Results)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_y_continuous(breaks = seq(0,1,0.25), labels=c("0", "0.25", "0.5", "0.75", "1")) +
  scale_fill_manual(breaks=c("Correct", "Incorrect", "Indeterminate"), values = c("palegreen2", "indianred1", "gray75")) + #reorder colors
  theme_bw() +
  theme(axis.text.y = element_text(size = 5.5),
        panel.grid = element_blank(), # remove grid lines behind bars
        legend.position = "top") +
  labs(y="Proportion")

#ggsave(filename = "Figure6.pdf",device = "pdf",width = 5,height = 6,dpi = 600)

Seroplacer Reproducible Analysis (E. coli)

2023-05-20

E. coli Data Analysis Section

Figure 6 - E. coli Serovar-Specific Classification Accuracy