forked from hackseq/hs19-flu
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlarger_scale_features.R
138 lines (124 loc) · 6.2 KB
/
larger_scale_features.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
library(ape)
library(phangorn)
library(seqinr)
library(dplyr)
library(tibbletime)
library(treeio)
#' @param tree a phylogenetic tree
#' @param maxdate - the date of the most recent tip in the tree, in numerical format (or a format that can be added to a number)
#' @return a 2-column matrix, time and number of splitting events before that time. times are splitting times of the tree
divtt <- function(tree, maxdate=NULL) {
x = ltt.plot.coords(tree,backward = F)
# i want to take all the negative increments out and just have the births
# remove rows of x where N decreases. then the num divs is just 1: (num events)
toRemove = 1+which(diff(x[,2])<0)
x=x[-toRemove,]
x[,2]=1:nrow(x)
if (!is.null(maxdate)) {x[,1]= x[,1]+ maxdate - max(node.depth.edgelength(tree))}
return(data.frame(time=x[,1], Nsplits=x[,2]))
}
#' @param tree a phylogenetic tree
#' @param numbins number of bins you want to group the times into
#' @return list with group (the bin number of the tips) and times of bin breaks, in tree time (root at 0)
timechop <- function(tree, numbins) {
tipheights = node.depth.edgelength(tree)[1:length(tree$tip.label)]
bk = quantile(tipheights, seq(from = 0, to=1,length.out = numbins+1))
group = cut(tipheights, bk, include.lowest = T,labels = 1:numbins)
return(list(group=group, times=bk))
}
#' @param tree a phylogenetic tree
#' @param tipbins output of hte function timechop
#' @param aln - output of read.fasta; the sequences for the tips
#' @param gdtree - tree, in units of genetic distance (not timed), though this is not essential
#' @return genetic distance profile over time (a data frame)
gendistprofiles <- function(tree, tipbins, aln=NULL,scale=TRUE,gdtree=NULL, mode="alignment") {
if (mode == "alignment") sf=ifelse(scale==TRUE, length(aln[[1]]), 1)
gps=unique(tipbins$group)
num.bins=length(gps)
cum.diversity=0*(1:num.bins)
now.diversity=0*(1:num.bins)
num.tips=0*(1:num.bins)
isDiverseAnyTime = rep(F, length(aln[[1]]))
for (k in 1:length(gps)) {
thesetips = tree$tip.label[which(tipbins$group == k)]
if (mode == "alignment") {
myind = vapply(thesetips, function(x) grep(x, names(aln)), FUN.VALUE = 1)
mycharmat=matrix("hi",nrow = length(myind), ncol = length(aln[[myind[1]]]))
for (n in 1:length(myind)) mycharmat[n,]=aln[[myind[n]]] # store sequence as character matrix
nposs=vapply(1:ncol(mycharmat), function(x) {
myvals=unique(mycharmat[,x])
return(length(myvals[which(myvals != "-")]))
}, FUN.VALUE = 1)
isDiverse = vapply(1:ncol(mycharmat), function(x) (nposs[x]>1) , FUN.VALUE = T)
isDiverseAnyTime= (isDiverse | isDiverseAnyTime)
cum.diversity[k] = sum(isDiverseAnyTime)/sf
now.diversity[k]=sum(isDiverse)/sf
num.tips[k]=length(myind)
}
if (mode == "tree") {
# sum of path lengths in subtree of just these tips
thistree = drop.tip(tree, which( ! (tree$tip.label %in% thesetips)))
now.diversity[k] = sum(thistree$edge.length)
allthesetips = tree$tip.label[which(as.numeric(tipbins$group) <= k)]
cum.diversity[k]=sum( drop.tip(tree,which( !(tree$tip.label %in% allthesetips)))$edge.length)
num.tips[k]=length(thesetips)
}
}
return(data.frame(cum.diversity=cum.diversity, now.diversity=now.diversity,
time=tipbins$times[2:length(tipbins$times)]))
}
#' @param timetree a phylogenetic tree (in units of time; the clade of focus)
#' @param gdtree a phylogenetic tree in units of genetic distance, which contains all tips of the timetree
#' @return lineages through genetic distance space, using the part of gdtree containing the tips in timetree
lttgendist <- function(timetree, gdtree) {
mytree = drop.tip(gdtree, which( !(gdtree$tip.label %in% timetree$tip.label))) # tree of only my tips
return(ltt.plot.coords(mytree,backward = F))
}
#' @param tree a phylogenetic tree
#' @param gdtree a phylogenetic tree in units of genetic distance, which contains all tips of the timetree
#' @return a 2-column matrix, genetic dist from root and number of splitting events before that distance.
divttgendist <- function(timetree, gdtree,maxdate=NULL) {
mytree = drop.tip(gdtree, which( !(gdtree$tip.label %in% timetree$tip.label))) # tree of only my tips
x = ltt.plot.coords(mytree,backward = F)
# take all the negative increments out and just have the births
toRemove = 1+which(diff(x[,2])<0)
x=x[-toRemove,]
x[,2]=1:nrow(x)
if (!is.null(maxdate)) {x[,1]= x[,1]+ maxdate - max(node.depth.edgelength(tree))}
return(data.frame(time=x[,1],Ngdist=x[,2]))
}
#' @param labels_clades_dates_path path to three-column csv file containing labels, clade names and dates
load_labels_clades_dates <- function(labels_clades_dates_path) {
labels_clades_dates <- read_csv(
labels_clades_dates_path,
col_names = FALSE,
col_types = cols(X1=col_character(), X2=col_character(), X3=col_date("%Y-%m-%d"))
)
names(labels_clades_dates) <- c('label','clade_name','date')
labels_clades_dates <- as_tbl_time(labels_clades_dates, index=date)
}
#' @param tree phylogenetic tree
#' @param labels_dates data frame with fields 'label' and 'date'
#' @param truncation_date date after which all tips will be dropped from the tree
truncate_tree_by_date <- function(tree, labels_dates, truncation_date) {
labels_to_exclude <- labels_dates %>%
filter(date > truncation_date) %>%
pull(label)
ladderize(drop.tip(tree, labels_to_exclude))
}
#' @param tree1 phylogenetic tree, truncated at an earlier time than tree2
#' @param tree2 phylogenetic tree, truncated at an later time than tree1
#' @param clades list of clades as generated by pickClades() on tree1
growth_ratios_for_clades <- function(tree1, tree2, clades){
ratios <- c()
for (i in seq_along(clades$clades)) {
tree1_clade_tip_labels <- tree1$tip.label[clades$clades[[i]]]
tree1_clade_mrca <- findMRCA(tree1, tree1_clade_tip_labels)
tree2_clade_mrca <- findMRCA(tree2, tree1_clade_tip_labels)
tree1_clade_mrca_descendants <- getDescendants(tree1, tree1_clade_mrca)
tree2_clade_mrca_descendants <- getDescendants(tree2, tree2_clade_mrca)
ratio <- length(tree2_clade_mrca_descendants) / length(tree1_clade_mrca_descendants)
ratios <- append(ratios, ratio)
}
data.frame("clade" = clades$nodes, "growth_ratio" = ratios)
}