BRC_growth_rate_microbiome.RMD

---
title: "How fast does biological rock crust grow?"
subtitle: "Microbial community analysis"

date: "`r format(Sys.time(), '%d %B, %Y')`"
bibliography: references.bib
link-citations: yes
always_allow_html: true
output:
  rmarkdown::github_document:
    toc: true
    toc_depth: 5
    df_print: "kable"
    keep_html: TRUE
    
---

```{r libraries, include=F}
library(extrafont) # Tools for using fonts, CRAN v0.17
library(svglite) # An 'SVG' Graphics Device, CRAN v1.2.3.2
library(ragg) # Graphic Devices Based on AGG, CRAN v0.3.1
library(tidyverse) # Easily Install and Load the 'Tidyverse', CRAN v1.3.0 
library(magrittr) # A Forward-Pipe Operator for R, CRAN v1.5
library(scales) # Scale Functions for Visualization, CRAN v1.1.1
library(ggpomological) # Pomological plot themes for ggplot2, [github::gadenbuie/ggpomological] v0.1.2 
library(cowplot) # Streamlined Plot Theme and Plot Annotations for 'ggplot2', CRAN v1.1.0
library(patchwork) # The Composer of Plots, CRAN v1.0.1 # The Composer of Plots, CRAN v1.0.1
library(kableExtra) # Construct Complex Table with 'kable' and Pipe Syntax, CRAN v1.2.1
library(see) # Visualisation Toolbox for 'easystats' and Extra Geoms, Themes and Color Palettes for 'ggplot2', CRAN v0.6.0
library(phyloseq) # Handling and analysis of high-throughput microbiome census data, Bioconductor v1.32.0
library(speedyseq) # psmelt
library(vegan) # Community Ecology Package, CRAN v2.5-6
library(BiodiversityR) # Package for Community Ecology and Suitability Analysis, CRAN v2.12-1
# library(agricolae)
library(car) # Companion to Applied Regression, CRAN v3.0-9
library(userfriendlyscience) # Quantitative Analysis Made Accessible, CRAN v0.7.2
library(emmeans) # Estimated Marginal Means, aka Least-Squares Means, CRAN v1.5.1
library(multcomp) # Simultaneous Inference in General Parametric Models, CRAN v1.4-14
library(ALDEx2) # Analysis Of Differential Abundance Taking Sample Variation Into Account, Bioconductor v1.20.0
library(corncob) # Count Regression for Correlated Observations with the Beta-Binomial, [github::bryandmartin/corncob] v0.1.0
library(ggrepel) # Automatically Position Non-Overlapping Text Labels with 'ggplot2', CRAN v0.8.2 
```

```{r style settings, include=F}
# Global output settings for the whole document: console width and the table
# format picked up by knitr::kable().
options(width = 90, knitr.table.format = "html") 
# Defaults applied to every chunk that follows: silence warnings/messages,
# cache chunk results, and write every figure with both the svglite and the
# png device on a white background.
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  cache = TRUE,
  dev = c("svglite", "png"),
  dev.args = list(svglite = list(bg = 'white'), png = list(bg = 'white')),
  dpi = 300,
#  fig.width = 12,
#  fig.height = 8,
  cache.path = "BRC_growth_rate_cache/",
  fig.path = "BRC_growth_rate_figures/"
)
# Font family and base size shared by all ggplot2 figures.
f_name <- "DejaVu Sans" #sub("\\s//", "", f_name)
f_size <- 12
# Import the fonts whose file name matches the pattern without prompting
# (extrafont); this has to happen before the theme below refers to f_name.
font_import(pattern = "DejaVuSans\\.", prompt = FALSE)
loadfonts() # registers fonts
# Make theme_bw() with the chosen font family and size the default ggplot2 theme.
theme_set(theme_bw(base_size = f_size, 
                      base_family = f_name
                      ))
```

```{r functions, include=FALSE}
# Bar chart of library sizes on a log10 y axis, one facet row per level of
# `facet`.
#
# Args:
#   Ps_obj: a phyloseq object; library sizes are the row sums of its OTU table
#           (presumably samples are rows - confirm against the caller).
#   x:      name of the sample-data column mapped to the x axis.
#   fill:   name of the sample-data column mapped to the bar fill.
#   facet:  name of the sample-data column used for the facet rows.
#
# Returns:
#   A ggplot object.
PlotLibDist <- function(Ps_obj, x = "Replicate", fill = "Rock.type", facet = "Location") {
  # Kept under this exact name: it is referenced inside aes() below and
  # therefore becomes the default y-axis title.
  Library.size <- rowSums(otu_table(Ps_obj))

  # Axis breaks at powers of ten, labelled as 10^n
  power_breaks <- trans_breaks("log10", function(pow) 10 ^ pow)
  power_labels <- trans_format("log10", math_format(10 ^ .x))

  base_plot <- ggplot(
    sample_data(Ps_obj),
    aes(x = !!sym(x), y = Library.size, fill = !!sym(fill))
  )
  bars <- geom_bar(stat = "identity", position = "dodge", color = "black")

  base_plot +
    bars +
    scale_y_log10(breaks = power_breaks, labels = power_labels) +
    background_grid(major = "xy", minor = "none") +
    scale_fill_pomological() +
    facet_grid(get(facet) ~ .)
}

# Frequency polygon of the count values in an abundance matrix, drawn on a
# log10 y axis.
#
# Args:
#   OTUmat:  a count matrix; it is transposed and every value of every column
#            of the transposed matrix is pooled into one distribution.
#   b.width: bin width passed to geom_freqpoly().
#
# Returns:
#   A ggplot object.
PlotReadHist <- function(OTUmat, b.width = 10) {
  OTUmat %>%
    t() %>%
    as_tibble() %>% # as.tibble() is deprecated in favour of as_tibble()
    # pivot_longer() replaces the superseded gather(); only the pooled
    # `abundance` values are plotted, so the row order is irrelevant
    pivot_longer(
      cols = everything(),
      names_to = "sample",
      values_to = "abundance"
    ) %>%
    ggplot(aes(abundance)) +
    # geom_histogram(binwidth = 1000) +
    geom_freqpoly(binwidth = b.width) +
    scale_y_log10()
}

# Computes the GMPR size factor (geometric mean of pairwise ratios).
#
# Args:
#   comm: a matrix (or data frame) of counts, row - features (OTUs, genes,
#         etc.), column - sample
#   intersect.no: the minimum number of shared features between a sample pair
#         for which the ratio is calculated
#   ct.min: the minimum number of counts required to calculate ratios
#         (the original authors report ct.min = 4 as suitable)
#
# Returns:
#   a list that contains:
#      gmpr: the GMPR size factors for all samples; samples with distinct
#            sets of features will be output as NA.
#      nss:  number of samples with significant sharing (>= intersect.no),
#            including the sample itself
GMPR <- function (comm,
                  intersect.no = 4,
                  ct.min = 4) {
  # Column medians: use the compiled matrixStats implementation when the
  # package can be attached (as the original require() call did), otherwise
  # fall back to base R instead of failing later with "could not find function".
  if (require(matrixStats, quietly = TRUE)) {
    col_medians <- function(m) matrixStats::colMedians(m, na.rm = TRUE)
  } else {
    col_medians <- function(m) apply(m, 2, stats::median, na.rm = TRUE)
  }

  # is.finite() and the median helpers need a matrix, not a list of columns
  if (is.data.frame(comm)) {
    comm <- as.matrix(comm)
  }

  # mask counts < ct.min
  comm[comm < ct.min] <- 0

  n_samples <- ncol(comm)
  if (is.null(colnames(comm))) {
    colnames(comm) <- paste0('S', seq_len(n_samples))
  }

  cat('Begin GMPR size factor calculation ...\n')

  # Preallocated results; a size factor stays NA unless enough samples share
  # features with sample i
  gmpr <- rep(NA_real_, n_samples)
  comm.no <- numeric(n_samples)
  for (i in seq_len(n_samples)) {
    if (i %% 50 == 0) {
      cat(i, '\n')
    }
    x <- comm[, i]
    # Pairwise ratio of sample i against every sample (column-wise recycling)
    pr <- x / comm
    # Ratios involving a zero count (0, NaN, Inf) carry no information
    pr[!is.finite(pr) | pr == 0] <- NA
    # Number of usable ratios per sample pair
    incl.no <- colSums(!is.na(pr))
    # Median ratio per sample pair
    pr.median <- col_medians(pr)
    # Sample pairs sharing enough features to be used for the GMPR
    shared <- incl.no >= intersect.no
    comm.no[i] <- sum(shared)
    # Geometric mean of the pairwise medians
    if (comm.no[i] > 1) {
      gmpr[i] <- exp(mean(log(pr.median[shared])))
    }
  }

  if (anyNA(gmpr)) {
    warning(
      paste0(
        'The following samples\n ',
        paste(colnames(comm)[is.na(gmpr)], collapse = '\n'),
        '\ndo not share at least ',
        intersect.no,
        ' common taxa with the rest samples! ',
        'For these samples, their size factors are set to be NA! \n',
        'You may consider removing these samples since they are potentially outliers or negative controls!\n',
        'You may also consider decreasing the minimum number of intersecting taxa and rerun the procedure!\n'
      )
    )
  }

  cat('Completed!\n')
  cat(
    'Please watch for the samples with limited sharing with other samples based on NSS! They may be outliers! \n'
  )
  names(gmpr) <- names(comm.no) <- colnames(comm)
  return(list(gmpr = gmpr, nss = comm.no))
}

# Draws up to six diagnostic panels for a fitted linear model on a grid page.
#
# Args:
#   lm.df: data frame produced by fortify() on an lm/aov fit (the columns
#          .fitted, .resid, .stdresid and .cooksd are used) with an extra
#          numeric column `factors` coding the factor-level combination of
#          each observation.
#   which: panel numbers to draw: 1 residuals vs fitted, 2 normal Q-Q,
#          3 scale-location, 4 standardized residuals vs factor levels,
#          5 Cook's distance vs factor levels, 6 histogram of residuals.
#   mfrow: c(rows, columns) of the page layout. With c(1, 1) the panels are
#          shown one at a time and the function waits for <Return> in between.
#
# Returns:
#   NULL (invisibly); called for its plotting side effect.
PlotLmResid <- function(lm.df, which = c(1:6), mfrow = c(3, 2)){
  library(grid) # attached (as before) for grid.newpage(), viewport(), ...
  if (length(levels(as.factor(lm.df$.fitted))) < 10) {# if number of unique x values is <10 just draw a line through the means
    smoother <- stat_summary(fun = mean, colour = "red", geom = "line")
  } else {
    smoother <- stat_smooth(method = "loess", geom = "smooth", se = FALSE, colour = "firebrick", size = 1)
  }

  # residuals vs fitted
  g1 <- ggplot(lm.df, aes(.fitted, .resid)) +
    geom_point2()  +
    smoother +
    geom_hline(yintercept = 0, linetype = 2, size = .2) +
    scale_x_continuous("Fitted Values") +
    scale_y_continuous("Residual") +
    labs(title = "Residuals vs Fitted")

  # normal qq: reference line through the first and third quartiles
  a <- quantile(lm.df$.stdresid, c(0.25, 0.75), na.rm = TRUE)
  b <- qnorm(c(0.25, 0.75))
  slope <- diff(a)/diff(b)
  int <- a[1] - slope * b[1]
  g2 <- ggplot(lm.df, aes(sample = .stdresid)) +
    stat_qq() +
    geom_abline(slope = slope, intercept = int, colour = "firebrick", size = 1) +
    scale_x_continuous("Theoretical Quantiles") +
    scale_y_continuous("Standardized Quantiles") +
    labs(title = "Normal Q-Q")

  # scale-location
  g3 <- ggplot(lm.df, aes(.fitted, sqrt(abs(.stdresid)))) +
    geom_point2() +
    smoother +
    scale_x_continuous("Fitted Values") +
    scale_y_continuous("Root of Standardized Residuals") +
    labs(title = "Scale-Location")

  # standardized residuals vs factor-level combinations
  g4 <- ggplot(lm.df, aes(factors, .stdresid)) +
    geom_point2() +
    smoother +
    geom_hline(yintercept = 0, linetype = 2, size = .2) +
    scale_x_continuous("Factor Level Combinations") +
    scale_y_continuous("Standardized Residuals") +
    labs(title = "Residuals vs Factor Levels")

  # Cook's distance vs factor-level combinations (the x axis is the factor
  # coding, not leverage, so the title says so)
  g5 <- ggplot(lm.df, aes(factors, .cooksd)) +
    geom_point2() +
    smoother +
    scale_x_continuous("Factor Level Combinations") +
    scale_y_continuous("Cook's distance") +
    labs(title = "Cook's dist vs Factor Levels")

  # Histogram breaks chosen by minimising the Shimazaki-Shinomoto cost
  # function over 2..100 equally spaced break points.
  # 2006 Author Hideaki Shimazaki
  # Department of Physics, Kyoto University
  # shimazaki at ton.scphys.kyoto-u.ac.jp
  sshist <- function(x){
    N <- 2:100
    C <- numeric(length(N))
    D <- C
    for (i in seq_along(N)) {
      D[i] <- diff(range(x)) / N[i]
      edges <- seq(min(x), max(x), length.out = N[i])
      hp <- hist(x, breaks = edges, plot = FALSE)
      ki <- hp$counts
      k <- mean(ki)
      v <- sum((ki - k) ^ 2) / N[i]
      C[i] <- (2 * k - v) / D[i] ^ 2 # Cost Function
    }
    idx <- which.min(C)
    seq(min(x), max(x), length.out = N[idx])
  }

  # histogram of the residuals
  bins <- sshist(lm.df$.resid)
  g6 <- ggplot(lm.df, aes(.resid)) +
    geom_histogram(breaks = bins)

  plots <- list(g1, g2, g3, g4, g5, g6)

  if (!all(which %in% seq_along(plots))) {
    stop("'which' must only contain panel numbers between 1 and ", length(plots), call. = FALSE)
  }
  if (prod(mfrow) > 1 && length(which) > prod(mfrow)) {
    stop("'mfrow' provides fewer cells than there are panels in 'which'", call. = FALSE)
  }

  # making the plots
  grid.newpage()

  if (prod(mfrow) > 1) {
    # Cell coordinates, ordered so that the panels fill the page row by row
    mypos <- expand.grid(1:mfrow[1], 1:mfrow[2])
    mypos <- mypos[with(mypos, order(Var1)), ]
    pushViewport(viewport(layout = grid.layout(mfrow[1], mfrow[2])))
    formatter <- function(.){}
  } else {
    # Single cell: every panel goes to position (1, 1) on a fresh page
    mypos <- data.frame(matrix(1, length(which), 2))
    pushViewport(viewport(layout = grid.layout(1, 1)))
    formatter <- function(.) {
      .dontcare <- readline("Hit <Return> to see next plot: ")
      grid.newpage()
    }
  }

  for (j in seq_along(which)) {
    formatter()
    print(plots[[which[j]]],
          vp = viewport(layout.pos.row = mypos[j, 1], layout.pos.col = mypos[j, 2]))
  }
}

# Two-way factorial ANOVA of a response on two crossed factors, with residual
# diagnostics and an optional Box-Cox transformation of the response.
#
# Args:
#   data2test:     data frame holding the response and both factors.
#   response_name: name of the response column.
#   factor_names:  names of the two factor columns (exactly two are required).
#   boxcox.trans:  if TRUE, estimate the Box-Cox lambda on [0, 1], transform
#                  the response and refit the model on the transformed values.
#
# Side effects:
#   Prints the model, the ANOVA table (Type I for equal group sizes, otherwise
#   Type III) and the table of means; draws the diagnostic panels with
#   PlotLmResid(); boxcox() draws its profile-likelihood plot.
#
# Returns:
#   The fitted aov object (on the transformed response if boxcox.trans = TRUE).
TestAlphaV3 <- function(data2test = Richness_Diversity_long_sub,
                        response_name = "Estimate",
                        factor_names = c("Location", "Rock.type"),
                        boxcox.trans = FALSE) {
  library(dplyr)
  stopifnot(
    is.character(response_name), length(response_name) == 1,
    is.character(factor_names), length(factor_names) == 2
  )

  model_terms <- paste(factor_names[1], factor_names[2], sep = " * ")
  fit_formula <- as.formula(paste(response_name, model_terms, sep = " ~ "))
  mod_lm <- aov(fit_formula, data2test)

  if (boxcox.trans) {
    # employ boxcox transformation then recalculate model
    print("Performing Box-Cox transformation of the data")
    # The lambdas$x / lambdas$y result used below is the MASS::boxcox() format
    lambdas <- MASS::boxcox(fit_formula,
                            data = data2test,
                            lambda = seq(0, 1.0, 0.01))
    # 95% confidence range of lambda from the profile likelihood
    print(range(lambdas$x[lambdas$y > max(lambdas$y) - qchisq(0.95, 1) /
                            2]))
    print(l.max <- lambdas$x[which.max(lambdas$y)])
    response <- data2test[[response_name]]
    # lambda = 0 is the log transformation (the limit of (y^lambda - 1) / lambda);
    # substituting lambda = 1 would leave the data untransformed
    data2test$Estimate.box <- if (isTRUE(all.equal(l.max, 0))) {
      log(response)
    } else {
      (response ^ l.max - 1) / l.max
    }
    fit_formula <- as.formula(paste("Estimate.box", model_terms, sep = " ~ "))
    mod_lm <- aov(fit_formula, data2test)
  }

  print(mod_lm)
  mod_df <- fortify(mod_lm)
  # Number the factor-level combinations in order of appearance; PlotLmResid()
  # uses this coding as the x axis of two of its panels
  cell_labels <- paste(mod_df[, factor_names[1]], mod_df[, factor_names[2]])
  factor.combinations <-
    as.numeric(factor(cell_labels, levels = unique(as.character(cell_labels))))
  mod_df <-
    cbind(mod_df,
          rows = seq_len(nrow(mod_df)),
          factors = factor.combinations)
  PlotLmResid(mod_df)

  group_sizes <- data2test %>%
    group_by(!!sym(factor_names[1]), !!sym(factor_names[2])) %>%
    dplyr::count() %>%
    pull(n)
  if (n_distinct(group_sizes) == 1) {
    print("Equal group sizes - showing SS type I")
    print(summary(mod_lm)) # display Type I ANOVA table
  } else {
    print("Unequal group sizes - showing SS type III")
    # Type III tests are only meaningful for sum-to-zero contrasts, and the
    # contrast coding is fixed when the model is fitted. Refit under
    # contr.sum for the table and restore the caller's options afterwards.
    old_opts <- options(contrasts = c("contr.sum", "contr.poly"))
    on.exit(options(old_opts), add = TRUE)
    mod_type3 <- aov(fit_formula, data2test)
    print(Anova(mod_type3, type = "III")) # type III SS
  }
  print(model.tables(mod_lm, "means"), digits = 3) # Show the means
  mod_lm
}

# Pairwise PERMANOVA (adonis) between all pairs of groups.
# Adapted from: https://github.com/pmartinezarbizu/pairwiseAdonis
#
# Args:
#   x:            community data (rows = samples) from which the distances
#                 are computed.
#   factors:      grouping vector, one entry per row of x.
#   sim.function: "daisy" to use cluster::daisy(), anything else uses vegdist().
#   sim.method:   distance method/metric passed to the chosen function.
#   p.adjust.m:   method passed to p.adjust().
#   reduce:       optional regular expression; only pairs whose label matches
#                 are kept and the p-value adjustment is redone on that subset.
#   permutations: number of permutations passed to adonis().
#
# Returns:
#   A data frame of class c("pwadonis", "data.frame") with the columns pairs,
#   total.DF, F.Model, R2, p.value, p.adjusted and sig.
PairwiseAdonis <- function(x, factors, sim.function = "vegdist", sim.method = "horn", 
    p.adjust.m = "BH", reduce = NULL, permutations = 999) 
{
    # Significance flags for a vector of (adjusted) p-values
    sig_code <- function(p) {
        code <- rep("", length(p))
        code[p <= 0.05] <- "."
        code[p <= 0.01] <- "*"
        code[p <= 0.001] <- "**"
        code[p <= 1e-04] <- "***"
        code
    }

    groups <- unique(as.character(factors))
    if (length(groups) < 2) {
        stop("'factors' must contain at least two distinct groups", call. = FALSE)
    }
    co <- combn(groups, 2)
    n_pairs <- ncol(co)

    # Preallocate one slot per pair instead of growing the vectors
    pairs <- character(n_pairs)
    total.DF <- numeric(n_pairs)
    F.Model <- numeric(n_pairs)
    R2 <- numeric(n_pairs)
    p.value <- numeric(n_pairs)

    for (elem in seq_len(n_pairs)) {
        in_pair <- factors %in% co[, elem]
        if (sim.function == "daisy") {
            x1 <- cluster::daisy(x[in_pair, ], metric = sim.method)
        } else {
            x1 <- vegdist(x[in_pair, ], method = sim.method)
        }
        pair_groups <- factors[in_pair]
        ad <- adonis(x1 ~ pair_groups, permutations = permutations)
        pairs[elem] <- paste(co[1, elem], "-", co[2, elem])
        total.DF[elem] <- ad$aov.tab["Total", 1]
        F.Model[elem] <- ad$aov.tab[1, 4]
        R2[elem] <- ad$aov.tab[1, 5]
        p.value[elem] <- ad$aov.tab[1, 6]
    }

    p.adjusted <- p.adjust(p.value, method = p.adjust.m)
    sig <- sig_code(p.adjusted)
    pairw.res <- data.frame(pairs, total.DF, F.Model, R2, p.value, 
        p.adjusted, sig)
    if (!is.null(reduce)) {
        pairw.res <- subset(pairw.res, grepl(reduce, pairs))
        # Re-adjust over the retained pairs only and keep the adjusted values
        # in the output (they used to be dropped by a stale column index)
        pairw.res$p.adjusted <- p.adjust(pairw.res$p.value, method = p.adjust.m)
        pairw.res$sig <- sig_code(pairw.res$p.adjusted)
    }
    class(pairw.res) <- c("pwadonis", "data.frame")
    return(pairw.res)
}

# STAMP-like analysis of differential relative abundance. This is an extended
# and updated take on Parks, D. H., Tyson, G. W., Hugenholtz, P., and Beiko,
# R. G. (2014). STAMP: statistical analysis of taxonomic and functional
# profiles. Bioinformatics 30, 3123-3124. doi:10.1093/bioinformatics/btu494.
#
# For every taxon (agglomerated at `tax_rank`) above the abundance threshold:
#   * global test: Aligned Rank Transform ANOVA (ARTool) of the relative
#     abundance (%) on one factor, or on two crossed factors;
#   * post hoc: Tukey-adjusted pairwise contrasts of the estimated marginal
#     means of the ART model. With two factors the contrasts are between the
#     factor-level combinations; this may not generalise beyond two factors
#     (see the ARTool "art-contrasts" vignette);
#   * effect sizes: level means, pairwise mean differences and their
#     confidence limits from an ordinary linear model, i.e. on the scale of
#     the data rather than of the ranks.
#
# Args:
#   physeq_obj: a phyloseq object holding counts.
#   tax_rank:   taxonomic rank at which taxa are agglomerated.
#   vars2test:  one or two sample-data variables to test.
#   threshold:  taxa whose relative abundance summed over all samples is
#               below this value are not tested.
#   outputfile: prefix of the two CSV files written to ./Results/.
#
# Returns:
#   A list of two tibbles: [[1]] p-values and partial eta squared of the
#   global test followed by one p-value column per pairwise contrast;
#   [[2]] for every contrast five columns: mean of the first level, mean of
#   the second level, "Estimate diff.", "low CI" and "high CI".
STAMPR2 <- function(physeq_obj,
                    tax_rank = "Phylum",
                    vars2test = c("Location", "Rock.type"),
                    threshold = 0.005,
                    outputfile = "STAMPR_output") {
    # library() instead of require(): fail right here if a package is missing
    library("emmeans") # Estimated marginal means (Least-squares means)
    library("ARTool") # Aligned Rank Transform
    library("multcomp") # Simultaneous Inference in General Parametric Models
    if (length(vars2test) != 1 &&
        length(vars2test) != 2) {
      stop('This function only supports 1 or two independent variables')
    }

    test_expression <-
      as.formula(paste("Abundance", paste(vars2test, collapse = " * "), sep = " ~ "))

    physeq_rel <-
      transform_sample_counts(physeq_obj, function(x)
        x / sum(x)) # convert to relative abundance

    physeq_glom_rel <- tax_glom(physeq_rel,
                                tax_rank,
                                NArm = TRUE) # agglomerate taxa using taxonomy

    # Test only abundant taxa: sum the relative abundance per taxon over all
    # samples and keep the taxa at or above the threshold
    Taxa2test <- physeq_glom_rel %>%
      psmelt(.) %>%
      group_by(!!sym(tax_rank)) %>%
      summarise(tot_abundance = sum(Abundance)) %>%
      filter(tot_abundance >= threshold) %>%
      pull(1) %>%
      as.character()

    physeq_glom_rel_abund <- # remove rare taxa
      prune_taxa(tax_table(physeq_glom_rel)[, tax_rank] %in% Taxa2test, physeq_glom_rel)

    n_taxa <- ntaxa(physeq_glom_rel_abund)
    tax_names <- as(tax_table(physeq_glom_rel_abund)[, tax_rank], "vector")
    rank_scale_note <- "Note that the estimates aren’t as in a linear model because they are on the scale of ranks and not the data, but the p-values are useful"

    # Empty (NA) results table: taxon name, the named statistics of the global
    # test and one placeholder column per pairwise contrast
    new_results_table <- function(stat_names, n_pairs) {
      as_tibble(matrix(NA_real_, nrow = n_taxa, ncol = length(stat_names) + n_pairs),
                .name_repair = "minimal") %>%
        bind_cols(Phylum = tax_names, .) %>%
        set_names(c(tax_rank, stat_names, seq_len(n_pairs)))
    }

    # Empty (NA) estimates table: taxon name plus five columns per pair
    # (both level names, "Estimate diff.", "low CI", "high CI")
    new_estimates_table <- function(pair_levels) {
      estimates <-
        as_tibble(matrix(NA_real_, nrow = n_taxa, ncol = ncol(pair_levels) * 5),
                  .name_repair = "minimal") %>%
        bind_cols(Phylum = tax_names, .)
      colnames(estimates) <- # cannot use set_names with non-unique names
        c(tax_rank, c(rbind(
          pair_levels,
          matrix(rep(c("Estimate diff.", "low CI", "high CI"), ncol(pair_levels)),
                 ncol = ncol(pair_levels))
        ))) # c() flattens the matrix column by column, i.e. pair by pair
      estimates
    }

    # Write the data-scale estimates of one taxon into row `row_id`. The level
    # columns are (re)named after the contrast labels so that the names always
    # agree with the values stored underneath them.
    fill_estimates <- function(estimates, row_id, emm_means, level_column, emm_ci) {
      means <- summary(emm_means)
      level_names <- as.character(means[[level_column]])
      pair_levels <- str_split(as.character(emm_ci$contrast), " - ", simplify = TRUE)
      first_cols <- seq(2, by = 5, length.out = nrow(pair_levels))
      colnames(estimates)[first_cols] <- pair_levels[, 1]
      colnames(estimates)[first_cols + 1] <- pair_levels[, 2]
      estimates[row_id, first_cols] <-
        as.list(means$emmean[match(pair_levels[, 1], level_names)])
      estimates[row_id, first_cols + 1] <-
        as.list(means$emmean[match(pair_levels[, 2], level_names)])
      estimates[row_id, first_cols + 2] <- as.list(emm_ci$estimate)
      estimates[row_id, first_cols + 3] <- as.list(emm_ci$lower.CL)
      estimates[row_id, first_cols + 4] <- as.list(emm_ci$upper.CL)
      estimates
    }

    if (length(vars2test) == 1) {# One-way test
      cmb <- combn(unique(as.character(get_variable(physeq_glom_rel_abund, vars2test))), 2)
      n_pairs <- ncol(cmb)
      pair_cols <- seq(4, 3 + n_pairs)
      emm_expression <- as.formula(paste("", vars2test, sep = " ~ "))

      taxa_test_results <- new_results_table( # prepare results table (P and Eta)
        c(paste(vars2test, "- P"), paste(vars2test, "- EtaSq")),
        n_pairs
      )
      taxa_test_estimates <- new_estimates_table(cmb) # prepare estimates table

      for (phy_id in seq_len(n_taxa)) {
        data2test <-
          bind_cols(Abundance = as.numeric(otu_table(physeq_glom_rel_abund)[, phy_id] * 100),
                    as(sample_data(physeq_glom_rel_abund), "data.frame"))
        # same factor coercion as in the two-way branch
        data2test[[vars2test]] <- as.factor(data2test[[vars2test]])
        print(tax_table(physeq_glom_rel_abund)[phy_id, tax_rank])

        art_mod <- art(test_expression, data = data2test)
        print(mod_summary <- anova(art_mod))
        taxa_test_results[phy_id, 2] <- mod_summary$`Pr(>F)` # p values
        taxa_test_results[phy_id, 3] <- # partial EtaSq (effect size)
          mod_summary$`Sum Sq` / (mod_summary$`Sum Sq` + mod_summary$`Sum Sq.res`)

        art_mod_lm <- artlm(art_mod, vars2test) # build a linear model for ART data
        # Estimated marginal means (EMMs) and Tukey-adjusted pairwise contrasts
        emm_mod <- emmeans(art_mod_lm, emm_expression, weights = "equal")
        emm_mod_cons <- contrast(emm_mod, method = "pairwise", adjust = "tukey")
        print(rank_scale_note)

        # assign pairwise results to table; the labels are converted with
        # as.character() so that a factor is never written as integer codes
        rank_contrasts <- summary(emm_mod_cons)
        colnames(taxa_test_results)[pair_cols] <- as.character(rank_contrasts$contrast)
        taxa_test_results[phy_id, pair_cols] <- as.list(rank_contrasts$p.value)

        # assign estimates to table
        mod_lm <- lm(test_expression, data = data2test) # because we want the real means not the rank means!
        emm_mod_lm <- emmeans(mod_lm, emm_expression, weights = "equal")
        # contrasts of the linear model, not of the rank-scale ART model
        emm_mod_lm_cons <- contrast(emm_mod_lm, method = "pairwise", adjust = "tukey")
        emm_mod_lm_CI <- confint(emm_mod_lm_cons, adjust = "fdr", level = 0.95, type = "response")
        taxa_test_estimates <-
          fill_estimates(taxa_test_estimates, phy_id, emm_mod_lm, vars2test, emm_mod_lm_CI)
      }
    } else {# Two-way test
      sample_data(physeq_glom_rel_abund)$Vars_combination <-
        with(sample_data(physeq_glom_rel_abund), get(vars2test[1]):get(vars2test[2]))
      # The post hoc contrasts are between the observed factor-level
      # combinations, so the pair columns are counted from those
      comb_levels <-
        levels(droplevels(as.factor(get_variable(physeq_glom_rel_abund, "Vars_combination"))))
      cmb <- combn(comb_levels, 2)
      n_pairs <- ncol(cmb)
      pair_cols <- seq(8, 7 + n_pairs)
      interaction_name <- paste(vars2test, collapse = " X ")
      posthoc_expression <- as.formula(paste("Abundance", "Vars_combination", sep = " ~ "))
      emm_expression <- as.formula(paste("~", "Vars_combination"))

      taxa_test_results <- new_results_table( # prepare results table (P and Eta)
        c(paste(vars2test[1], "- P"),
          paste(vars2test[1], "- EtaSq"),
          paste(vars2test[2], "- P"),
          paste(vars2test[2], "- EtaSq"),
          paste(interaction_name, "- P"),
          paste(interaction_name, "- EtaSq")),
        n_pairs
      )
      taxa_test_estimates <- new_estimates_table(cmb) # prepare estimates table

      for (phy_id in seq_len(n_taxa)) {
        data2test <-
          bind_cols(Abundance = as.numeric(otu_table(physeq_glom_rel_abund)[, phy_id] * 100),
                    as(sample_data(physeq_glom_rel_abund), "data.frame")) %>%
          mutate_at(., vars2test[1], as.factor) %>%
          mutate_at(., vars2test[2], as.factor) %>%
          mutate_at(., "Vars_combination", as.factor)

        print(tax_table(physeq_glom_rel_abund)[phy_id, tax_rank])

        art_mod <- art(test_expression, data = data2test)
        print(mod_summary <- anova(art_mod))
        taxa_test_results[phy_id, c(2, 4, 6)] <-
          as.list(mod_summary$`Pr(>F)`) # p values
        taxa_test_results[phy_id, c(3, 5, 7)] <- # partial EtaSq (effect size)
          as.list(mod_summary$`Sum Sq` / (mod_summary$`Sum Sq` + mod_summary$`Sum Sq.res`))

        # Post hoc on the combined factor (ART contrasts are only valid on
        # combined factors; see
        # https://cran.r-project.org/web/packages/ARTool/vignettes/art-contrasts.html)
        art_mod_lm <- artlm(art(posthoc_expression, data = data2test), "Vars_combination")
        emm_mod <- emmeans(art_mod_lm, emm_expression)
        emm_mod_cons <- contrast(emm_mod, method = "pairwise", adjust = "tukey")
        print(rank_scale_note)

        # assign pairwise results to table (all pairs, significant or not)
        rank_contrasts <- summary(emm_mod_cons)
        colnames(taxa_test_results)[pair_cols] <- as.character(rank_contrasts$contrast)
        taxa_test_results[phy_id, pair_cols] <- as.list(rank_contrasts$p.value)

        # assign estimates to table: a cell-means linear model on the combined
        # factor gives the real means (not the rank means) and contrast labels
        # that line up with the ART contrasts above
        mod_lm <- lm(posthoc_expression, data = data2test)
        emm_mod_lm <- emmeans(mod_lm, emm_expression, weights = "equal")
        emm_mod_lm_cons <- contrast(emm_mod_lm, method = "pairwise", adjust = "tukey")
        emm_mod_lm_CI <- confint(emm_mod_lm_cons, adjust = "fdr", level = 0.95, type = "response")
        taxa_test_estimates <-
          fill_estimates(taxa_test_estimates, phy_id, emm_mod_lm, "Vars_combination", emm_mod_lm_CI)
      }
    }

    # No additional FDR correction across taxa is applied here; the pairwise
    # p-values are already adjusted within each taxon by contrast().
    # Make sure the output directory exists so the results are not lost at the
    # very last step.
    dir.create("./Results", showWarnings = FALSE, recursive = TRUE)
    write.csv(taxa_test_results, file = paste0("./Results/", outputfile, "_", tax_rank, "_Pvals.csv"))
    write.csv(taxa_test_estimates, file = paste0("./Results/", outputfile, "_", tax_rank, "_CI.csv"))
    Taxa_tests <- list(taxa_test_results, taxa_test_estimates)
    return(Taxa_tests)
  }

# Extended-error-bar plot for one pairwise comparison of a STAMPR2() result:
# left panel - mean abundance (%) of every taxon in the two groups; right
# panel - difference in mean abundance with its confidence interval, with the
# p-value of each taxon as axis label.
#
# Args:
#   STAMPR_output: two-element list as returned by STAMPR2(): [[1]] the
#                  p-value table, [[2]] the estimates table (five columns per
#                  pair after the taxon column).
#   pair:          name of the contrast column of the p-value table to plot.
#   tax_level:     name of the taxon column.
#   f_size:        base font size of both panels.
#
# Uses the colour vector `Gradient.colours`, which is defined outside this
# function.
#
# Returns:
#   A cowplot grid holding both panels.
plotSTAMPR <- function(STAMPR_output = Taxa_tests_phylum1, pair = "City - Slope", tax_level = "Phylum", f_size = 14){
  library(ggthemes)
  library(ggpomological)
  if (!is.list(STAMPR_output) || length(STAMPR_output) != 2) {
    stop("The function accepts only lists with two elements", call. = FALSE)
  }
  pvals <- STAMPR_output[[1]]
  estimates <- STAMPR_output[[2]]

  # The pair columns are the last columns of the p-value table and appear in
  # the same order as the five-column groups of the estimates table; this
  # holds for the one-way as well as for the two-way table layout.
  n_pairs <- (ncol(estimates) - 1) / 5
  first_pair_col <- ncol(pvals) - n_pairs + 1
  pair_col <- which(colnames(pvals) == pair)
  if (length(pair_col) != 1 || pair_col < first_pair_col) {
    stop("'pair' must match exactly one pairwise-comparison column of the p-value table",
         call. = FALSE)
  }
  pair_number <- pair_col - first_pair_col + 1
  stats_col <- 2 + (pair_number - 1) * 5

  # Columns: taxon, p-value, mean of group 1, mean of group 2, difference,
  # lower and upper confidence limit
  STAMPR_df <- bind_cols(pvals[, 1],
                         pvals[, pair],
                         estimates[, stats_col:(stats_col + 4)]) %>%
    mutate(Higher = if_else(.[[3]] > .[[4]], colnames(.)[3], colnames(.)[4])) %>%
    gather("Factor", `Mean abundance (%)`, 3:4) %>%
    mutate_at(c(tax_level, "Higher", "Factor"), ~fct_rev(.)) # fct_rev because coord_flip() reverses the order

  # One background band per taxon, alternating between two shades
  n_levels <- max(as.numeric(pull(STAMPR_df, tax_level)))
  shading <- tibble(
    min = seq(from = 0.5, to = n_levels, by = 1),
    max = seq(from = 1.5, to = n_levels + 0.5, by = 1)
  ) %>%
    add_column(Shade = rep(c(0, 1), length.out = nrow(.))) %>%
    mutate_at("Shade", ~as.factor(.))

  # Fresh background layer for each panel
  shade_layer <- function() {
    geom_rect(
      data = shading,
      aes(
        xmin = min,
        xmax = max,
        ymin = -Inf,
        ymax = Inf,
        fill = Shade,
        alpha = 0.1
      ),
      fill = rep(c("white", "#E9EDED"), length.out = nrow(shading)),
      show.legend = F
    )
  }

  # Left panel: mean abundance per group
  p1 <- ggplot() +
    shade_layer() +
    geom_col(data = STAMPR_df,
             mapping = aes(!! sym(tax_level), y = `Mean abundance (%)`, fill = Factor),
             width = 0.8,
             position = position_dodge(),
             alpha = 2/3) +
    scale_y_continuous(expand = c(0, 0)) +
    scale_x_discrete(expand = c(0, 0)) +
    coord_flip() +
    geom_rangeframe(data = STAMPR_df, aes(!! sym(tax_level), y = `Mean abundance (%)`), sides = "b") +
    scale_fill_manual(values = Gradient.colours[c(6, 5)])  +
    theme_tufte(base_size = f_size, base_family = "sans") +
    theme(legend.position = "top",
          legend.justification = 'left',
          legend.title = element_blank()) +
    guides(fill = guide_legend(reverse = TRUE))

  # Right panel: difference in means with confidence interval, coloured by the
  # group with the higher mean
  p2 <- ggplot() +
    shade_layer() +
    geom_hline(yintercept = 0, linetype = "dashed", color = "slategray", size = 1, alpha = 2/3) +
    geom_errorbar(data = STAMPR_df,
                  aes(
                    !! sym(tax_level),
                    ymin = `low CI`,
                    ymax = `high CI`,
                    colour = Higher),
                  width = 0.3,
                  alpha = 2/3
    ) +
    geom_point2(data = STAMPR_df,
                aes(!! sym(tax_level),
                    y = `Estimate diff.`,
                    colour = Higher),
                size = 4,
                alpha = 2/3) +
    scale_y_continuous(expand = c(0, 0)) +
    scale_x_discrete(
      expand = c(0, 0),
      breaks = STAMPR_df[[tax_level]],
      labels = formatC(deframe(STAMPR_df[, 2]), format = "e", digits = 2),
      position = "top"
    ) +
    geom_rangeframe(data = STAMPR_df, aes(!! sym(tax_level), y = seq(
      min(`low CI`),
      max(`high CI`),
      length.out = nrow(STAMPR_df)
    )), sides = "b") +
    scale_colour_manual(values = Gradient.colours[c(6, 5)])  +
    theme_tufte(base_size = f_size, base_family = "sans") +
    theme(legend.position = "none",
          plot.title = element_text(margin = margin(10, 0, 18, 0), hjust = 0.5),
          axis.title.y = element_text(margin = margin(t = 0, r = 0, b = 20, l = 0), angle = 180)) + # known issue: this axis-title setting shows no effect
    labs(title = "95% confidence intervals",
         y = "Difference in mean abundance (%)",
         x = "p-value (corrected)") +
    coord_flip()

  # p1 + p2 + plot_layout(widths = c(1, 2)) # doesn't look as good
  plot_grid(p1, p2, rel_widths = c(2, 3)) + theme(plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))
}

GGPlotCorncob <- function(da_analysis_df, OTU_labels = FALSE, Taxa = "Phylum", Y_val = "Differential abundance", sig_level = 0.05, Rank = Taxa_rank) {
  # Plot differential abundance model results as a jittered dot plot
  #
  # Args:
  #   da_analysis_df: data frame of per-OTU model results. The function reads
  #     the columns named by `Taxa` and `Y_val`, plus `Significance` ("Pass"
  #     marks significant OTUs), `Mean abundance (%)`, `ymin`, `ymax` and,
  #     when `OTU_labels = TRUE`, `OTU`.
  #   OTU_labels: if TRUE, label significant OTUs with their ID (the "Seq_"
  #     prefix is stripped).
  #   Taxa: name of the taxonomic-rank column shown on the x axis.
  #   Y_val: name of the column shown on the y axis; also used for the axis
  #     title and for the up/down counts in the corner annotation.
  #   sig_level: significance threshold; only used in the legend title.
  #   Rank: data frame with a column named `Taxa` giving the desired order of
  #     the x-axis levels (defaults to the global `Taxa_rank`).
  #
  # Returns: a ggplot object.
  
  require(magrittr)
  require(ggplot2)
  require(ggrepel)
  require(see)
  
  # one seeded jitter specification shared by all layers
  pos <- position_jitter(width = 0.1, seed = 1)
  da_analysis_df %<>% 
    mutate_at(vars(matches(Taxa)), as_factor) %>%  
    # order taxa as in Rank (Taxa_rank is calculated for the taxa box plots);
    # as.character() guards against Rank holding a factor column
    mutate(!!Taxa := fct_relevel(get(Taxa), as.character(pull(Rank, !!Taxa)))) %>% 
    mutate(!!Taxa := fct_relevel(get(Taxa), "Rare", after = Inf)) 
  
  # Count significantly increased / decreased OTUs using the plotted column
  # (Y_val) rather than a hard-coded column name
  is_sig <- da_analysis_df$Significance == "Pass"
  y_vals <- da_analysis_df[[Y_val]]
  corncob_summary <- tibble(Label = paste0("⬆", sum(y_vals > 0 & is_sig, na.rm = TRUE), " ⬇", sum(y_vals < 0 & is_sig, na.rm = TRUE), " (", nrow(da_analysis_df), ")"))
  
  # unique() avoids duplicated legend keys when rounding collapses breaks
  size_breaks <- unique(round(seq(min(da_analysis_df$`Mean abundance (%)`), 
                                  max(da_analysis_df$`Mean abundance (%)`), 
                                  length.out = 5), 1))
  
  p <-
    ggplot(da_analysis_df) +
    geom_point2(aes(
      x = !!sym(Taxa),
      y = !!sym(Y_val),
      colour = !!sym("Significance"),
      size = !!sym("Mean abundance (%)")),
      position = pos, 
      alpha = 2 / 3, 
      stroke = 0) +
    geom_linerange(aes(x = !!sym(Taxa),
                       y = !!sym(Y_val),
                       ymin = `ymin`,
                       ymax = `ymax`,
                       colour = !!sym("Significance")),
                   position = pos,
                   alpha = 1/5, 
                   show.legend = FALSE) +
    # summary annotation pinned to the top-right corner of the panel
    geom_text(
      data    = corncob_summary,
      mapping = aes(x = Inf, y = Inf, label = Label),
      hjust   = 1.1,
      vjust   = 1.6
    ) +
    xlab("") +
    ylab(Y_val) +
    labs(colour = paste("Significance at \n p <", sig_level), size = "Mean abundance (%)") +
    theme_grey(base_size = 18) +
    theme(axis.text.x = element_text(angle = 45.0, vjust = 1, hjust = 1),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank()) +
    guides(colour = guide_legend(override.aes = list(size = 5))) +
    scale_colour_manual(values = c(ggpomological:::pomological_base[[7]], ggpomological:::pomological_palette[[1]])) +
    scale_size_continuous(name = "Mean abundance (%)",
                          range = c(2, 8),
                          breaks = size_breaks)
  
  if (OTU_labels) {
    # label only the significant OTUs
    sig_df <- da_analysis_df[which(is_sig), ]
    p <- p + geom_label_repel(
      aes(x = !!sym(Taxa), y = !!sym(Y_val)),
      size = 6,
      label = sub("Seq_([0-9]+)", "\\1", pull(sig_df, "OTU")),
      position = pos,
      data = sig_df,
      # nudge_x = 0.4,
      colour = "#4a4a4a",
      label.size = NA, 
      alpha = 0.75, 
      # fontface = 'bold',
      box.padding = 0.80,
      point.padding = 0.5
    )
  }
  return(p)
}


gz <- function (in_path, out_path = tempfile()) 
{
  # Compress a text file using gz and delete the uncompressed file
  #
  # Args:
  #   in_path: path of an existing text file; it is read line by line and
  #     removed after a successful write.
  #   out_path: path of the gzip file to create (defaults to a temp file).
  #
  # Returns (invisibly): out_path.
  # Errors: if in_path does not exist, or if in_path and out_path refer to
  #   the same file (opening it for writing would truncate the input and the
  #   result would then be deleted).
  if (!file.exists(in_path)) {
    stop("Input file does not exist: ", in_path, call. = FALSE)
  }
  if (file.exists(out_path) &&
      normalizePath(in_path) == normalizePath(out_path)) {
    stop("'in_path' and 'out_path' must be different files", call. = FALSE)
  }
  out <- gzfile(out_path, "w")
  # close (and flush) the connection even if reading or writing fails
  tryCatch(
    writeLines(readLines(in_path), out),
    finally = close(out)
  )
  # only reached when the compressed copy was written without error
  file.remove(in_path)
  invisible(out_path)
}
```

### Setting general parameters:
```{r general parameters}
# Reproducibility and project-wide constants
set.seed(15102020)
bootstraps <- 1000
data_path <- "./Data/"
Proj_name <- "BRC_growth_rate"

# The four darkest shades (darkest first) of three sequential ColorBrewer palettes
Browns <- rev(RColorBrewer::brewer.pal(n = 9, "YlOrBr"))[1:4]
Greens <- rev(RColorBrewer::brewer.pal(n = 9, "YlGn"))[1:4]
Blues <- rev(RColorBrewer::brewer.pal(n = 9, "YlGnBu"))[1:4]
# Interleave browns and greens (brown 1, green 1, brown 2, ...) and append the blues
Gradient.colours <- c(rbind(Browns, Greens), Blues)
```

## Description
This script reproduces all sequence analysis steps and plots included in the paper plus some additional exploratory analyses.

### Load data
```{r load data, cache=TRUE}
# OTU table: transposed after reading so that samples are rows and OTUs are
# columns; columns are then ordered by OTU number
OTUmat <- t(read.csv(paste0(data_path, "Shivta_site_otuTab2.txt"), header = TRUE, row.names = 1))
sort.order <- as.numeric(gsub("OTU([0-9]+)", "\\1", colnames( OTUmat )))
OTUmat <- OTUmat[, order(sort.order )]

# Sample metadata: read once, with the grouping variables as factors and the
# sample codes as row names
read_csv(paste0(data_path, "Shivta_metadata.csv"),
                     trim_ws = TRUE) %>%
  mutate_at(
    c(
      "Rock.type",
      "Location"
    ), 
    ~(factor(.))
  ) %>% 
  column_to_rownames("Sample.code") ->
  Metadata

# Strip the trailing "Nimrod<digits>" / "Osnat<digits>" part of the sample names
row.names(OTUmat) <- gsub("(.*)Nimrod[0-9]+|Osnat[0-9]+", "\\1", row.names( OTUmat))
# Sort both tables by sample name so that their rows line up
Metadata <- Metadata[order(row.names(Metadata)), ]
OTUmat <- OTUmat[order(row.names(OTUmat)), ]
# calculate sample size
Metadata$Library.size <- rowSums(OTUmat)
Metadata$Location.rock <- with(Metadata, Location:Rock.type)

# Load taxonomy data
tax.file <- "Shivta_site_silva.nrv119.taxonomy"
Taxonomy <- read.table(paste0(data_path, tax.file), stringsAsFactors = FALSE) # read taxonomy file

# count how many ';' in each taxonomy string and pad with "Unclassified;" up to 6 ranks
# (a string without any ';' counts as 0 semicolons)
n_semicolons <- lengths(regmatches(Taxonomy$V2, gregexpr(";", Taxonomy$V2, fixed = TRUE)))
Taxonomy$V2 <- paste0(Taxonomy$V2, strrep("Unclassified;", pmax(6 - n_semicolons, 0)))

# split taxonomy to columns
do.call( "rbind", strsplit( Taxonomy$V1, ";", fixed = TRUE)) %>% 
  gsub( "size=([0-9]+)", "\\1", .) %>%
  data.frame( ., do.call( "rbind", strsplit( Taxonomy$V2, ";", fixed = TRUE)), stringsAsFactors = FALSE) %>% 
  apply(., 2, function(x) gsub( "\\(.*\\)", "", x)) %>% # drop the parenthesised suffix of each field
  replace(., . == "unclassified", "Unclassified") -> 
  Taxonomy
colnames( Taxonomy ) <- c( "OTU", "Frequency", "Domain", "Phylum", "Class", "Order", "Family", "Genus" )
# rownames(Taxonomy) <- colnames(Rock_weathering_OTUmat)
rownames(Taxonomy) <- Taxonomy[, 1]

Tree_IQ <- read_tree(paste0(data_path, "Shivta_site_otuReps.filtered.align.treefile"))

# generate phyloseq object (the OTU and Frequency columns are not taxonomic ranks)
Ps_obj <- phyloseq(otu_table(OTUmat, taxa_are_rows = FALSE),
                   tax_table(Taxonomy[, -c(1, 2)]),
                   sample_data(Metadata),
                   phy_tree(Tree_IQ)
)
# Reorder factors for plotting
sample_data(Ps_obj)$Location %<>% fct_relevel("Slope", "City")
```

Remove un- and mis-classified sequences, chloroplasts and mitochondria
```{r remove samples, cache=T}
# Taxonomic labels that mark an OTU for removal
domains2remove <- c("", "Archaea", "Eukaryota", "Unclassified")
classes2remove <- "Chloroplast"
families2remove <- "Mitochondria"
# Keep an OTU only if it has a phylum assignment and matches none of the lists above
Ps_obj_filt <- subset_taxa(
  Ps_obj,
  !(is.na(Phylum) |
      Domain %in% domains2remove |
      Class %in% classes2remove |
      Family %in% families2remove)
)
```

### Inspect library size and number of OTU
```{r Library Sizes, cache=T}
# Plot the library size of every sample, ordered from smallest to largest,
# and print summaries of the per-sample and per-taxon read totals.
Ps_obj_df <-
  as.data.frame(sample_data(Ps_obj_filt)) # Put sample_data into a ggplot-friendly data.frame
Ps_obj_df <- Ps_obj_df[order(Ps_obj_df$Library.size), ]
# running index along the sorted samples, used as the x axis
Ps_obj_df$Index <- seq(nrow(Ps_obj_df))
ggplot(data = Ps_obj_df, 
       aes(x = Index, y = Library.size, color = Location.rock)) + 
  geom_point2(size = 4) + 
  scale_colour_manual(values = ggpomological:::pomological_palette[c(2, 1, 9, 3)], name = "Location.rock")
# distribution of reads per sample and per taxon after filtering
summary(sample_sums(Ps_obj_filt))
summary(taxa_sums(Ps_obj_filt))
```

### Explore the prevalence of different taxa in the database
```{r explore prevalence, cache=T}
# Per-taxon prevalence (number of samples with a non-zero count), combined
# with the total read count and the taxonomy of each taxon
prevdf <- data.frame(
  Prevalence = apply(
    otu_table(Ps_obj_filt),
    if (taxa_are_rows(Ps_obj_filt)) 1 else 2,
    function(counts) sum(counts > 0)
  ),
  TotalAbundance = taxa_sums(Ps_obj_filt),
  tax_table(Ps_obj_filt)
)

# Mean and summed prevalence per phylum
Prevalence_phylum_summary <- prevdf %>%
  group_by(Phylum) %>%
  summarise(
    `Mean prevalence` = mean(Prevalence),
    `Sum prevalence` = sum(Prevalence)
  )

kable(Prevalence_phylum_summary, digits = c(0, 1, 0)) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)

# Mean and summed prevalence per order
Prevalence_Order_summary <- prevdf %>%
  group_by(Order) %>%
  summarise(
    `Mean prevalence` = mean(Prevalence),
    `Sum prevalence` = sum(Prevalence)
  )

kable(Prevalence_Order_summary, digits = c(0, 1, 0)) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)
```

Based on that we will remove all phyla with a prevalence of under 8
```{r remove rare taxa, cache=T}
# Phyla whose summed prevalence is below 8
filterPhyla <- Prevalence_phylum_summary %>%
  filter(`Sum prevalence` < 8) %>%
  dplyr::select(Phylum) %>%
  map(as.character) %>%
  unlist()
# Drop those phyla and refresh the per-sample library size
Ps_obj_filt <- subset_taxa(Ps_obj_filt, !Phylum %in% filterPhyla)
sample_data(Ps_obj_filt)$Library.size <- rowSums(otu_table(Ps_obj_filt))
# Compare the unfiltered and filtered objects
print(Ps_obj)
print(Ps_obj_filt)
```

#### Plot general prevalence features of the phyla
```{r prevalence phylum, cache=T, fig.height=12, fig.width=10}
# Keep only the taxa whose phylum is still present after filtering
prevdf_phylum_filt <- subset(prevdf, Phylum %in% get_taxa_unique(Ps_obj_filt, "Phylum"))
# Prevalence (fraction of samples) against total abundance, one panel per phylum
ggplot(
  prevdf_phylum_filt,
  aes(x = TotalAbundance, y = Prevalence / nsamples(Ps_obj_filt), color = Phylum)
) +
  # Include a guess for parameter
  geom_hline(yintercept = 0.05, alpha = 0.5, linetype = 2) +
  geom_point2(size = 2, alpha = 0.7) +
  scale_x_log10() +
  xlab("Total Abundance") +
  ylab("Prevalence [Frac. Samples]") +
  facet_wrap(~ Phylum) +
  theme(legend.position = "none")
```

#### Plot general prevalence features of the top 30 orders
```{r prevalence order, cache=T, fig.height=12, fig.width=10}
# Subset to the orders remaining after filtering
prevdf_order_filt <- subset(prevdf, Order %in% get_taxa_unique(Ps_obj_filt, "Order"))
# grab the top 30 most abundant orders
# (head() returns fewer rows instead of NA rows when there are fewer than 30 orders)
prevdf_order_filt %>% 
  group_by(Order) %>%
  summarise(Combined.abundance = sum(TotalAbundance)) %>% 
  arrange(desc(Combined.abundance)) %>% 
  head(30) %>% 
  dplyr::select(Order) ->
  Orders2plot
prevdf_order_filt2 <- subset(prevdf, Order %in% Orders2plot$Order)
# Prevalence (fraction of samples) against total abundance, one panel per order
ggplot(prevdf_order_filt2,
       aes(TotalAbundance, Prevalence / nsamples(Ps_obj_filt), color = Order)) +
  # Include a guess for parameter
  geom_hline(yintercept = 0.05,
             alpha = 0.5,
             linetype = 2) + geom_point2(size = 2, alpha = 0.7) +
  scale_x_log10() +  xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
  facet_wrap( ~ Order) + theme(legend.position = "none")
```

#### Unsupervised filtering by prevalence
We will remove all sequences which appear in fewer than 10% of the samples
```{r unsupervised filtering, cache=T}
# Prevalence threshold: 10% of the total number of samples
prevalenceThreshold <- 0.1 * nsamples(Ps_obj_filt)
prevalenceThreshold
# Names of the taxa that reach the threshold; these are kept by `prune_taxa()`
keepTaxa <- with(
  prevdf_phylum_filt,
  row.names(prevdf_phylum_filt)[Prevalence >= prevalenceThreshold]
)
Ps_obj_filt <- prune_taxa(keepTaxa, Ps_obj_filt)
# Refresh the per-sample library size after pruning
sample_data(Ps_obj_filt)$Library.size <- rowSums(otu_table(Ps_obj_filt))
print(Ps_obj)
print(Ps_obj_filt)
```
This removed `r ntaxa(Ps_obj) - ntaxa(Ps_obj_filt)` or `r percent(1 - (ntaxa(Ps_obj_filt) /  ntaxa(Ps_obj)))` of the sequences.

### Exploring the dataset features
First let's look at the count data distribution after filtering:
```{r plot abundance, cache=T}
# Plot the library-size distribution of the filtered data
# (PlotLibDist is defined elsewhere in this file)
PlotLibDist(Ps_obj_filt)

# Table of sample names and their library sizes after filtering
sample_data(Ps_obj_filt) %>% 
  as_tibble() %>% 
  dplyr::select(Sample.name, Library.size) %>% 
  as(., "data.frame") %>% 
  kable(.) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)
```
The figure and table indicate only a small deviation in the number of reads per sample.

```{r mod abundance, cache=T, fig.asp=.8}
# Test whether library size explains differences in community composition
# (permutational MANOVA with method = "horn", 9999 permutations); the outer
# parentheses print the fitted model
(mod1 <- adonis(
  otu_table(Ps_obj_filt) ~ Library.size,
  data = as(sample_data(Ps_obj_filt), "data.frame"), 
  method = "horn",
  permutations = 9999
))
# Read-count histogram (PlotReadHist is defined elsewhere in this file)
PlotReadHist(as(otu_table(Ps_obj_filt), "matrix"))
# Mean-SD plot of log2(count + 1) for the taxa that have at least one read
notAllZero <- (rowSums(t(otu_table(Ps_obj_filt))) > 0)
vsn::meanSdPlot(as.matrix(log2(t(otu_table(Ps_obj_filt))[notAllZero, ] + 1)))
```

The difference in library sizes is low and its effect on the community composition is minimal. 
We'll use the GMPR method for library size normalisation [@chen_gmpr:_2017]

```{r GMPR, cache=T}
# Build a GMPR-normalised copy of the filtered phyloseq object and re-test
# the effect of library size on community composition.
Ps_obj_filt_GMPR <- Ps_obj_filt
# Size factors from GMPR() (defined elsewhere), computed on the transposed
# OTU table (taxa in rows, samples in columns)
Ps_obj_filt %>%
  otu_table(.) %>%
  t() %>%
  as(., "matrix") %>%
  GMPR() ->
  GMPR_factors
# Divide every sample's counts by its size factor. %>% and %*% have the same
# precedence and associate left to right, so the transposed table is
# multiplied by the diagonal matrix before being transposed back.
# NOTE(review): assumes GMPR() returns an object with a `gmpr` element holding
# one factor per sample, in sample order - confirm against its definition.
Ps_obj_filt %>%
  otu_table(.) %>%
  t() %*% diag(1 / GMPR_factors$gmpr) %>%
  t() %>%
  as.data.frame(., row.names = sample_names(Ps_obj_filt)) %>%
  otu_table(., taxa_are_rows = FALSE) ->
  otu_table(Ps_obj_filt_GMPR)
# Library.size is set from the un-normalised object's sample sums
sample_data(Ps_obj_filt_GMPR)$Library.size <- sample_sums(Ps_obj_filt)
# Same library-size test as before, now on the normalised counts
adonis(
  otu_table(Ps_obj_filt_GMPR) ~ Library.size,
  data = as(sample_data(Ps_obj_filt_GMPR), "data.frame"),
  method = "horn",
  permutations = 9999
)
PlotLibDist(Ps_obj_filt_GMPR)
```

Did it improve anything?
```{r GMPR diag plots, cache=T, fig.asp=.8}
# Same diagnostics as for the raw counts, now on the GMPR-normalised table:
# read-count histogram (PlotReadHist is defined elsewhere in this file) ...
PlotReadHist(as(otu_table(Ps_obj_filt_GMPR), "matrix"))
# ... and a mean-SD plot of log2(count + 1) for taxa with at least one read
notAllZero <- (rowSums(t(otu_table(Ps_obj_filt_GMPR))) > 0)
vsn::meanSdPlot(as.matrix(log2(t(otu_table(Ps_obj_filt_GMPR))[notAllZero, ] + 1)))
```

### Alpha diversity 
#### Calculate and plot alpha diversity metrics
We do that by simulating 1000 rarefaction events and calculating the metrics each time. Then, the result is averaged.
```{r alpha-div, cache=T, fig.width=20, fig.height=10}
# Alpha diversity by repeated rarefaction: every sample is rarefied to the
# smallest library size `bootstraps` times, the metrics are calculated on
# each rarefied table and then averaged per sample.

# Standard error of the mean across rarefaction replicates.
# NOTE: previously computed as mean(x) / sqrt(length(x)), which is not a
# standard error; the stored *.se values for S.obs and the diversity indices
# therefore change.
std_err <- function(x) sd(x) / sqrt(length(x))

# template: one row per sample, one column per rarefaction
rarefaction.mat <- matrix(0, nrow = nsamples(Ps_obj_filt), ncol = bootstraps)
rownames(rarefaction.mat) <- sample_names(Ps_obj_filt)
# rarefaction depth (loop-invariant)
min_lib_size <- min(rowSums(otu_table(Ps_obj_filt)))

# richness estimators (list order matches the rows returned by estimateR)
rich.ests <- list(S.obs = rarefaction.mat, S.chao1 = rarefaction.mat, se.chao1 = rarefaction.mat,
                   S.ACE = rarefaction.mat, se.ACE = rarefaction.mat)
for (i in seq_len(bootstraps)) {
  sub.OTUmat <- rrarefy(otu_table(Ps_obj_filt), min_lib_size)
  # estimate once per rarefaction and reuse for all five estimators
  ests <- t(estimateR(sub.OTUmat))
  for (j in seq_along(rich.ests)) {
    rich.ests[[j]][, i] <- ests[, j]
  }
}
Richness <- data.frame(row.names = row.names(rich.ests[[1]]))
# i = 1: S.obs; i = 2 and 4: Chao1 and ACE, whose SE estimates sit at i + 1
for (i in c(1, seq(2, length(rich.ests), 2))) {
  S <- apply(rich.ests[[i]], 1, mean)
  if (i == 1) { 
    # no SE is returned for S.obs, so use the SE across rarefactions
    se <- apply(rich.ests[[i]], 1, std_err)
  } else {
    # average the per-rarefaction SE estimates
    se <- apply(rich.ests[[i + 1]], 1, mean)
  }
  Richness <- cbind(Richness, S, se)
}
colnames(Richness) <- c("S.obs", "S.obs.se", "S.chao1", "S.chao1.se", "S.ACE", "S.ACE.se")
saveRDS(Richness, file = paste0("./Results/", Proj_name, "_richness.RDS"))
write.csv(Richness, file = paste0("./Results/", Proj_name, "_richness.csv"))
# long-format estimates and their SEs
ses <- grep("\\.se", colnames(Richness))
Richness[, ses] %>% 
  gather(key = "est.se") -> se.dat
Richness[, -unique(ses)] %>% 
  gather(key = "est") -> mean.dat
n <- length(unique(mean.dat$est))
# diversity indices
diversity.inds <- list(Shannon = rarefaction.mat, inv.simpson = rarefaction.mat, BP = rarefaction.mat)
for (i in seq_len(bootstraps)) {
  sub.OTUmat <- rrarefy(otu_table(Ps_obj_filt), min_lib_size)
  diversity.inds$Shannon[, i] <- diversityresult(sub.OTUmat, index = 'Shannon', method = 'each site', digits = 3)[, 1]
  diversity.inds$inv.simpson[, i] <- diversityresult(sub.OTUmat, index = 'inverseSimpson', method = 'each site', digits = 3)[, 1]
  diversity.inds$BP[, i] <- diversityresult(sub.OTUmat, index = 'Berger', method = 'each site', digits = 3)[, 1]
}
Diversity <- data.frame(row.names = row.names(diversity.inds[[1]]))
for (i in seq_along(diversity.inds)) {
  S <- apply(diversity.inds[[i]], 1, mean)
  se <- apply(diversity.inds[[i]], 1, std_err)
  Diversity <- cbind(Diversity, S, se)
}
colnames(Diversity) <- c("Shannon", "Shannon.se", "Inv.simpson", "Inv.simpson.se", "BP", "BP.se")
ses <- grep("\\.se", colnames(Diversity))
Diversity[, ses] %>% gather(key = "est.se") -> se.dat
Diversity[, -unique(ses)] %>% gather(key = "est") -> mean.dat
saveRDS(Diversity, file = paste0("./Results/", Proj_name, "_diversity.RDS"))
write.csv(Diversity, file = paste0("./Results/", Proj_name, "_diversity.csv"))
```

#### Test the differences in alpha diversity.
```{r test alpha, cache=T}
# Test the effect of Location and Rock.type on each alpha diversity metric.
# For every metric the same sequence is run: fit a model with TestAlphaV3()
# (defined elsewhere in this file), compute estimated marginal means for the
# Location x Rock.type combinations, run Tukey-adjusted pairwise contrasts,
# derive compact letter displays, tabulate the ANOVA with an effect size and
# draw an interaction plot.
# make combined richness diversity
Richness_Diversity <- cbind(Richness, Diversity)
# split the table into SE columns and estimate columns, both in long format
ses <- grep("\\.se", colnames(Richness_Diversity))
Richness_Diversity[, ses] %>% 
  gather(key = "est.se") -> 
  se.dat
Richness_Diversity[, -unique(ses)] %>% 
  gather(key = "Metric", 
         value = "Estimate") -> 
  mean.dat
# one row per sample and metric, with estimate +/- SE as lower/upper bounds
Richness_Diversity_long <-
  cbind(
    Sample = rep(rownames(Richness_Diversity), times = length(unique(mean.dat$Metric))),
    mean.dat,
    lerr = mean.dat$Estimate - se.dat$value,
    herr = mean.dat$Estimate + se.dat$value
  )
# readable metric labels, in plotting order
Richness_Diversity_long$Metric <-
  factor(
    Richness_Diversity_long$Metric,
    levels = c("S.obs", "S.chao1", "S.ACE", "Shannon", "Inv.simpson", "BP"),
    labels = c("S obs.", "Chao1", "ACE", "Shannon", "Inv. Simpson" , "Berger Parker")
  )
# append the sample metadata
# NOTE(review): relies on cbind recycling the metadata rows once per metric,
# in the same sample order as the long table - confirm
Richness_Diversity_long %<>%
  cbind(., 
        sample_data(Ps_obj_filt))

# S Obs
(mod_obsS <- TestAlphaV3(filter(Richness_Diversity_long, Metric == "S obs.")))
# Post-hoc test
marginal <- emmeans(mod_obsS,
                   ~ Location : Rock.type)
summary(marginal)
contrast(marginal, 
         method = "pairwise", 
         adjust = "tukey")
# compact letter display; the letters are reused in the alpha diversity plot
(obsS_pairwise <- cld(marginal,
                      alpha = 0.05,
                      Letters = letters,
                      adjust = "tukey")) # works with lm but not with two-factor ART

# NOTE(review): the column is named "Part Eta Sq" but the denominator is the
# sum over all rows of the ANOVA table - confirm this is the intended effect size
(mod_obsS %>% 
  anova() %>% 
  mutate(`Part Eta Sq`=`Sum Sq`/sum(`Sum Sq`) ) ->
  mod_obsS_ANOVA)
# pwpp(marginal) # Pairwise P-value plot. Fails for unbalanced design
emmip(mod_obsS, Location ~ Rock.type)
# summary(as.glht(pairs(marginal))) # fails because of unbalanced design

# Shannon
(mod_Shannon <- TestAlphaV3(filter(Richness_Diversity_long, Metric == "Shannon")))
# Post-hoc test
marginal <- emmeans(mod_Shannon,
                   ~ Location : Rock.type)
summary(marginal)
contrast(marginal, 
         method = "pairwise", 
         adjust = "tukey")
(Shannon_pairwise <- cld(marginal,
                      alpha = 0.05,
                      Letters = letters,
                      adjust = "tukey")) # works with lm but not with two-factor ART

(mod_Shannon %>% 
  anova() %>% 
  mutate(`Part Eta Sq`=`Sum Sq`/sum(`Sum Sq`) ) ->
  mod_Shannon_ANOVA)
# pwpp(marginal) # Pairwise P-value plot. Fails for unbalanced design
emmip(mod_Shannon, Location ~ Rock.type)

# ACE
(mod_ACE <- TestAlphaV3(filter(Richness_Diversity_long, Metric == "ACE")))
# Post-hoc test
marginal <- emmeans(mod_ACE,
                   ~ Location : Rock.type)
summary(marginal)
contrast(marginal, 
         method = "pairwise", 
         adjust = "tukey")
(ACE_pairwise <- cld(marginal,
                      alpha = 0.05,
                      Letters = letters,
                      adjust = "tukey")) # works with lm but not with two-factor ART

(mod_ACE %>% 
  anova() %>% 
  mutate(`Part Eta Sq`=`Sum Sq`/sum(`Sum Sq`) ) ->
  mod_ACE_ANOVA)
# pwpp(marginal) # Pairwise P-value plot. Fails for unbalanced design
emmip(mod_ACE, Location ~ Rock.type)
# summary(as.glht(pairs(marginal))) # fails because of unbalanced design

#Inv. Simpson
(mod_InvSim <- TestAlphaV3(filter(Richness_Diversity_long, Metric == "Inv. Simpson")))
# Post-hoc test
marginal <- emmeans(mod_InvSim,
                   ~ Location : Rock.type)
summary(marginal)
contrast(marginal, 
         method = "pairwise", 
         adjust = "tukey")
(InvSim_pairwise <- cld(marginal,
                      alpha = 0.05,
                      Letters = letters,
                      adjust = "tukey")) # works with lm but not with two-factor ART

(mod_InvSim %>% 
  anova() %>% 
  mutate(`Part Eta Sq`=`Sum Sq`/sum(`Sum Sq`) ) ->
  mod_InvSim_ANOVA)
# pwpp(marginal) # Pairwise P-value plot. Fails for unbalanced design
emmip(mod_InvSim, Location ~ Rock.type)
# summary(as.glht(pairs(marginal))) # fails because of unbalanced design


#Berger Parker
(mod_BP <- TestAlphaV3(filter(Richness_Diversity_long, Metric == "Berger Parker")))
# Post-hoc test
marginal <- emmeans(mod_BP,
                   ~ Location : Rock.type)
summary(marginal)
contrast(marginal, 
         method = "pairwise", 
         adjust = "tukey")
(BP_pairwise <- cld(marginal,
                      alpha = 0.05,
                      Letters = letters,
                      adjust = "tukey")) # works with lm but not with two-factor ART

(mod_BP %>% 
  anova() %>% 
  mutate(`Part Eta Sq`=`Sum Sq`/sum(`Sum Sq`) ) ->
  mod_BP_ANOVA)
# pwpp(marginal) # Pairwise P-value plot. Fails for unbalanced design
emmip(mod_BP, Location ~ Rock.type)
# summary(as.glht(pairs(marginal))) # fails because of unbalanced design
```

#### Plot all alpha diversity metrics together
```{r plot alpha, cache=T, fig.width=10, fig.height=6, fig.cap=""}
# Violin + jitter plot of the alpha diversity metrics (one facet row per
# metric, one facet column per rock type), annotated with the compact letter
# displays from the post-hoc tests.
# Drop Chao1 and ACE, rename and order the remaining metrics for plotting.
# Resulting level order: Observed S, Inv. Simpson, Shannon, Berger Parker
Richness_Diversity_long %>% 
  dplyr::filter(!Metric %in% c("Chao1", "ACE")) %>% 
    mutate_at(., "Metric", ~fct_recode(., "Observed S" = "S obs.", "Inv. Simpson" = "Inv. Simpson", "Berger Parker" = "Berger Parker")) %>% 
  mutate_at(., "Metric", ~fct_relevel(., "Observed S", "Inv. Simpson", "Shannon", "Berger Parker")) %>% 
  droplevels() ->
  Richness_Diversity_long2plot

p_alpha <- ggplot() +
  geom_violin(data = Richness_Diversity_long2plot,
             aes(
               x = Location,
               y = Estimate,
               ymin = lerr,
               ymax = herr
             ), colour = "grey",
              fill = "grey",
              alpha = 1 / 3) +
  geom_jitter2(data = Richness_Diversity_long2plot,
               aes(x = Location,
               y = Estimate,
               ymin = lerr,
               ymax = herr,
               colour = Location), size = 3, width = 0.2, alpha = 2/3) +
  scale_colour_manual(values = Gradient.colours[c(5, 6, 11)], name = "") +
  # geom_errorbar(alpha = 1 / 2, width = 0.3) +
  xlab("") +
  ylab("") +
  theme(axis.text.x = element_text(
    angle = 45,
    vjust = 0.9,
    hjust = 1
  ),
  legend.position="none") +
  facet_grid(Metric ~ Rock.type, scale = "free") +
  theme(strip.text = element_text(size = f_size - 4)) +
  background_grid(major = "y",
                  minor = "none",
                  size.major = 0.8) 

# Significance letters for each facet. The pairwise results MUST be listed in
# the same order as levels(Richness_Diversity_long2plot$Metric), i.e.
# Observed S, Inv. Simpson, Shannon, Berger Parker; otherwise the letters and
# y positions land on the wrong metric (Inv. Simpson and Shannon were
# previously swapped).
# NOTE(review): [1:4] assumes exactly four Location x Rock.type groups - confirm
dat_text <- data.frame(
  label = str_remove_all(c(obsS_pairwise$.group[1:4], 
                           InvSim_pairwise$.group[1:4],
                           Shannon_pairwise$.group[1:4], 
                           BP_pairwise$.group[1:4]), 
                         pattern = " "),
  Metric = fct_inorder(rep(levels(Richness_Diversity_long2plot$Metric), each = 4)),
  Rock.type = fct_c(obsS_pairwise$Rock.type[1:4], 
                           InvSim_pairwise$Rock.type[1:4],
                           Shannon_pairwise$Rock.type[1:4], 
                           BP_pairwise$Rock.type[1:4]), 
  x = fct_c(obsS_pairwise$Location[1:4], 
                           InvSim_pairwise$Location[1:4],
                           Shannon_pairwise$Location[1:4], 
                           BP_pairwise$Location[1:4]),
  # x     = as.factor(levels(Richness_Diversity_long2plot$Climate.Source)),
  # hand-picked label heights, one per metric in level order
  y = rep(c(520, 45, 5.2, 0.52), each = 4)
  # y = rep(c(40, 140, 0.5), each = 6)
)
p_alpha <- p_alpha + geom_text(
  data = dat_text,
  mapping = aes(x = x, y = y, label = label),
  nudge_x = 0,
  nudge_y = 0
)
print(p_alpha)
```

### Beta diversity
Calculate and plot beta diversity metrics.
```{r ADONIS full, cache=T}
# Permutational MANOVA on the GMPR-normalised OTU table (method = "horn",
# 9999 permutations), followed by BH-adjusted pairwise comparisons with
# PairwiseAdonis() (defined elsewhere in this file).
# Full model: Location, Rock.type and their interaction
(mod1 <-  adonis(
  otu_table(Ps_obj_filt_GMPR) ~ Location * Rock.type,
  data = as(sample_data(Ps_obj_filt_GMPR), "data.frame"),
  method = "horn",
  permutations = 9999
))

# Reduced model: Location only
(mod2 <- adonis(
  otu_table(Ps_obj_filt_GMPR) ~ Location,
  data = as(sample_data(Ps_obj_filt_GMPR), "data.frame"),
  method = "horn",
  permutations = 9999
))

# Pairwise comparisons between locations
mod1_pairwise <- PairwiseAdonis(
  otu_table(Ps_obj_filt_GMPR),
  sample_data(Ps_obj_filt_GMPR)$Location,
  sim.function = "vegdist",
  sim.method = "horn",
  p.adjust.m = "BH"
)
print(mod1_pairwise)
# pairs with an adjusted p-value below 0.05
(sig_pairs1 <- list(as.character(mod1_pairwise$pairs[mod1_pairwise$p.adjusted < 0.05])))

# Pairwise comparisons between rock types
mod2_pairwise <- PairwiseAdonis(
  otu_table(Ps_obj_filt_GMPR),
  sample_data(Ps_obj_filt_GMPR)$Rock.type,
  sim.function = "vegdist",
  sim.method = "horn",
  p.adjust.m = "BH"
)
print(mod2_pairwise)
(sig_pairs2 <- list(as.character(mod2_pairwise$pairs[mod2_pairwise$p.adjusted < 0.05])))

# Pairwise comparisons between Location x Rock.type combinations
mod3_pairwise <- PairwiseAdonis(
  otu_table(Ps_obj_filt_GMPR),
  sample_data(Ps_obj_filt_GMPR)$Location.rock,
  sim.function = "vegdist",
  sim.method = "horn",
  p.adjust.m = "BH"
)
print(mod3_pairwise)
# NOTE(review): this cut-off is 0.1 while the two above use 0.05 - confirm it is intentional
(sig_pairs3 <- list(as.character(mod3_pairwise$pairs[mod3_pairwise$p.adjusted < 0.1])))
```

```{r ADONIS phylo, cache=T}
# Permutational MANOVA on phylogenetic distances.
# Weighted, normalised UniFrac distances computed on the filtered
# (non-GMPR-normalised) object
Unifrac_mat <- UniFrac(Ps_obj_filt, 
                       weighted = TRUE, 
                       normalized = TRUE, 
                       parallel = TRUE, 
                       fast = TRUE)

# NOTE(review): the left-hand side is already a distance object, so
# `method = "horn"` presumably has no effect here - confirm against the
# vegan documentation for adonis.
# mod1 and mod2 overwrite the models of the same name fitted on the OTU table.
(mod1 <-  adonis(
  Unifrac_mat ~ Location * Rock.type,
  data = as(sample_data(Ps_obj_filt_GMPR), "data.frame"),
  method = "horn",
  permutations = 9999
))

(mod2 <- adonis(
  Unifrac_mat ~ Location,
  data = as(sample_data(Ps_obj_filt_GMPR), "data.frame"),
  method = "horn",
  permutations = 9999
))
```
The difference between city and slope is significant based on the Morisita-Horn distances between OTUs but not based on UniFrac.

##### Calculate ordinations
```{r ordinate all, cache=T, fig.height=10}
# Constrained ordination (CAP) of the GMPR-normalised data on Horn distances
Ps_obj_ord1 <- ordinate(Ps_obj_filt_GMPR, "CAP", "horn", formula = Ps_obj_filt_GMPR ~  Location * Rock.type)
Ps_obj_ord2 <- ordinate(Ps_obj_filt_GMPR, "CAP", "horn", formula = Ps_obj_filt_GMPR ~  Location)

# Percentage of the summed eigenvalues carried by each axis of the
# Location-only model (the one that is plotted), rounded to one decimal
explained <- eigenvals(Ps_obj_ord2)/sum( eigenvals(Ps_obj_ord2)) * 100
explained <- as.numeric(format(round(explained, 1), nsmall = 1))

# Sample scores together with the sample metadata
Ps_obj_filt_GMPR %>% 
  plot_ordination(., Ps_obj_ord2, type = "samples", shape = "Rock.type", color = "Location", justDF = TRUE) -> 
  ord_df

p_ord <- ggplot(ord_df,
             aes(
               x = CAP1,
               y = MDS1,
               shape = Rock.type,
               color = Location
             )) +
  # 90% t-distribution ellipse per location
  stat_ellipse(
    aes(x = CAP1, 
        y = MDS1, 
        fill = Location
        ),
    geom = "polygon",
    alpha = 1/4,
    type = "t",
    level = 0.9,
    # linetype = 2,
    inherit.aes = FALSE
  ) +
  geom_point2(size = 4, alpha = 2 / 3) +
  guides(colour = guide_legend(title = "Location"), shape = guide_legend(title = "Rock.type")) +
  scale_colour_manual(values = Gradient.colours) +
  scale_fill_manual(values = Gradient.colours, guide = "none") +
  # the y axis shows MDS1 (the second eigenvalue), so it is labelled as such
  # (it was previously mislabelled "CAP2")
  labs(x = sprintf("CAP1 (%s%%)", explained[1]), 
       y = sprintf("MDS1 (%s%%)", explained[2])) +
  # scale the axes by the share of variation they represent
  coord_fixed(ratio = sqrt(explained[2] / explained[1])) +
   theme(legend.justification = "top")
  # facet_wrap(. ~ Rock.type)
print(p_ord)
```

```{r ordinate phylo, cache=T, fig.height=10}
# Principal coordinates analysis (PCoA) on UniFrac distances of the filtered data
# NOTE(review): the ADONIS test uses weighted UniFrac; confirm which variant
# the "Unifrac" method string selects here, and whether `formula` has any
# effect for an unconstrained PCoA.
Ps_obj_ord1 <- ordinate(Ps_obj_filt, "PCoA", "Unifrac", formula = Ps_obj_filt ~ Location * Rock.type)
Ps_obj_ord2 <- ordinate(Ps_obj_filt, "PCoA", "Unifrac", formula = Ps_obj_filt ~ Location)

# Percentage of the relative eigenvalues carried by each axis, rounded to one decimal
explained <- Ps_obj_ord2$values$Relative_eig/sum(Ps_obj_ord2$values$Relative_eig) * 100
explained <- as.numeric(format(round(explained, 1), nsmall = 1))

# Sample scores together with the sample metadata
Ps_obj_filt %>% 
  plot_ordination(., Ps_obj_ord2, type = "samples", shape = "Rock.type", color = "Location", justDF = TRUE) -> 
  ord_df

p_ord_phylo <- ggplot(ord_df,
             aes(
               x = Axis.1,
               y = Axis.2,
               shape = Rock.type,
               color = Location
             )) +
  # 90% t-distribution ellipse per location
  stat_ellipse(
    aes(x = Axis.1, 
        y = Axis.2, 
        fill = Location
        ),
    geom = "polygon",
    alpha = 1/4,
    type = "t",
    level = 0.9,
    # linetype = 2,
    inherit.aes = FALSE
  ) +
  geom_point2(size = 4, alpha = 2 / 3) +
  # theme_bw(base_size = f_size) +
  guides(colour = guide_legend(title = "Location"), shape = guide_legend(title = "Rock.type")) +
  scale_colour_manual(values = Gradient.colours) +
  scale_fill_manual(values = Gradient.colours, guide = "none") +
  # these are PCoA axes (they were previously mislabelled "CAP1"/"CAP2")
  labs(x = sprintf("PCoA1 (%s%%)", explained[1]), 
       y = sprintf("PCoA2 (%s%%)", explained[2])) +
  # scale the axes by the share of variation they represent
  coord_fixed(ratio = sqrt(explained[2] / explained[1])) #+ 
  # facet_wrap(. ~ Rock.type)
print(p_ord_phylo)
```

#### Test differences between samples on the phylum level
STAMPR analysis of the differences of each phylum between locations using an Aligned Rank Transformed ANOVA test and post-hoc estimated marginal means.
```{r STAMPR, fig.width=12, fig.height=6, cache=T, results="hide"}
# Per-taxon tests and STAMP-style plots. STAMPR2() and plotSTAMPR() are
# defined elsewhere in this file.
# Test 1: differences between locations
Taxa_tests_phylum1 <- STAMPR2(Ps_obj_filt, vars2test = "Location", threshold = 0.05, outputfile = paste0(Proj_name, "_Location"))

# plot the "Slope - City" comparison
pSTAMPR1 <- plotSTAMPR(Taxa_tests_phylum1, pair = "Slope - City", f_size = f_size)
print(pSTAMPR1)

# Test 2: differences between Location x Rock.type combinations
Taxa_tests_phylum2 <- STAMPR2(Ps_obj_filt, vars2test = c("Location", "Rock.type"), threshold = 0.05, outputfile = paste0(Proj_name, "_Location_Rock"))

# plot the comparison of the two locations within chalk
pSTAMPR2 <- plotSTAMPR(Taxa_tests_phylum2, pair = "Slope:Chalk - City:Chalk", f_size = f_size)
print(pSTAMPR2)
```

### Taxonomic distribution analysis  
Agglomerate data and tag rare taxa
```{r agglomarate rares, cache=T}
# Convert the GMPR-normalised counts to relative abundance in percent and
# agglomerate to the phylum level
Ps_obj_filt_GMPR %>% 
  transform_sample_counts(., function(x) x / sum(x) * 100) %>% 
  tax_glom(., 
           "Phylum",
           NArm = TRUE) ->
  Ps_obj_filt_GMPR_glom

Ps_obj_filt_GMPR_glom_DF <- speedyseq::psmelt(Ps_obj_filt_GMPR_glom)
Ps_obj_filt_GMPR_glom_DF$Phylum %<>% as.character()
# Ps_obj_filt3_glom_DF %<>% mutate(Species = fct_relevel(Species, "NA", after = Inf))

# group dataframe by Phylum, calculate mean rel. abundance (%) across samples
Ps_obj_filt_GMPR_glom_DF %>%
  group_by(Phylum) %>%
  summarise(Sum = sum(Abundance) / nsamples(Ps_obj_filt_GMPR_glom) ) ->
  Sums

# find phyla whose mean rel. abund. is at most 0.05 on the percent scale
# NOTE(review): abundances are in %, so this threshold is 0.05%, not 5% -
# confirm which cut-off is intended
Rare_phyla0.05 <- Sums[Sums$Sum <= 0.05, ]$Phylum

# change their name to "Rare"
# (element-wise assignment also works when no phylum is rare)
Ps_obj_filt_GMPR_glom_DF$Phylum[Ps_obj_filt_GMPR_glom_DF$Phylum %in% Rare_phyla0.05] <- "Rare"
# re-group
Ps_obj_filt_GMPR_glom_DF %>%
  group_by(Sample, Phylum, Location, Rock.type, Location.rock) %>%
  summarise(Abundance = sum(Abundance)) ->
  Ps_obj_filt_GMPR_glom_DF_2plot

# ab.taxonomy$Freq <- sqrt(ab.taxonomy$Freq)
# Ps_obj_filt3_glom_rel_DF$Phylum %<>% sub("unclassified", "Unclassified", .)
# Ps_obj_filt3_glom_rel_DF$Phylum %<>% sub("uncultured", "Unclassified", .)

# percentage of each sample's reads that fall into the "Rare" group
Ps_obj_filt_GMPR_glom_DF_2plot %>% 
  group_by(Sample) %>% 
  filter(Phylum == "Rare") %>% 
  summarise(`Rares (%)` = sum(Abundance)) -> 
  Rares
```
Summarise taxonomy
```{r summarise taxonomy, cache=T}
# Percentage of reads classified as rare 
Rares %>%
  kable(., digits = c(2), caption = "Percentage of reads per sample classified as rare:") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)

sample_order <- match(Rares$Sample, row.names(sample_data(Ps_obj_filt_GMPR_glom)))
Rares %<>% arrange(., sample_order)

Rares %>% 
  cbind(., sample_data(Ps_obj_filt_GMPR_glom)) %>% 
  group_by(Location.rock) %>% 
  setNames(make.names(names(.), unique = TRUE)) %>% # fails for some reason without it
  summarise(`Rares (%)` = mean(`Rares....`)) -> 
  Rares_merged

# Percentage of reads classified as rare 
Rares_merged %>%
  kable(., digits = c(2), caption = "Percentage of reads per sample classified as rare:") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)
```
Plot taxonomy box-plot
```{r taoxonomy_boxplot, cache=T}
# Rank the phyla by their abundance summed over all samples (largest first)
Taxa_rank <- Ps_obj_filt_GMPR_glom_DF_2plot %>%
  group_by(Phylum) %>%
  summarise(sum.Taxa = sum(Abundance)) %>%
  arrange(desc(sum.Taxa))

# Use that ranking as the factor order, with the pooled "Rare" class last
Ps_obj_filt_GMPR_glom_DF_2plot$Phylum <- fct_relevel(
  factor(Ps_obj_filt_GMPR_glom_DF_2plot$Phylum, levels = Taxa_rank$Phylum),
  "Rare",
  after = Inf
)

# One box per phylum and location (one facet row per location); the individual
# samples are overlaid as jittered points coloured by rock type.
p_taxa_box <-
  ggplot(Ps_obj_filt_GMPR_glom_DF_2plot, aes(x = Phylum, y = Abundance)) +
  geom_boxplot(
    aes(group = interaction(Phylum, Location)),
    position = position_dodge(width = 0.9),
    fatten = 1
  ) +
  geom_point2(
    aes(colour = Rock.type),
    position = position_jitterdodge(dodge.width = 1),
    alpha = 0.5,
    stroke = 0,
    size = 2
  ) +
  facet_grid(Location ~ .) +
  scale_colour_manual(values = Gradient.colours, name = "") +
  guides(colour = guide_legend(override.aes = list(size = 5))) + # larger legend keys
  labs(x = NULL, y = "Relative abundance (%)") +
  # theme_bw()+
  background_grid(major = "xy", minor = "none") +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 0.9, hjust = 0.9),
    # legend anchored to the top-right corner of the plotting area
    legend.position = c(0.99, 0.99),
    legend.justification = c("right", "top"),
    legend.box.just = "top",
    legend.margin = margin(0, 3, 3, 3)
  )
print(p_taxa_box)
```

### Differential abundance models
Tag rare phyla (for plotting purposes only)
```{r tag rare phyla, cache=T}
# Agglomerate the counts to the Phylum level
Ps_obj_filt_GMPR_glom <- tax_glom(Ps_obj_filt_GMPR, "Phylum", NArm = TRUE)

# Per-sample relative abundance, expressed as a fraction (0-1)
Ps_obj_filt_GMPR_glom_rel <- transform_sample_counts(
  Ps_obj_filt_GMPR_glom,
  function(x) x / sum(x)
)

# Long data frame with Phylum as character (needed to assign the "Rare" label)
Ps_obj_filt_GMPR_glom_rel_DF <- speedyseq::psmelt(Ps_obj_filt_GMPR_glom_rel)
Ps_obj_filt_GMPR_glom_rel_DF$Phylum <- as.character(Ps_obj_filt_GMPR_glom_rel_DF$Phylum)

# Per phylum: summed relative abundance divided by the number of samples
Sums <- Ps_obj_filt_GMPR_glom_rel_DF %>%
  group_by(Phylum) %>%
  summarise(Sum = sum(Abundance) / nsamples(Ps_obj_filt_GMPR_glom))

# Phyla at or below the cut-off of 0.05; abundances are fractions here, so
# this corresponds to 5%
Rare_phyla0.05 <- Sums$Phylum[Sums$Sum <= 0.05]

# Pool those phyla under the label "Rare"
Ps_obj_filt_GMPR_glom_rel_DF$Phylum[Ps_obj_filt_GMPR_glom_rel_DF$Phylum %in% Rare_phyla0.05] <- "Rare"

# Rank the (pooled) phyla by their summed relative abundance, largest first
Taxa_rank <- Ps_obj_filt_GMPR_glom_rel_DF %>%
  group_by(Phylum) %>%
  summarise(Abundance = sum(Abundance)) %>%
  arrange(desc(Abundance))
```

Detect differentially abundant OTUs using corncob [@martin_modeling_2020]
```{r corncob - location, cache=T}
comparison_string <- c("City", "Slope")

# Order-level agglomeration of the samples from the two compared locations.
# NOTE(review): this object is not used anywhere else in this chunk (the
# models below are fitted on Ps_obj_filt); kept in case other chunks need it.
Ps_obj_filt_pairwise_glom <- Ps_obj_filt %>%
  subset_samples(Location %in% c(comparison_string[1], comparison_string[2])) %>%
  tax_glom("Order")

# Test differential abundance for location: the null model drops Location
# from the abundance formula only; the dispersion formula is identical in
# both models.
da_Loc <- differentialTest(formula = ~ Location,
                           phi.formula = ~ Location,
                           formula_null = ~ 1,
                           phi.formula_null = ~ Location,
                           test = "Wald", boot = FALSE,
                           data = Ps_obj_filt,
                           fdr_cutoff = 0.05,
                           full_output = TRUE)
# Interval data for the significant taxa (data only, no plot is drawn)
da_Loc_intervals <- plot(da_Loc, level = "Class", data_only = TRUE)
# Taxa for which no p-value could be computed
names(which(is.na(da_Loc$p)))

# Mean relative abundance (%) of every OTU across all samples
baseMean <- Ps_obj_filt %>%
  transform_sample_counts(., function(x) x / sum(x) * 100) %>%
  taxa_sums(.) %>%
  map_dbl(~ (.x / nsamples(Ps_obj_filt))) %>%
  enframe(name = "OTU", value = "Mean abundance (%)")

# From every model summary take element 15 and, within it, element 2 (per
# the original author: the mu.LocationSlope estimate, i.e. the difference in
# estimated population relative abundance). Models where that element is
# missing yield NA instead of being dropped, so the vector always has one
# entry per taxon and stays aligned with taxa_names().
da_Loc_df <- da_Loc$all_models %>%
  map_dbl(~ pluck(.x, 15, 2, .default = NA_real_)) %>%
  bind_cols(OTU = taxa_names(Ps_obj_filt),
            tax_table(Ps_obj_filt),
            `Differential abundance` = .,
            Significance = fct_recode(as_factor(taxa_names(Ps_obj_filt) %in% da_Loc$significant_taxa), Pass = "TRUE", Fail = "FALSE"),
            ymin = as.numeric(NA),
            ymax = as.numeric(NA)
            ) %>%
  left_join(., baseMean, by = "OTU")

# Fill in the interval bounds for the significant taxa only
da_Loc_df %<>% rows_update(.,
                           tibble(OTU = da_Loc$significant_taxa,
                                  ymin = da_Loc_intervals$xmin,
                                  ymax = da_Loc_intervals$xmax),
                           by = "OTU")
# Pool the phyla listed in Rare_phyla0.05 under "Rare" (plotting only)
da_Loc_df[da_Loc_df$Phylum %in% Rare_phyla0.05, "Phylum"] <- 'Rare'

p_corncob_loc <- GGPlotCorncob(da_Loc_df, OTU_labels = FALSE, Taxa = "Phylum", Y_val = "Differential abundance", sig_level = 0.05, Rank = Taxa_rank)

# Label: number of significant taxa with a positive / negative estimate and
# (in brackets) the total number of taxa
corncob_summary <- tibble(Label = c(paste0("⬆", sum(da_Loc_df$`Differential abundance` > 0 &  da_Loc_df$Significance == "Pass"), " ⬇", sum(da_Loc_df$`Differential abundance` < 0 &  da_Loc_df$Significance == "Pass"), " (", nrow(da_Loc_df), ")")))

p_corncob_loc <- p_corncob_loc +
  labs(title = paste(comparison_string, collapse = " - ")) +
  coord_cartesian(ylim = c(-10, 10))
print(p_corncob_loc)

write.csv(da_Loc_df, file = paste0("./Results/corncob_", comparison_string[1], "_vs_", comparison_string[2], ".csv"))
```

Modelling differential abundance and variance between locations discovered `r length(da_Loc$significant_taxa)` differentially abundant OTUs.

```{r corncob - rock, cache=T}
comparison_string <- c("Limestone", "Chalk")
# Test differential abundance for rock type: the null model drops Rock.type
# from the abundance formula only; the dispersion formula is identical in
# both models.
da_Rock <- differentialTest(formula = ~ Rock.type,
                            phi.formula = ~ Rock.type,
                            formula_null = ~ 1,
                            phi.formula_null = ~ Rock.type,
                            test = "Wald", boot = FALSE,
                            data = Ps_obj_filt,
                            fdr_cutoff = 0.05,
                            full_output = TRUE)
# Interval data for the significant taxa (data only, no plot is drawn)
da_Rock_intervals <- plot(da_Rock, level = "Class", data_only = TRUE)
# Taxa for which no p-value could be computed
names(which(is.na(da_Rock$p)))

# From every model summary take element 15 and, within it, element 2
# (presumably the estimate of the Rock.type abundance coefficient -- the
# original comment, copied from the location chunk, named mu.LocationSlope).
# Models where that element is missing yield NA instead of being dropped, so
# the vector always has one entry per taxon and stays aligned with
# taxa_names().
da_Rock_df <- da_Rock$all_models %>%
  map_dbl(~ pluck(.x, 15, 2, .default = NA_real_)) %>%
  bind_cols(OTU = taxa_names(Ps_obj_filt),
            tax_table(Ps_obj_filt),
            `Differential abundance` = .,
            Significance = fct_recode(as_factor(taxa_names(Ps_obj_filt) %in% da_Rock$significant_taxa), Pass = "TRUE", Fail = "FALSE"),
            ymin = as.numeric(NA),
            ymax = as.numeric(NA)
            ) %>%
  left_join(., baseMean, by = "OTU")

# Fill in the interval bounds for the significant taxa only
da_Rock_df %<>% rows_update(.,
                            tibble(OTU = da_Rock$significant_taxa,
                                   ymin = da_Rock_intervals$xmin,
                                   ymax = da_Rock_intervals$xmax),
                            by = "OTU")
# Pool the phyla listed in Rare_phyla0.05 under "Rare" (plotting only)
da_Rock_df[da_Rock_df$Phylum %in% Rare_phyla0.05, "Phylum"] <- 'Rare'

p_corncob_rock <- GGPlotCorncob(da_Rock_df, OTU_labels = FALSE, Taxa = "Phylum", Y_val = "Differential abundance", sig_level = 0.05, Rank = Taxa_rank)

# Label: number of significant taxa with a positive / negative estimate and
# (in brackets) the total number of taxa
corncob_summary <- tibble(Label = c(paste0("⬆", sum(da_Rock_df$`Differential abundance` > 0 &  da_Rock_df$Significance == "Pass"), " ⬇", sum(da_Rock_df$`Differential abundance` < 0 &  da_Rock_df$Significance == "Pass"), " (", nrow(da_Rock_df), ")")))

p_corncob_rock <- p_corncob_rock +
  labs(title = paste(comparison_string, collapse = " - ")) +
  coord_cartesian(ylim = c(-10, 10))
print(p_corncob_rock)

write.csv(da_Rock_df, file = paste0("./Results/corncob_", comparison_string[1], "_vs_", comparison_string[2], ".csv"))
```

Modelling differential abundance and variance between rock types discovered `r length(da_Rock$significant_taxa)` differentially abundant OTUs.

```{r corncob - loc exl. rock, cache=T}
# Test differential abundance for location while controlling for Rock.type:
# Rock.type is part of both the full and the null abundance formula, and the
# dispersion formula is identical in both models.
comparison_string <- c("City", "Slope")
da_Loc_exRock <- differentialTest(formula = ~ Location + Rock.type,
                                  phi.formula = ~ Location + Rock.type,
                                  formula_null = ~ Rock.type,
                                  phi.formula_null = ~ Location + Rock.type,
                                  test = "Wald", boot = FALSE,
                                  data = Ps_obj_filt,
                                  fdr_cutoff = 0.05,
                                  full_output = TRUE)
# Interval data for the significant taxa (data only, no plot is drawn)
da_Loc_exRock_intervals <- plot(da_Loc_exRock, level = "Class", data_only = TRUE)

# Taxa for which no p-value could be computed
names(which(is.na(da_Loc_exRock$p)))

# From every model summary take element 15 and, within it, element 2 (per
# the original author: the mu.LocationSlope estimate, i.e. the difference in
# estimated population relative abundance). Models where that element is
# missing yield NA instead of being dropped, so the vector always has one
# entry per taxon and stays aligned with taxa_names().
da_Loc_exRock_df <- da_Loc_exRock$all_models %>%
  map_dbl(~ pluck(.x, 15, 2, .default = NA_real_)) %>%
  bind_cols(OTU = taxa_names(Ps_obj_filt),
            tax_table(Ps_obj_filt),
            `Differential abundance` = .,
            Significance = fct_recode(as_factor(taxa_names(Ps_obj_filt) %in% da_Loc_exRock$significant_taxa), Pass = "TRUE", Fail = "FALSE"),
            ymin = as.numeric(NA),
            ymax = as.numeric(NA)
            ) %>%
  left_join(., baseMean, by = "OTU")

# Fill in the interval bounds for the significant taxa only
da_Loc_exRock_df %<>% rows_update(.,
                                  tibble(OTU = da_Loc_exRock$significant_taxa,
                                         ymin = da_Loc_exRock_intervals$xmin,
                                         ymax = da_Loc_exRock_intervals$xmax),
                                  by = "OTU")
# Pool the phyla listed in Rare_phyla0.05 under "Rare" (plotting only)
da_Loc_exRock_df[da_Loc_exRock_df$Phylum %in% Rare_phyla0.05, "Phylum"] <- 'Rare'

p_corncob_locExroc <- GGPlotCorncob(da_Loc_exRock_df, OTU_labels = FALSE, Taxa = "Phylum", Y_val = "Differential abundance", sig_level = 0.05, Rank = Taxa_rank)

# Label: number of significant taxa with a positive / negative estimate and
# (in brackets) the total number of taxa
corncob_summary <- tibble(Label = c(paste0("⬆", sum(da_Loc_exRock_df$`Differential abundance` > 0 &  da_Loc_exRock_df$Significance == "Pass"), " ⬇", sum(da_Loc_exRock_df$`Differential abundance` < 0 &  da_Loc_exRock_df$Significance == "Pass"), " (", nrow(da_Loc_exRock_df), ")")))

p_corncob_locExroc <- p_corncob_locExroc +
  labs(title = paste(comparison_string, collapse = " - ")) +
  coord_cartesian(ylim = c(-10, 10))
print(p_corncob_locExroc)

write.csv(da_Loc_exRock_df, file = paste0("./Results/corncob_", comparison_string[1], "_vs_", comparison_string[2], "_ExRockType.csv"))
```

Modelling differential abundance between locations, while controlling for rock type, discovered `r length(da_Loc_exRock$significant_taxa)` differentially abundant OTUs.

```{r plot OTU 260,  cache=T}
# Beta-binomial models for a single taxon (OTU260), from simplest to richest.
# Intercept-only model (no covariates for abundance or dispersion)
mod260 <- bbdml(formula = OTU260 ~ 1,
                phi.formula = ~ 1,
                data = Ps_obj_filt)
# Abundance and dispersion both depend on Location
mod260_Loc <- bbdml(formula = OTU260 ~ Location,
                    phi.formula = ~ Location,
                    data = Ps_obj_filt)
# Location, rock type and their interaction.
# The response was OTU97 here, which did not match the other two models of
# this chunk; it now models OTU260 as well.
mod260_Loc_rock <- bbdml(formula = OTU260 ~ Location*Rock.type,
                         phi.formula = ~ Location*Rock.type,
                         data = Ps_obj_filt)
# Likelihood-ratio test of the Location model against the intercept-only model
lrtest(mod_null = mod260, mod = mod260_Loc)
# lrtest(mod_null = mod260_Loc, mod = mod260_Loc_rock)
summary(mod260_Loc)

plot(mod260_Loc, color = "Location", shape = "Rock.type") # add total = TRUE for total counts (i.e. not relative abundance)
```

### Compose figures
```{r Figures, cache=F}
# Earlier layout, kept for reference:
# composite_plot <- ((p_alpha + p_taxa_box +  plot_layout(widths = c(1, 2))) /(p_ord + pSTAMPR1) + plot_annotation(tag_levels = 'A') & theme(plot.tag = element_text(size = f_size)))

# Figure 1: three stacked rows (alpha diversity + ordination / taxonomy
# box-plot / pSTAMPR1), tagged A, B, C, ...
composite_plot <- (p_alpha + p_ord) / (p_taxa_box) / (pSTAMPR1) +
  plot_layout(heights = c(1.5, 1, 1)) +
  plot_annotation(tag_levels = "A") &
  theme(plot.tag = element_text(size = f_size))

plot_file <- "./Results/Microbiome_1"
svg_file <- paste0(plot_file, ".svg")
png_file <- paste0(plot_file, ".png")

# Vector version
svglite(svg_file, width = 10, height = 11)
print(composite_plot)
invisible(dev.off())

# Raster version
agg_png(
  png_file,
  width = 10,
  height = 11,
  units = "cm",
  res = 900,
  scaling = 0.38
)
print(composite_plot)
invisible(dev.off())

# gz() is defined elsewhere in this file; it is given the SVG path and the
# target .svgz path
gz(svg_file, paste0(plot_file, ".svgz"))
knitr::include_graphics(png_file)

# Figure 2: corncob results for location, controlling for rock type
plot_file <- "./Results/Microbiome_2"
svg_file <- paste0(plot_file, ".svg")
png_file <- paste0(plot_file, ".png")

svglite(svg_file, width = 12, height = 10)
print(p_corncob_locExroc)
invisible(dev.off())

# NOTE(review): the PNG height (10.5) differs from the SVG height (10) --
# confirm this is intended.
agg_png(
  png_file,
  width = 12,
  height = 10.5,
  units = "cm",
  res = 900,
  scaling = 0.38
)
print(p_corncob_locExroc)
invisible(dev.off())
gz(svg_file, paste0(plot_file, ".svgz"))
```

### References