1 Preliminaries

We will work with the STAT data from the STAT.RData file.

load('STAT.RData')
library(phyloseq); packageVersion("phyloseq")
## [1] '1.24.2'

Remember the PCoA Results from last time

library(ade4); packageVersion("ade4")
## [1] '1.7.11'
## Compute Bray-Curtis distance matrix
bray.dist = phyloseq::distance(phy, "bray")

## PCoA results
bray.pco = dudi.pco(cailliez(bray.dist), scannf=F, nf=2)

s.class(bray.pco$li, sample_data(phy)$Location)

s.class(bray.pco$li, sample_data(phy)$Treatment)

s.class(bray.pco$li, with(sample_data(phy), 
                         interaction(Location,Treatment)))

There seems to be a big difference in terms of Location variable and slightly uncertain differences with respect to Treatment.

2 PERMANOVA

With PERMANOVA is implemented in the vegan library. The function is adonis.

2.1 Vanilla

Let’s first run simple analysis with respect to Location.

library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-2
with(sample_data(phy), adonis(bray.dist ~ Location))
## 
## Call:
## adonis(formula = bray.dist ~ Location) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## Location   1    4.0251  4.0251  43.899 0.31834  0.001 ***
## Residuals 94    8.6188  0.0917         0.68166           
## Total     95   12.6439                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The result is an analysis of variance table that lists pseudo-F statistic, R^2 and significance values.

We can do the same for the Treatment variable.

with(sample_data(phy), 
     adonis(formula = bray.dist ~ Treatment))
## 
## Call:
## adonis(formula = bray.dist ~ Treatment) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model     R2 Pr(>F)    
## Treatment  4     1.521 0.38026   3.111 0.1203  0.001 ***
## Residuals 91    11.123 0.12223         0.8797           
## Total     95    12.644                 1.0000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2.2 Stratified

Notice that statistically the above analyses are suboptimal because they do not take into account the fact that microbiota is measured in two different locations of the same animal. Also note that we have previously decided that Location differences are not as interesting to us, but Treatment differences are. We can account for these considerations by stratifying the permutations. What this means is that the permutations will only occur at the levels of the strata (e.g. animals, or locations), effectively removing the effect of these stratifying variables from the analysis.

Let’s first do the stratified analysis for Location variable accounting for animal from which the samples are taken. We may want to do this because we expect the microbiota to be more similar within each animal than across.

head(sample_data(phy))
##           X.SampleID BarcodeSequence LinkerPrimerSequence Treatment
## cecal_C1    cecal_C1              NA                   NA         C
## cecal_C10  cecal_C10              NA                   NA         C
## cecal_C2    cecal_C2              NA                   NA         C
## cecal_C3    cecal_C3              NA                   NA         C
## cecal_C4    cecal_C4              NA                   NA         C
## cecal_C5    cecal_C5              NA                   NA         C
##           Location         Description
## cecal_C1     cecal missing_description
## cecal_C10    cecal missing_description
## cecal_C2     cecal missing_description
## cecal_C3     cecal missing_description
## cecal_C4     cecal missing_description
## cecal_C5     cecal missing_description

Since the animal identified is not coded we need to parse it out of sample.names. Note that each sample is “coded” as location_animal.

sample_data(phy)$Mouse = sapply(strsplit(sample_names(phy), split = "_"), "[", 2)
head(sample_data(phy))
##           X.SampleID BarcodeSequence LinkerPrimerSequence Treatment
## cecal_C1    cecal_C1              NA                   NA         C
## cecal_C10  cecal_C10              NA                   NA         C
## cecal_C2    cecal_C2              NA                   NA         C
## cecal_C3    cecal_C3              NA                   NA         C
## cecal_C4    cecal_C4              NA                   NA         C
## cecal_C5    cecal_C5              NA                   NA         C
##           Location         Description Mouse
## cecal_C1     cecal missing_description    C1
## cecal_C10    cecal missing_description   C10
## cecal_C2     cecal missing_description    C2
## cecal_C3     cecal missing_description    C3
## cecal_C4     cecal missing_description    C4
## cecal_C5     cecal missing_description    C5

Now we can perform the analysis stratified on the newly created Mouse variable.

with(sample_data(phy), adonis(bray.dist ~ Location, strata = Mouse))
## 
## Call:
## adonis(formula = bray.dist ~ Location, strata = Mouse) 
## 
## Blocks:  strata 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## Location   1    4.0251  4.0251  43.899 0.31834  0.001 ***
## Residuals 94    8.6188  0.0917         0.68166           
## Total     95   12.6439                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Compare these results with what we saw before.

As before we can visualize these results using PCoA with within class analysis.

mouse.wca = wca(bray.pco, factor(sample_data(phy)$Mouse), scannf=F, nf=2)

s.class(mouse.wca$li, sample_data(phy)$Location)

Also note what the effect of wca is on the class factor.

s.class(bray.pco$li, factor(sample_data(phy)$Mouse))

s.class(mouse.wca$li, factor(sample_data(phy)$Mouse))

Similar analysis can be done for the Treatment variable accounting for the Location differences.

with(sample_data(phy), adonis(bray.dist ~ Treatment, strata = Location))
## 
## Call:
## adonis(formula = bray.dist ~ Treatment, strata = Location) 
## 
## Blocks:  strata 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model     R2 Pr(>F)    
## Treatment  4     1.521 0.38026   3.111 0.1203  0.001 ***
## Residuals 91    11.123 0.12223         0.8797           
## Total     95    12.644                 1.0000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
location.wca = wca(bray.pco, sample_data(phy)$Location, scannf=F, nf=2)
s.class(location.wca$li, sample_data(phy)$Treatment)

2.3 Post Hoc Tests

To demonstrate the concept of post hoc tests, we will use just the fecal samples. We will compare the antibiotic groups versus control. First, let’s subset our data and re-compute the distances.

fecal = subset_samples(phy, Location == 'fecal')

p.values = c()
for(lev in levels(sample_data(fecal)$Treatment)){
  if(lev != 'C'){
    ## subset the data to just a pair of levels
    pair.phy = subset_samples(fecal, Treatment %in% c('C', lev))
    ## compute the distance
    fecal.dist = phyloseq::distance(pair.phy, method = "bray")
    print(paste("C vs", lev))
    ## execute PERMANOVA analysis
    adonis.res = adonis(fecal.dist~sample_data(pair.phy)$Treatment)
    print(adonis.res)
    ## save the p-values
    p.values=c(p.values, adonis.res$aov.tab[1,6])
  } 
}
## [1] "C vs P"
## 
## Call:
## adonis(formula = fecal.dist ~ sample_data(pair.phy)$Treatment) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##                                 Df SumsOfSqs  MeanSqs F.Model      R2
## sample_data(pair.phy)$Treatment  1   0.11839 0.118390   2.549 0.13039
## Residuals                       17   0.78957 0.046445         0.86961
## Total                           18   0.90796                  1.00000
##                                 Pr(>F)  
## sample_data(pair.phy)$Treatment  0.019 *
## Residuals                               
## Total                                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "C vs T"
## 
## Call:
## adonis(formula = fecal.dist ~ sample_data(pair.phy)$Treatment) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##                                 Df SumsOfSqs MeanSqs F.Model     R2 Pr(>F)
## sample_data(pair.phy)$Treatment  1   0.44533 0.44533  6.4095 0.2738  0.001
## Residuals                       17   1.18115 0.06948         0.7262       
## Total                           18   1.62648                 1.0000       
##                                    
## sample_data(pair.phy)$Treatment ***
## Residuals                          
## Total                              
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "C vs V"
## 
## Call:
## adonis(formula = fecal.dist ~ sample_data(pair.phy)$Treatment) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##                                 Df SumsOfSqs  MeanSqs F.Model      R2
## sample_data(pair.phy)$Treatment  1   0.11410 0.114099  2.6295 0.12746
## Residuals                       18   0.78106 0.043392         0.87254
## Total                           19   0.89516                  1.00000
##                                 Pr(>F)   
## sample_data(pair.phy)$Treatment  0.004 **
## Residuals                                
## Total                                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "C vs VP"
## 
## Call:
## adonis(formula = fecal.dist ~ sample_data(pair.phy)$Treatment) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##                                 Df SumsOfSqs  MeanSqs F.Model      R2
## sample_data(pair.phy)$Treatment  1   0.19787 0.197872  3.4238 0.17627
## Residuals                       16   0.92469 0.057793         0.82373
## Total                           17   1.12256                  1.00000
##                                 Pr(>F)    
## sample_data(pair.phy)$Treatment  0.001 ***
## Residuals                                 
## Total                                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
names(p.values) = levels(sample_data(fecal)$Treatment)[-1]
p.values
##     P     T     V    VP 
## 0.019 0.001 0.004 0.001
p.adjust(p.values, method='bonf')
##     P     T     V    VP 
## 0.076 0.004 0.016 0.004

Note the large differences in variability between the ‘C’ vs. ‘VP’ and ‘T’ groups. These are suspect to bring about false positives and/or false negatives, especially taking into account the fact that these groups are not balanced:

table(sample_data(fecal)$Treatment)
## 
##  C  P  T  V VP 
## 10  9  9 10  8

3 TW2

We will use Tw2 statistic and test to see if there indeed are differences between C vs. VP and C vs. T. ## Getting the code The code is available at https://github.com/alekseyenko/Tw2. You will need to get Tw2.R file. You can either download it or source it directly from GitHub.

source('https://raw.githubusercontent.com/alekseyenko/Tw2/master/code/Tw2.R')

3.1 Running simple analysis

Now with the necessary functions loaded, we can run simple analyses. First comparing C vs. VP

set.seed(3.15)
## subset the data
CvsVP = subset_samples(fecal, Treatment %in% c("C", "VP"))
## compute distance
CvsVP.dist = distance(CvsVP, method="bray")
WT.test(CvsVP.dist, sample_data(CvsVP)$Treatment, nrep=999)
## $p.value
## [1] 0.004
## 
## $t.stat
##        C 
## 3.189398 
## 
## $nrep
## [1] 999

Can you do the same for C vs. T and all other treatment levels?