Preliminaries

Load the phyloseq package and the PICRUSt data that we have worked with last time.

library(phyloseq)
load('STAT.picrust.RData')

genes
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 6909 taxa and 96 samples ]
## sample_data() Sample Data:       [ 96 samples by 5 sample variables ]
head(pathways)
##       ko                             L1                               L2
## 1 K01365                 Human Diseases           Immune System Diseases
## 2 K01365                     Metabolism                  Enzyme Families
## 3 K01365             Organismal Systems                    Immune System
## 4 K01365             Cellular Processes         Transport and Catabolism
## 5 K01365 Genetic Information Processing Folding, Sorting and Degradation
## 6 K01365             Cellular Processes         Transport and Catabolism
##                                    L3
## 1                Rheumatoid arthritis
## 2                          Peptidases
## 3 Antigen processing and presentation
## 4                           Phagosome
## 5    Chaperones and folding catalysts
## 6                            Lysosome

PCoA

We can perform PCoA analyses, just like we did with 16S rRNA gene abundance data.

genes.jsd = phyloseq::distance(genes, "jsd")

genes.pcoa = ordinate(genes, "PCoA", distance=genes.jsd)
plot_ordination(genes, ordination = genes.pcoa, color="Location", shape="Treatment")

Multivariate analysis

library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-5
adonis(genes.jsd~sample_data(genes)$Treatment)
## 
## Call:
## adonis(formula = genes.jsd ~ sample_data(genes)$Treatment) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##                              Df SumsOfSqs    MeanSqs F.Model      R2
## sample_data(genes)$Treatment  4  0.009535 0.00238366  2.6075 0.10283
## Residuals                    91  0.083188 0.00091415         0.89717
## Total                        95  0.092722                    1.00000
##                              Pr(>F)  
## sample_data(genes)$Treatment  0.072 .
## Residuals                            
## Total                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis(genes.jsd~sample_data(genes)$Location, strata = sample_data(genes)$MouseID)
## 
## Call:
## adonis(formula = genes.jsd ~ sample_data(genes)$Location, strata = sample_data(genes)$MouseID) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##                             Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)
## sample_data(genes)$Location  1  0.072346 0.072346  333.75 0.78024  0.001
## Residuals                   94  0.020376 0.000217         0.21976       
## Total                       95  0.092722                  1.00000       
##                                
## sample_data(genes)$Location ***
## Residuals                      
## Total                          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

What do these results mean?

Univariate differences

genes.clr = transform_sample_counts(genes,
                                    function(x){y=log(x+1); 
                                    y-sum(y)/ntaxa(genes)})
Location = sample_data(genes.clr)$Location
p.values = apply(otu_table(genes.clr), 1, 
                 function(x) t.test(c(x) ~ Location)$p.value)

t.values = apply(otu_table(genes.clr), 1, 
                 function(x) t.test(c(x) ~ Location)$statistic)

q.values = p.adjust(p.values, method="fdr")

plot(sort(q.values), log='y', pch='.')

min(q.values)
## [1] 5.235677e-37

How do you interpret these results?

Pathway enrichment

Compute an example contingency table.

pathway = "Human Diseases"
(ctab = table(names(p.values) %in% pathways$ko[pathways$L1 == pathway], 
             p.values<0.05))
##        
##         FALSE TRUE
##   FALSE   877 5841
##   TRUE     13  178
fisher.test(ctab)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  ctab
## p-value = 0.008395
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.165077 3.954256
## sample estimates:
## odds ratio 
##   2.055665

Define a function to test each level-1-pathway.

test.pathway.level1 = function(pathway, p.values = p.values, threshold=0.05){
  ctab = table(names(p.values) %in% pathways$ko[pathways$L1 == pathway], 
               p.values<threshold)
  fisher.test(ctab)$p.value
}

Perform the analysis for all level 1 pathways.

level1.pvalues = sapply(unique(pathways$L1), test.pathway.level1, p.values=p.values, threshold=0.05)
level1.pvalues
##                       Human Diseases                           Metabolism 
##                         8.395387e-03                         1.673644e-03 
##                   Organismal Systems                   Cellular Processes 
##                         8.662215e-01                         1.000000e+00 
##       Genetic Information Processing                                 None 
##                         8.372768e-13                         2.111145e-07 
##                         Unclassified Environmental Information Processing 
##                         2.220032e-01                         6.819562e-06
p.adjust(level1.pvalues, method='fdr')
##                       Human Diseases                           Metabolism 
##                         1.343262e-02                         3.347288e-03 
##                   Organismal Systems                   Cellular Processes 
##                         9.899675e-01                         1.000000e+00 
##       Genetic Information Processing                                 None 
##                         6.698214e-12                         8.444579e-07 
##                         Unclassified Environmental Information Processing 
##                         2.960043e-01                         1.818550e-05

Metabolism pathway seems to be involved.

Distance-based analysis

Adonis

test.pathway.distance = function(pathway){
  phy.path = prune_taxa(pathways$KO[pathways$L1 == pathway], genes.clr)
  phy.dist = phyloseq::distance(phy.path, method="euclidean") 
  adonis(phy.dist ~ sample_data(phy.path)$Location)$aov.tab[1,6]
}
sapply(unique(pathways$L1), test.pathway.distance)
##                       Human Diseases                           Metabolism 
##                                0.001                                0.001 
##                   Organismal Systems                   Cellular Processes 
##                                0.001                                0.001 
##       Genetic Information Processing                                 None 
##                                0.001                                0.001 
##                         Unclassified Environmental Information Processing 
##                                0.001                                0.001

Tw2

source('https://raw.githubusercontent.com/alekseyenko/Tw2/master/code/Tw2.R')

test.pathway.distance2 = function(pathway){
  phy.path = prune_taxa(pathways$KO[pathways$L1 == pathway], genes.clr)
  phy.dist = phyloseq::distance(phy.path, method="euclidean") 
  WT.test(phy.dist, sample_data(phy.path)$Location)$p.value
}
sapply(unique(pathways$L1), test.pathway.distance2)
##                       Human Diseases                           Metabolism 
##                                0.001                                0.001 
##                   Organismal Systems                   Cellular Processes 
##                                0.001                                0.001 
##       Genetic Information Processing                                 None 
##                                0.001                                0.001 
##                         Unclassified Environmental Information Processing 
##                                0.001                                0.001

Groupping

By median

test.pathway.group = function(pathway){
  phy.path = prune_taxa(pathways$KO[pathways$L1 == pathway], genes.clr)
  t.test(apply(otu_table(phy.path), 2, median) ~ sample_data(phy.path)$Location)$p.value
}
sapply(unique(pathways$L1), test.pathway.group)
##                       Human Diseases                           Metabolism 
##                         2.744858e-08                         2.744858e-08 
##                   Organismal Systems                   Cellular Processes 
##                         2.744858e-08                         2.744858e-08 
##       Genetic Information Processing                                 None 
##                         2.744858e-08                         2.744858e-08 
##                         Unclassified Environmental Information Processing 
##                         2.744858e-08                         2.744858e-08

by mean

test.pathway.group2 = function(pathway){
  phy.path = prune_taxa(pathways$KO[pathways$L1 == pathway], genes.clr)
  t.test(apply(otu_table(phy.path), 2, mean) ~ sample_data(phy.path)$Location)$p.value
}
sapply(unique(pathways$L1), test.pathway.group2)
##                       Human Diseases                           Metabolism 
##                            0.1181401                            0.1181401 
##                   Organismal Systems                   Cellular Processes 
##                            0.1181401                            0.1181401 
##       Genetic Information Processing                                 None 
##                            0.1181401                            0.1181401 
##                         Unclassified Environmental Information Processing 
##                            0.1181401                            0.1181401

GSEA-like analysis

barplot(sort(t.values, decreasing=T), names.arg=NA)

Where do Metabolism genes fall?

pathway.indicator = taxa_names(genes.clr) %in% pathways$ko[pathways$Level1 == "Metabolism"]
sum(pathway.indicator)
## [1] 0
oo = order(t.values, decreasing=T)
barplot(t.values[oo], col=c("black","red")[pathway.indicator+1][oo], border=NA, names.arg = NA)

Calculating and plotting the enrichment score.

ES=-t.values
ES[pathway.indicator] = -ES[pathway.indicator]
ES.o = c(ES[oo[1]], cumsum(ES[oo]))
plot(ES.o, main="Enrichment score")

max(ES.o)
## [1] 12908.37

Permutations

compute.ES = function(statistic, indicator){
  ES=-statistic
  ES[indicator] = -ES[indicator]
  ES.o = c(ES[oo[1]], cumsum(ES[order(statistic, decreasing=T)]))
  max(ES.o)
}

physeq.t.test.statistics = function(phy, f){
  return(apply(otu_table(phy), 1, 
                 function(x) t.test(c(x) ~ f)$statistic))
}

head(physeq.t.test.statistics(genes.clr, Location))
##    K01365    K01364    K01361    K01360    K01362    K02249 
## -6.049952 -6.049952 -3.850435 -6.049952 -5.429629 -6.049952
compute.ES(t.values, pathway.indicator)
## [1] 12908.37
set.seed(4.12)
nrep=1000
#ps = replicate(nrep,
#compute.ES(physeq.t.test.statistics(genes.clr,
#                                    sample(Location)),
#           pathway.indicator))

#sum(ps>=compute.ES(t.values, pathway.indicator))/nrep

This is the permutation-based p-value for gene set enrichment in the Metabolism pathway. The same procedure can be repeated for other pathways.