Load the phyloseq package and the PICRUSt data that we have worked with last time.
library(phyloseq)
load('STAT.picrust.RData')
genes
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 6909 taxa and 96 samples ]
## sample_data() Sample Data: [ 96 samples by 5 sample variables ]
head(pathways)
## ko L1 L2
## 1 K01365 Human Diseases Immune System Diseases
## 2 K01365 Metabolism Enzyme Families
## 3 K01365 Organismal Systems Immune System
## 4 K01365 Cellular Processes Transport and Catabolism
## 5 K01365 Genetic Information Processing Folding, Sorting and Degradation
## 6 K01365 Cellular Processes Transport and Catabolism
## L3
## 1 Rheumatoid arthritis
## 2 Peptidases
## 3 Antigen processing and presentation
## 4 Phagosome
## 5 Chaperones and folding catalysts
## 6 Lysosome
We can perform PCoA analyses, just like we did with 16S rRNA gene abundance data.
genes.jsd = phyloseq::distance(genes, "jsd")
genes.pcoa = ordinate(genes, "PCoA", distance=genes.jsd)
plot_ordination(genes, ordination = genes.pcoa, color="Location", shape="Treatment")
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-2
adonis(genes.jsd~sample_data(genes)$Treatment)
##
## Call:
## adonis(formula = genes.jsd ~ sample_data(genes)$Treatment)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2
## sample_data(genes)$Treatment 4 0.009535 0.00238366 2.6075 0.10283
## Residuals 91 0.083188 0.00091415 0.89717
## Total 95 0.092722 1.00000
## Pr(>F)
## sample_data(genes)$Treatment 0.082 .
## Residuals
## Total
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis(genes.jsd~sample_data(genes)$Location, strata = sample_data(genes)$MouseID)
##
## Call:
## adonis(formula = genes.jsd ~ sample_data(genes)$Location, strata = sample_data(genes)$MouseID)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(genes)$Location 1 0.072346 0.072346 333.75 0.78024 0.001
## Residuals 94 0.020376 0.000217 0.21976
## Total 95 0.092722 1.00000
##
## sample_data(genes)$Location ***
## Residuals
## Total
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
What do these results mean?
genes.clr = transform_sample_counts(genes,
function(x){y=log(x+1);
y-sum(y)/ntaxa(genes)})
Location = sample_data(genes.clr)$Location
p.values = apply(otu_table(genes.clr), 1,
function(x) t.test(c(x) ~ Location)$p.value)
t.values = apply(otu_table(genes.clr), 1,
function(x) t.test(c(x) ~ Location)$statistic)
q.values = p.adjust(p.values, method="fdr")
plot(sort(q.values), log='y', pch='.')
min(q.values)
## [1] 5.235677e-37
How do you interpret these results?
Compute an example contingency table.
pathway = "Human Diseases"
(ctab = table(names(p.values) %in% pathways$ko[pathways$L1 == pathway],
p.values<0.05))
##
## FALSE TRUE
## FALSE 877 5841
## TRUE 13 178
fisher.test(ctab)
##
## Fisher's Exact Test for Count Data
##
## data: ctab
## p-value = 0.008395
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.165077 3.954256
## sample estimates:
## odds ratio
## 2.055665
Define a function to test each level-1-pathway.
test.pathway.level1 = function(pathway, p.values = p.values, threshold=0.05){
ctab = table(names(p.values) %in% pathways$ko[pathways$L1 == pathway],
p.values<threshold)
fisher.test(ctab)$p.value
}
Perform the analysis for all level 1 pathways.
level1.pvalues = sapply(unique(pathways$L1), test.pathway.level1, p.values=p.values, threshold=0.05)
level1.pvalues
## Human Diseases Metabolism
## 8.395387e-03 1.673644e-03
## Organismal Systems Cellular Processes
## 8.662215e-01 1.000000e+00
## Genetic Information Processing None
## 8.372768e-13 2.111145e-07
## Unclassified Environmental Information Processing
## 2.220032e-01 6.819562e-06
p.adjust(level1.pvalues, method='fdr')
## Human Diseases Metabolism
## 1.343262e-02 3.347288e-03
## Organismal Systems Cellular Processes
## 9.899675e-01 1.000000e+00
## Genetic Information Processing None
## 6.698214e-12 8.444579e-07
## Unclassified Environmental Information Processing
## 2.960043e-01 1.818550e-05
Metabolism pathway seems to be involved.
test.pathway.distance = function(pathway){
phy.path = prune_taxa(pathways$KO[pathways$L1 == pathway], genes.clr)
phy.dist = phyloseq::distance(phy.path, method="euclidean")
adonis(phy.dist ~ sample_data(phy.path)$Location)$aov.tab[1,6]
}
sapply(unique(pathways$L1), test.pathway.distance)
## Human Diseases Metabolism
## 0.001 0.001
## Organismal Systems Cellular Processes
## 0.001 0.001
## Genetic Information Processing None
## 0.001 0.001
## Unclassified Environmental Information Processing
## 0.001 0.001
source('https://raw.githubusercontent.com/alekseyenko/Tw2/master/code/Tw2.R')
test.pathway.distance2 = function(pathway){
phy.path = prune_taxa(pathways$KO[pathways$L1 == pathway], genes.clr)
phy.dist = phyloseq::distance(phy.path, method="euclidean")
WT.test(phy.dist, sample_data(phy.path)$Location)$p.value
}
sapply(unique(pathways$L1), test.pathway.distance2)
## Human Diseases Metabolism
## 0.001 0.001
## Organismal Systems Cellular Processes
## 0.001 0.001
## Genetic Information Processing None
## 0.001 0.001
## Unclassified Environmental Information Processing
## 0.001 0.001
test.pathway.group = function(pathway){
phy.path = prune_taxa(pathways$KO[pathways$L1 == pathway], genes.clr)
t.test(apply(otu_table(phy.path), 2, median) ~ sample_data(phy.path)$Location)$p.value
}
sapply(unique(pathways$L1), test.pathway.group)
## Human Diseases Metabolism
## 2.744858e-08 2.744858e-08
## Organismal Systems Cellular Processes
## 2.744858e-08 2.744858e-08
## Genetic Information Processing None
## 2.744858e-08 2.744858e-08
## Unclassified Environmental Information Processing
## 2.744858e-08 2.744858e-08
test.pathway.group2 = function(pathway){
phy.path = prune_taxa(pathways$KO[pathways$L1 == pathway], genes.clr)
t.test(apply(otu_table(phy.path), 2, mean) ~ sample_data(phy.path)$Location)$p.value
}
sapply(unique(pathways$L1), test.pathway.group2)
## Human Diseases Metabolism
## 0.1181401 0.1181401
## Organismal Systems Cellular Processes
## 0.1181401 0.1181401
## Genetic Information Processing None
## 0.1181401 0.1181401
## Unclassified Environmental Information Processing
## 0.1181401 0.1181401
barplot(sort(t.values, decreasing=T), names.arg=NA)
Where do Metabolism genes fall?
pathway.indicator = taxa_names(genes.clr) %in% pathways$ko[pathways$Level1 == "Metabolism"]
sum(pathway.indicator)
## [1] 0
oo = order(t.values, decreasing=T)
barplot(t.values[oo], col=c("black","red")[pathway.indicator+1][oo], border=NA, names.arg = NA)
Calculating and plotting the enrichment score.
ES=-t.values
ES[pathway.indicator] = -ES[pathway.indicator]
ES.o = c(ES[oo[1]], cumsum(ES[oo]))
plot(ES.o, main="Enrichment score")
max(ES.o)
## [1] 12908.37
Permutations
compute.ES = function(statistic, indicator){
ES=-statistic
ES[indicator] = -ES[indicator]
ES.o = c(ES[oo[1]], cumsum(ES[order(statistic, decreasing=T)]))
max(ES.o)
}
physeq.t.test.statistics = function(phy, f){
return(apply(otu_table(phy), 1,
function(x) t.test(c(x) ~ f)$statistic))
}
head(physeq.t.test.statistics(genes.clr, Location))
## K01365 K01364 K01361 K01360 K01362 K02249
## -6.049952 -6.049952 -3.850435 -6.049952 -5.429629 -6.049952
compute.ES(t.values, pathway.indicator)
## [1] 12908.37
set.seed(4.12)
nrep=1000
#ps = replicate(nrep,
#compute.ES(physeq.t.test.statistics(genes.clr,
# sample(Location)),
# pathway.indicator))
#sum(ps>=compute.ES(t.values, pathway.indicator))/nrep
This is the permutation-based p-value for gene set enrichment in the Metabolism pathway. The same procedure can be repeated for other pathways.