Sometimes we need to add data to a phyloseq object. An important consideration is that the data is aligned in terms of sample names and units of measurement. In this example, we will merge data that is measured at two locations in the same individual with the data that is measured on a per individual basis.
# load the data
library(phyloseq)
load('STAT.RData')
ls()
## [1] "phenotypes" "phy"
We have two objects here. phy
– a phyloseq object, and phenotypes
– additional sample data for the phyloseq data. Note that the sample names are in different format:
head(phenotypes)
## GIP Insulin Leptin IGF1 BMD mFat pFat
## C1 8.61 1 2320 1250 0.0492 4.0 19.7
## C2 28.60 137 1040 2180 0.0487 3.7 18.6
## C3 19.20 543 1670 2160 0.0510 3.6 19.5
## C4 38.30 606 972 2070 0.0465 4.0 21.3
## C5 13.20 1 2210 1240 0.0547 4.1 21.6
## C6 36.80 696 2330 1830 0.0512 3.7 17.7
phy
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 6547 taxa and 96 samples ]
## sample_data() Sample Data: [ 96 samples by 6 sample variables ]
## tax_table() Taxonomy Table: [ 6547 taxa by 9 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 6547 tips and 6321 internal nodes ]
head(sample_data(phy))
## X.SampleID BarcodeSequence LinkerPrimerSequence Treatment
## cecal_C1 cecal_C1 NA NA C
## cecal_C10 cecal_C10 NA NA C
## cecal_C2 cecal_C2 NA NA C
## cecal_C3 cecal_C3 NA NA C
## cecal_C4 cecal_C4 NA NA C
## cecal_C5 cecal_C5 NA NA C
## Location Description
## cecal_C1 cecal missing_description
## cecal_C10 cecal missing_description
## cecal_C2 cecal missing_description
## cecal_C3 cecal missing_description
## cecal_C4 cecal missing_description
## cecal_C5 cecal missing_description
levels(sample_data(phy)$Location)
## [1] "cecal" "fecal"
Our strategy is to duplicate the phyenotype variables for each of the location, then merge these two together and then to merge these with the phy
phyloseq object.
#Create copies
pheno.cecal = phenotypes
pheno.fecal = phenotypes
rownames(pheno.cecal) =
paste('cecal', rownames(pheno.cecal), sep="_")
rownames(pheno.fecal) =
paste('fecal', rownames(pheno.fecal), sep="_")
pheno = rbind(pheno.cecal, pheno.fecal)
phy2 = merge_phyloseq(phy, sample_data(pheno))
phy2
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 6547 taxa and 96 samples ]
## sample_data() Sample Data: [ 96 samples by 13 sample variables ]
## tax_table() Taxonomy Table: [ 6547 taxa by 9 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 6547 tips and 6321 internal nodes ]
head(sample_data(phy2))
## X.SampleID BarcodeSequence LinkerPrimerSequence Treatment
## cecal_C1 cecal_C1 NA NA C
## cecal_C10 cecal_C10 NA NA C
## cecal_C2 cecal_C2 NA NA C
## cecal_C3 cecal_C3 NA NA C
## cecal_C4 cecal_C4 NA NA C
## cecal_C5 cecal_C5 NA NA C
## Location Description GIP Insulin Leptin IGF1 BMD
## cecal_C1 cecal missing_description 8.61 1 2320 1250 0.0492
## cecal_C10 cecal missing_description 27.70 516 2680 3720 0.0479
## cecal_C2 cecal missing_description 28.60 137 1040 2180 0.0487
## cecal_C3 cecal missing_description 19.20 543 1670 2160 0.0510
## cecal_C4 cecal missing_description 38.30 606 972 2070 0.0465
## cecal_C5 cecal missing_description 13.20 1 2210 1240 0.0547
## mFat pFat
## cecal_C1 4.0 19.7
## cecal_C10 4.0 19.8
## cecal_C2 3.7 18.6
## cecal_C3 3.6 19.5
## cecal_C4 4.0 21.3
## cecal_C5 4.1 21.6
class(otu_table(phy2))
## [1] "otu_table"
## attr(,"package")
## [1] "phyloseq"
otu_table(phy2)[1:5, 1:10]
## OTU Table: [5 taxa and 10 samples]
## taxa are rows
## cecal_C1 cecal_C10 cecal_C2 cecal_C3 cecal_C4 cecal_C5 cecal_C6
## 1192 0 0 0 0 0 0 0
## 5898 0 0 0 1 0 0 0
## 4506 0 1 0 0 0 0 0
## 1027 1 1 1 1 0 1 0
## 5437 7 2 9 15 12 3 6
## cecal_C7 cecal_C8 cecal_C9
## 1192 0 0 0
## 5898 0 0 0
## 4506 0 0 0
## 1027 0 1 2
## 5437 3 3 6
head(tax_table(phy2))
## Taxonomy Table: [6 taxa by 9 taxonomic ranks]:
## Root Domain Phylum Class Order
## 1192 "Root" "Bacteria" "Bacteroidetes" NA NA
## 5898 "Root" "Bacteria" "Bacteroidetes" "Bacteroidetes" "Bacteroidales"
## 4506 "Root" "Bacteria" "Bacteroidetes" "Bacteroidetes" "Bacteroidales"
## 1027 "Root" "Bacteria" "Bacteroidetes" NA NA
## 5437 "Root" "Bacteria" "Bacteroidetes" "Bacteroidetes" "Bacteroidales"
## 6380 "Root" "Bacteria" "Bacteroidetes" "Bacteroidetes" "Bacteroidales"
## Family Genus Species Subspecies
## 1192 NA NA NA NA
## 5898 NA NA NA NA
## 4506 NA NA NA NA
## 1027 NA NA NA NA
## 5437 NA NA NA NA
## 6380 NA NA NA NA
class(phy_tree(phy2))
## [1] "phylo"
plot(phy_tree(phy2))
We will experiment with data normalization. To make dataset easier to work with we will focus on Order level taxa. Here’s how to summarize the taxa at order level:
order.phy = tax_glom(phy2, taxrank = "Order")
In the next sections we compute the normalizations using relative abundance and CLR methods.
order.rel =
transform_sample_counts(order.phy,
function(x) x/sum(x))
order.rel
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 21 taxa and 96 samples ]
## sample_data() Sample Data: [ 96 samples by 13 sample variables ]
## tax_table() Taxonomy Table: [ 21 taxa by 9 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 21 tips and 19 internal nodes ]
library(ggplot2)
plot_bar(order.rel, fill="Order") + facet_wrap(~Order, scales = "free_y")
Filter taxa that are in less than 5% abundance.
pb_data = plot_bar(order.rel, fill="Order")
names(pb_data)
## [1] "data" "layers" "scales" "mapping" "theme"
## [6] "coordinates" "facet" "plot_env" "labels"
head(pb_data$data)
## OTU Sample Abundance X.SampleID BarcodeSequence
## 492 1948 fecal_P4 0.9776248 fecal_P4 NA
## 506 1948 fecal_V4 0.9737034 fecal_V4 NA
## 489 1948 fecal_V9 0.9727891 fecal_V9 NA
## 567 1948 fecal_P2 0.9713656 fecal_P2 NA
## 508 1948 fecal_C9 0.9709865 fecal_C9 NA
## 566 1948 fecal_C6 0.9670103 fecal_C6 NA
## LinkerPrimerSequence Treatment Location Description GIP
## 492 NA P fecal missing_description 24.9
## 506 NA V fecal missing_description 21.8
## 489 NA V fecal missing_description 18.3
## 567 NA P fecal missing_description 42.0
## 508 NA C fecal missing_description 20.7
## 566 NA C fecal missing_description 36.8
## Insulin Leptin IGF1 BMD mFat pFat Root Domain Phylum
## 492 514 5160 1720 0.0479 4.2 21.7 Root Bacteria Bacteroidetes
## 506 194 1800 1980 0.0474 4.4 23.3 Root Bacteria Bacteroidetes
## 489 1 680 1040 0.0477 4.2 22.3 Root Bacteria Bacteroidetes
## 567 460 4570 1670 0.0550 7.4 32.1 Root Bacteria Bacteroidetes
## 508 1 677 1700 0.0457 3.8 20.6 Root Bacteria Bacteroidetes
## 566 696 2330 1830 0.0512 3.7 17.7 Root Bacteria Bacteroidetes
## Class Order
## 492 Bacteroidetes Bacteroidales
## 506 Bacteroidetes Bacteroidales
## 489 Bacteroidetes Bacteroidales
## 567 Bacteroidetes Bacteroidales
## 508 Bacteroidetes Bacteroidales
## 566 Bacteroidetes Bacteroidales
pb_data$data = pb_data$data[pb_data$data$Abundance>0.05,]
print(pb_data)
order.clr =
transform_sample_counts(order.phy,
function(x){
y=log(x+1);
y-sum(y)/ntaxa(order.phy)})
order.clr
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 21 taxa and 96 samples ]
## sample_data() Sample Data: [ 96 samples by 13 sample variables ]
## tax_table() Taxonomy Table: [ 21 taxa by 9 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 21 tips and 19 internal nodes ]
plot_bar(order.clr, fill="Order")
# Compare location
# Use Mann-Whitney test for relative abundance
rel.p = apply(otu_table(order.rel), 1,
function(x) wilcox.test(c(x)~sample_data(order.rel)$Location)$p.value)
# Use t-test for CLR data
clr.p = apply(otu_table(order.clr),1,
function(x) t.test(c(x)~sample_data(order.clr)$Location)$p.value)
univ.location.res = data.frame(
taxa = tax_table(order.clr)[,"Order"],
rel.p, clr.p)
univ.location.res
## Order rel.p clr.p
## 1948 Bacteroidales 1.491443e-10 5.298849e-01
## 1931 "Erysipelotrichales" 6.511867e-01 2.832666e-01
## 4911 Anaeroplasmatales 4.575879e-09 2.560191e-15
## 4220 Clostridiales 3.263823e-15 1.063258e-22
## 1026 Bacillales 2.620285e-08 3.110364e-06
## 1673 "Lactobacillales" 1.987911e-16 2.836508e-19
## 4789 Actinobacteridae 1.065711e-01 1.668570e-01
## 5995 Burkholderiales 6.907857e-02 8.682374e-01
## 5975 Neisseriales 3.480826e-01 1.911682e-02
## 5242 Sphingomonadales 6.988373e-02 3.052896e-01
## 1453 Rhodobacterales 3.480826e-01 2.261139e-02
## 455 Rhizobiales 3.069131e-01 8.319051e-02
## 6648 Myxococcales 3.069131e-01 9.658023e-02
## 3405 Enterobacteriales 2.640680e-02 5.767138e-01
## 2012 Pasteurellales 9.527240e-01 5.229797e-02
## 1847 Pseudomonadales 5.578887e-04 1.043209e-01
## 2172 Chloroplast 3.250245e-05 1.071959e-03
## 3806 Flavobacteriales 1.422842e-01 2.654133e-01
## 4609 Mycoplasmatales 5.609610e-01 1.588900e-02
## 3348 Desulfovibrionales 1.200849e-02 1.823538e-01
## 2626 Coriobacteridae 4.142710e-12 1.611677e-14
plot(univ.location.res$rel.p, univ.location.res$clr.p, pch=19)
abline(h=0.05)
abline(v=0.05)
univ.location.res$rel.fdr =
p.adjust(univ.location.res$rel.p, method="fdr")
univ.location.res$clr.fdr =
p.adjust(univ.location.res$clr.p, method="fdr")
colSums(univ.location.res<0.05)
## Warning in Ops.factor(left, right): '<' not meaningful for factors
## Order rel.p clr.p rel.fdr clr.fdr
## NA 10 9 9 7
univ.location.res
## Order rel.p clr.p rel.fdr
## 1948 Bacteroidales 1.491443e-10 5.298849e-01 7.830076e-10
## 1931 "Erysipelotrichales" 6.511867e-01 2.832666e-01 6.837460e-01
## 4911 Anaeroplasmatales 4.575879e-09 2.560191e-15 1.921869e-08
## 4220 Clostridiales 3.263823e-15 1.063258e-22 3.427014e-14
## 1026 Bacillales 2.620285e-08 3.110364e-06 9.170996e-08
## 1673 "Lactobacillales" 1.987911e-16 2.836508e-19 4.174614e-15
## 4789 Actinobacteridae 1.065711e-01 1.668570e-01 1.721533e-01
## 5995 Burkholderiales 6.907857e-02 8.682374e-01 1.222965e-01
## 5975 Neisseriales 3.480826e-01 1.911682e-02 4.060964e-01
## 5242 Sphingomonadales 6.988373e-02 3.052896e-01 1.222965e-01
## 1453 Rhodobacterales 3.480826e-01 2.261139e-02 4.060964e-01
## 455 Rhizobiales 3.069131e-01 8.319051e-02 4.028235e-01
## 6648 Myxococcales 3.069131e-01 9.658023e-02 4.028235e-01
## 3405 Enterobacteriales 2.640680e-02 5.767138e-01 5.545428e-02
## 2012 Pasteurellales 9.527240e-01 5.229797e-02 9.527240e-01
## 1847 Pseudomonadales 5.578887e-04 1.043209e-01 1.464458e-03
## 2172 Chloroplast 3.250245e-05 1.071959e-03 9.750736e-05
## 3806 Flavobacteriales 1.422842e-01 2.654133e-01 2.134263e-01
## 4609 Mycoplasmatales 5.609610e-01 1.588900e-02 6.200095e-01
## 3348 Desulfovibrionales 1.200849e-02 1.823538e-01 2.801982e-02
## 2626 Coriobacteridae 4.142710e-12 1.611677e-14 2.899897e-11
## clr.fdr
## 1948 5.856623e-01
## 1931 3.499175e-01
## 4911 1.792134e-14
## 4220 2.232841e-21
## 1026 1.306353e-05
## 1673 2.978333e-18
## 4789 2.502855e-01
## 5995 8.682374e-01
## 5975 5.018167e-02
## 5242 3.561712e-01
## 1453 5.275990e-02
## 455 1.588183e-01
## 6648 1.685183e-01
## 3405 6.055494e-01
## 2012 1.098257e-01
## 1847 1.685183e-01
## 2172 3.751855e-03
## 3806 3.483550e-01
## 4609 4.766700e-02
## 3348 2.552953e-01
## 2626 8.461303e-14
Follow https://joey711.github.io/phyloseq/plot_richness-examples.html
plot_heatmap(...)
and plot_bar
functions to visualize the data.?phyloseq::mt