Metagenomic Prediction Tutorial is available from PICRUSt at https://picrust.github.io/picrust/tutorials/metagenome_prediction.html. Use the privided otu_table.biom
file for this analysis.
We will need the biom
and phyloseq
packages.
library(phyloseq); packageVersion('phyloseq')
## [1] '1.28.0'
library(biomformat)
From the picrust pipeline output, we will get the file with the predicted metagenomes. This code assumes that the file name is metagenome_prediction.tab
and that the file is located in the working directory. In addition, we will need the corresponding ‘mapping’ file, or the metadata. For the purposes of this laboratory this file is obesity_mapping.txt
.
mapfilename="obesity_mapping.txt"
picrustfilename="metagenome_predictions.tab"
We will first simply read in the entire biome file into an R object.
(picr = read_biom(picrustfilename))
## biom object.
## type: Gene table
## matrix_type: sparse
## 6909 rows and 96 columns
The function biom_data
will return the table of counts of each enzyme in the dataset.
datatable = biom_data(picr)
dim(datatable)
## [1] 6909 96
datatable[1:5,1:5]
## 5 x 5 sparse Matrix of class "dgCMatrix"
## cecal.V5 cecal.VP9 fecal.C1 fecal.VP10 cecal.T3
## K01365 . . . . .
## K01364 . . . . .
## K01361 . 1 . 2 1
## K01360 . . . . .
## K01362 434 523 557 603 698
Each enzyme is assotiated with pathway information and contains additional descriptions, which will be useful in the analysis. In the next step we will extract that information in a useful format. This information is coded in the $rows
component of the biom object:
picr$rows[1]
## [[1]]
## [[1]]$id
## [1] "K01365"
##
## [[1]]$metadata
## [[1]]$metadata$KEGG_Description
## [1] "cathepsin L [EC:3.4.22.15]"
##
## [[1]]$metadata$KEGG_Pathways
## [[1]]$metadata$KEGG_Pathways[[1]]
## [1] "Human Diseases" "Immune System Diseases"
## [3] "Rheumatoid arthritis"
##
## [[1]]$metadata$KEGG_Pathways[[2]]
## [1] "Metabolism" "Enzyme Families" "Peptidases"
##
## [[1]]$metadata$KEGG_Pathways[[3]]
## [1] "Organismal Systems"
## [2] "Immune System"
## [3] "Antigen processing and presentation"
##
## [[1]]$metadata$KEGG_Pathways[[4]]
## [1] "Cellular Processes" "Transport and Catabolism"
## [3] "Phagosome"
##
## [[1]]$metadata$KEGG_Pathways[[5]]
## [1] "Genetic Information Processing" "Folding, Sorting and Degradation"
## [3] "Chaperones and folding catalysts"
##
## [[1]]$metadata$KEGG_Pathways[[6]]
## [1] "Cellular Processes" "Transport and Catabolism"
## [3] "Lysosome"
First extract the descriptions:
descriptions = unlist(sapply(picr$rows, function(x) paste(x$metadata$KEGG_Description, collapse=',')))
ids = unlist(sapply(picr$rows, function(x) x$id))
names(descriptions) = ids
head(descriptions)
## K01365
## "cathepsin L [EC:3.4.22.15]"
## K01364
## "streptopain [EC:3.4.22.10]"
## K01361
## "lactocepin [EC:3.4.21.96]"
## K01360
## "proprotein convertase subtilisin/kexin type 2 [EC:3.4.21.94]"
## K01362
## "None"
## K02249
## "competence protein ComGG"
The patway information is contained in the KEGG_Pathway
and assigns each enzyme to pathways at 3 levels from more general (Human Diseases) to more specific (‘Rheumatoid arthritis’). Let’s convert these data into a matrix format; first, on a few rows.
(pathway_by_ko = sapply(head(picr$rows, n=3), function(x) sapply(x$metadata$KEGG_Pathways, function(y) c(x$id, y))))
## [[1]]
## [,1] [,2]
## [1,] "K01365" "K01365"
## [2,] "Human Diseases" "Metabolism"
## [3,] "Immune System Diseases" "Enzyme Families"
## [4,] "Rheumatoid arthritis" "Peptidases"
## [,3] [,4]
## [1,] "K01365" "K01365"
## [2,] "Organismal Systems" "Cellular Processes"
## [3,] "Immune System" "Transport and Catabolism"
## [4,] "Antigen processing and presentation" "Phagosome"
## [,5] [,6]
## [1,] "K01365" "K01365"
## [2,] "Genetic Information Processing" "Cellular Processes"
## [3,] "Folding, Sorting and Degradation" "Transport and Catabolism"
## [4,] "Chaperones and folding catalysts" "Lysosome"
##
## [[2]]
## [,1]
## [1,] "K01364"
## [2,] "Metabolism"
## [3,] "Enzyme Families"
## [4,] "Peptidases"
##
## [[3]]
## [,1] [,2]
## [1,] "K01361" "K01361"
## [2,] "Genetic Information Processing" "Metabolism"
## [3,] "Folding, Sorting and Degradation" "Enzyme Families"
## [4,] "Chaperones and folding catalysts" "Peptidases"
(pathways = matrix(unlist(pathway_by_ko), ncol=4, byrow=T))
## [,1] [,2]
## [1,] "K01365" "Human Diseases"
## [2,] "K01365" "Metabolism"
## [3,] "K01365" "Organismal Systems"
## [4,] "K01365" "Cellular Processes"
## [5,] "K01365" "Genetic Information Processing"
## [6,] "K01365" "Cellular Processes"
## [7,] "K01364" "Metabolism"
## [8,] "K01361" "Genetic Information Processing"
## [9,] "K01361" "Metabolism"
## [,3]
## [1,] "Immune System Diseases"
## [2,] "Enzyme Families"
## [3,] "Immune System"
## [4,] "Transport and Catabolism"
## [5,] "Folding, Sorting and Degradation"
## [6,] "Transport and Catabolism"
## [7,] "Enzyme Families"
## [8,] "Folding, Sorting and Degradation"
## [9,] "Enzyme Families"
## [,4]
## [1,] "Rheumatoid arthritis"
## [2,] "Peptidases"
## [3,] "Antigen processing and presentation"
## [4,] "Phagosome"
## [5,] "Chaperones and folding catalysts"
## [6,] "Lysosome"
## [7,] "Peptidases"
## [8,] "Chaperones and folding catalysts"
## [9,] "Peptidases"
Now for the entire dataset:
pathway_by_ko = sapply(picr$rows, function(x) sapply(x$metadata$KEGG_Pathways, function(y) c(x$id, y[1:3])))
pathways = matrix(unlist(pathway_by_ko), ncol=4, byrow=T)
colnames(pathways) = c('ko', 'L1', 'L2', 'L3')
pathways=as.data.frame(pathways, stringsAsFactors=F)
head(pathways)
## ko L1 L2
## 1 K01365 Human Diseases Immune System Diseases
## 2 K01365 Metabolism Enzyme Families
## 3 K01365 Organismal Systems Immune System
## 4 K01365 Cellular Processes Transport and Catabolism
## 5 K01365 Genetic Information Processing Folding, Sorting and Degradation
## 6 K01365 Cellular Processes Transport and Catabolism
## L3
## 1 Rheumatoid arthritis
## 2 Peptidases
## 3 Antigen processing and presentation
## 4 Phagosome
## 5 Chaperones and folding catalysts
## 6 Lysosome
Note, some enzymes may be unassigned to pathways:
head(pathways[pathways$L1 == 'None',])
## ko L1 L2 L3
## 10 K01360 None <NA> <NA>
## 200 K00191 None <NA> <NA>
## 265 K14680 None <NA> <NA>
## 266 K14681 None <NA> <NA>
## 267 K14682 None <NA> <NA>
## 367 K06694 None <NA> <NA>
# read the sample data
meta = sample_data(read.table(mapfilename, header=T, row.names=1, sep="\t"))
abundances = otu_table(as.matrix(datatable), taxa_are_rows = T)
# Make sure the sample_names match up
sample_names(abundances)
## [1] "cecal.V5" "cecal.VP9" "fecal.C1" "fecal.VP10" "cecal.T3"
## [6] "cecal.VP4" "cecal.VP5" "fecal.T9" "cecal.VP2" "cecal.VP8"
## [11] "fecal.VP2" "fecal.VP3" "cecal.V1" "cecal.T10" "fecal.T8"
## [16] "fecal.T10" "fecal.VP7" "fecal.T1" "cecal.C6" "cecal.P7"
## [21] "cecal.T5" "cecal.T6" "cecal.T8" "cecal.VP7" "cecal.C3"
## [26] "cecal.V9" "cecal.C10" "cecal.C2" "cecal.P4" "cecal.V8"
## [31] "cecal.VP1" "cecal.VP10" "fecal.V5" "fecal.V9" "cecal.P5"
## [36] "cecal.V6" "fecal.T3" "cecal.C7" "cecal.P10" "fecal.C4"
## [41] "cecal.P2" "fecal.C9" "cecal.T1" "cecal.V3" "cecal.VP6"
## [46] "fecal.C6" "cecal.C5" "cecal.P3" "cecal.V7" "fecal.V2"
## [51] "cecal.C8" "cecal.P8" "cecal.V4" "cecal.T4" "cecal.T9"
## [56] "fecal.P9" "fecal.V3" "cecal.P9" "fecal.P4" "fecal.T4"
## [61] "cecal.V10" "fecal.VP4" "cecal.P6" "fecal.C2" "fecal.P7"
## [66] "cecal.V2" "cecal.T7" "fecal.P10" "fecal.T6" "fecal.V6"
## [71] "fecal.V8" "fecal.T7" "fecal.VP9" "fecal.V7" "fecal.P6"
## [76] "cecal.P1" "cecal.VP3" "fecal.C5" "fecal.P8" "fecal.VP1"
## [81] "fecal.VP8" "cecal.C1" "fecal.C10" "fecal.C3" "fecal.C7"
## [86] "fecal.C8" "fecal.V10" "fecal.P1" "fecal.V4" "cecal.C9"
## [91] "cecal.T2" "fecal.T2" "fecal.V1" "cecal.C4" "fecal.P3"
## [96] "fecal.P2"
sample_names(meta)
## [1] "cecal_C1" "cecal_C10" "cecal_C2" "cecal_C3" "cecal_C4"
## [6] "cecal_C5" "cecal_C6" "cecal_C7" "cecal_C8" "cecal_C9"
## [11] "cecal_P1" "cecal_P10" "cecal_P2" "cecal_P3" "cecal_P4"
## [16] "cecal_P5" "cecal_P6" "cecal_P7" "cecal_P8" "cecal_P9"
## [21] "cecal_T1" "cecal_T10" "cecal_T2" "cecal_T3" "cecal_T4"
## [26] "cecal_T5" "cecal_T6" "cecal_T7" "cecal_T8" "cecal_T9"
## [31] "cecal_V1" "cecal_V10" "cecal_V2" "cecal_V3" "cecal_V4"
## [36] "cecal_V5" "cecal_V6" "cecal_V7" "cecal_V8" "cecal_V9"
## [41] "cecal_VP1" "cecal_VP10" "cecal_VP2" "cecal_VP3" "cecal_VP4"
## [46] "cecal_VP5" "cecal_VP6" "cecal_VP7" "cecal_VP8" "cecal_VP9"
## [51] "fecal_C1" "fecal_C10" "fecal_C2" "fecal_C3" "fecal_C4"
## [56] "fecal_C5" "fecal_C6" "fecal_C7" "fecal_C8" "fecal_C9"
## [61] "fecal_P1" "fecal_P10" "fecal_P2" "fecal_P3" "fecal_P4"
## [66] "fecal_P6" "fecal_P7" "fecal_P8" "fecal_P9" "fecal_T1"
## [71] "fecal_T10" "fecal_T2" "fecal_T3" "fecal_T4" "fecal_T6"
## [76] "fecal_T7" "fecal_T8" "fecal_T9" "fecal_V1" "fecal_V10"
## [81] "fecal_V2" "fecal_V3" "fecal_V4" "fecal_V5" "fecal_V6"
## [86] "fecal_V7" "fecal_V8" "fecal_V9" "fecal_VP1" "fecal_VP10"
## [91] "fecal_VP2" "fecal_VP3" "fecal_VP4" "fecal_VP7" "fecal_VP8"
## [96] "fecal_VP9"
sample_names(meta) = gsub('_','.', sample_names(meta))
(genes = merge_phyloseq(abundances, meta))
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 6909 taxa and 96 samples ]
## sample_data() Sample Data: [ 96 samples by 5 sample variables ]
head(meta)
## BarcodeSequence LinkerPrimerSequence Treatment Location
## cecal.C1 NA NA C cecal
## cecal.C10 NA NA C cecal
## cecal.C2 NA NA C cecal
## cecal.C3 NA NA C cecal
## cecal.C4 NA NA C cecal
## cecal.C5 NA NA C cecal
## Description
## cecal.C1 missing_description
## cecal.C10 missing_description
## cecal.C2 missing_description
## cecal.C3 missing_description
## cecal.C4 missing_description
## cecal.C5 missing_description
save(genes, pathways, descriptions, file = "STAT.picrust.RData")