1 Packages

1.1 Check packages

packageVersion("clusterSim")
## [1] '0.47.1'
packageVersion("cluster")
## [1] '2.0.7.1'
packageVersion("ROCR")
## [1] '1.0.7'
packageVersion("phyloseq")
## [1] '1.24.0'
packageVersion("ggplot2")
## [1] '3.0.0'

1.2 Load packages, data

library(phyloseq)
load("STAT.RData")

 


2 Normalize the data

normalizeSample = function(x){ x / sum(x)}
physeq.norm = transformSampleCounts(phy, normalizeSample)

3 Compute Distance

3.1 Compute Jensen-Shannon divergence distance

jsd.dist = phyloseq::distance(physeq.norm, "jsd")

4 Hierarchical clustering

csin <- hclust(jsd.dist, method = "single")
ccom <- hclust(jsd.dist, method = "complete")
caver <- hclust(jsd.dist, method = "aver")

4.1 plot dendrograms

par(mfrow=c(1,3))
plot(csin, hang=-1, main="single linkage")
plot(ccom, hang=-1, main="complete linkage")
plot(caver, hang=-1, main="average linkage")

4.2 Compute clusters from hierarchical cluster

# Example of how to select best discrete clusters from the dendrograms
par(mfrow=c(1,1))
plot(ccom, hang = -1)
dcl = rect.hclust(ccom, 3)

4.3 compute co-phenetic distance

par(mfrow=c(1,3))
plot(jsd.dist, cophenetic(csin), asp = 1, main="single linkage")
abline(0,1, col='red', lty='dashed')
plot(jsd.dist, cophenetic(ccom), asp = 1, main="complete linkage")
abline(0,1, col='red', lty='dashed')
plot(jsd.dist, cophenetic(caver), asp = 1, main="average linkage")
abline(0,1, col='red', lty='dashed')

Cophenetic correlation is maximized by average linkage

cor(jsd.dist, cophenetic(csin))
## [1] 0.659062
cor(jsd.dist, cophenetic(ccom))
## [1] 0.7639895
cor(jsd.dist, cophenetic(caver))
## [1] 0.8549823

5 Discrete clustering

## Discrete clustering
library(cluster)
library(clusterSim)

cc = pam(jsd.dist, k=2, cluster.only=T)

table(sample_data(phy)$Treatment, cc)
##     cc
##       1  2
##   C  10 10
##   P  10  9
##   T  10  9
##   V  10 10
##   VP  9  9
table(sample_data(phy)$Location, cc)
##        cc
##          1  2
##   cecal 49  1
##   fecal  0 46
cluster.tr = table(sample_data(phy)$Treatment, cc)
chisq.test(cluster.tr)
## 
##  Pearson's Chi-squared test
## 
## data:  cluster.tr
## X-squared = 0.063624, df = 4, p-value = 0.9995
cluster.loc = table(sample_data(phy)$Location, cc)
cluster.loc
##        cc
##          1  2
##   cecal 49  1
##   fecal  0 46
chisq.test(cluster.loc)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cluster.loc
## X-squared = 88.198, df = 1, p-value < 2.2e-16

6 Gap Statistic (cluster goodness)

pam1 <- function(x,k) list(cluster = pam(as.dist(x),k, cluster.only=TRUE))
gsPam1 <- clusGap(as.matrix(jsd.dist), FUN = pam1, K.max = 20, B = 100)
## Clustering k = 1,2,..., K.max (= 20): .. done
## Bootstrapping, b = 1,2,..., B (= 100)  [one "." per sample]:
## .................................................. 50 
## .................................................. 100
par(mfrow=c(1,1))
plot(gsPam1)

a slightly nicer way to visualize Gap statistic analysis

library(ggplot2)
m = ggplot(as.data.frame(gsPam1$Tab), aes(y=gap, x=1:20))
m + theme_bw() + 
  geom_line() + 
  geom_errorbar(aes(ymin=gap-2*SE.sim, ymax=gap+2*SE.sim), colour="red") + 
  geom_errorbar(aes(ymin=gap-SE.sim,
                    ymax=gap + SE.sim), 
                colour="black", linetype="dashed") + 
  ylab(expression(paste("Ga",p[k]))) + 
  geom_point() + 
  xlab("Number of clusters (k)") + 
  xlim(0,11)
## Warning: Removed 9 rows containing missing values (geom_path).
## Warning: Removed 9 rows containing missing values (geom_errorbar).

## Warning: Removed 9 rows containing missing values (geom_errorbar).
## Warning: Removed 9 rows containing missing values (geom_point).

7 Classification error estimation

source('CV_protocols.R')
phylum = tax_glom(physeq.norm, taxrank = "Phylum")

7.1 Predict Location

response = sample_data(phylum)$Location
predictors = as.matrix(t(otu_table(phylum)))

7.2 predict using Random Forest

Location.rf.100repCV = 
  sapply(1:100, 
         function(i) 
           mean(rf.kfoldAUC(predictors, 
                            response, 
                            k=6)$aucs))

7.3 predict using Support Vetor Machines

Location.svm.100repCV = 
  sapply(1:100, 
         function(i) 
           mean(svm.kfoldAUC(predictors, 
                             response, 
                             k=6)$aucs))

print('Location')
## [1] "Location"

Compare

rbind(RandomForest = c(mean(Location.rf.100repCV), 
                       quantile(Location.rf.100repCV, 
                                prob=c(0.05, 1))),
      SVM = c(mean(Location.svm.100repCV), 
              quantile(Location.svm.100repCV, prob=c(0.05, 1))))
##                              5% 100%
## RandomForest 0.9999215 1.000000    1
## SVM          0.9956279 0.989542    1

7.4 Predict Antibiotic vs. Control in fecal samples

predictors = as.matrix(t(otu_table(
  subset_samples(phylum,
                 Location=='fecal'))))
resp = sample_data(
  subset_samples(phylum, 
                 Location=='fecal'))$Treatment
response = rep('Atibiotic', length(resp))
response[resp == 'C'] = 'Control'
response = factor(response)
response
##  [1] Control   Control   Control   Control   Control   Control   Control  
##  [8] Control   Control   Control   Atibiotic Atibiotic Atibiotic Atibiotic
## [15] Atibiotic Atibiotic Atibiotic Atibiotic Atibiotic Atibiotic Atibiotic
## [22] Atibiotic Atibiotic Atibiotic Atibiotic Atibiotic Atibiotic Atibiotic
## [29] Atibiotic Atibiotic Atibiotic Atibiotic Atibiotic Atibiotic Atibiotic
## [36] Atibiotic Atibiotic Atibiotic Atibiotic Atibiotic Atibiotic Atibiotic
## [43] Atibiotic Atibiotic Atibiotic Atibiotic
## Levels: Atibiotic Control
Treatmentf.rf.100repCV = 
  sapply(1:100, 
         function(i) 
           mean(rf.kfoldAUC(predictors, 
                            response, k=6)$aucs))
Treatmentf.svm.100repCV = 
  sapply(1:100, 
         function(i) 
           mean(svm.kfoldAUC(predictors, 
                             response, k=6)$aucs))

print('Treatment in fecal')
## [1] "Treatment in fecal"
rbind(RandomForest = c(mean(Treatmentf.rf.100repCV), 
                       quantile(Treatmentf.rf.100repCV, 
                                prob=c(0.05, 1))),
      SVM = c(mean(Treatmentf.svm.100repCV), 
              quantile(Treatmentf.svm.100repCV, 
                       prob=c(0.05, 1))))
##                               5%      100%
## RandomForest 0.7500694 0.6527778 0.8750000
## SVM          0.8333333 0.7354167 0.9305556

7.5 Predict Antibiotic vs. Control in cecal samples

predictors = as.matrix(t(
  otu_table(subset_samples(phylum, 
                           Location=='cecal'))))
resp = sample_data(
  subset_samples(phylum, 
                 Location=='cecal'))$Treatment
response = rep('Atibiotic', length(resp))
response[resp == 'C'] = 'Control'
response = factor(response)

Treatmentc.rf.100repCV = 
  sapply(1:100, 
         function(i) 
           mean(rf.kfoldAUC(predictors, 
                            response, k=6)$aucs))
Treatmentc.svm.100repCV = 
  sapply(1:100, 
         function(i) 
           mean(svm.kfoldAUC(predictors, 
                             response, k=6)$aucs))

print('Treatment in cecal')
## [1] "Treatment in cecal"
rbind(RandomForests = c(mean(Treatmentc.rf.100repCV), 
                        quantile(Treatmentc.rf.100repCV, 
                                 prob=c(0.05, 1))),
      SVM = c(mean(Treatmentc.svm.100repCV), 
              quantile(Treatmentc.svm.100repCV, 
                      prob=c(0.05, 1))))
##                                5%      100%
## RandomForests 0.7009425 0.6121032 0.8194444
## SVM           0.8321429 0.7647817 0.9285714

8 caret Package

caret: Classification and Regression Tools

Alternatively, you can run these exact same methods and CV testing design using the caret package, which provides a convenient unified interface to a large number of statistical/machine learning methods in R.

8.1 Load caret

library("caret")
# install.packages("pROC")
library("pROC")

8.2 Parallel?

caret supports a system-agnostic parallelization framework called “foreach”. You may have this already installed and available. If so, you can change the following runInParallel parameter to TRUE.

If you do set it to TRUE, one of the foreach helper pacakges, “doParallel”, will be loaded and the parallel “backend” will be defined. The code you use for your analysis does not change, other than perhaps the allowParallel option below.

runInParallel = FALSE
if(runInParallel){
  library("doParallel")
  registerDoParallel(cl = parallel::detectCores() - 1L)
}

8.3 Define cross-validation

Define how you want cross-validation to be performed. This is separate from the step where you define the input data and method to use.

fitControl <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           classProbs = TRUE,
                           ## repeated ten times
                           repeats = 10,
                           summaryFunction = twoClassSummary,
                           allowParallel = runInParallel)

8.4 Execute CV training on SVM

# Run Training. SVM
fit1svmL <- caret::train(x = predictors,
                         y = as.character(response), 
                         method = "svmLinear",
                         metric = "ROC",
                         trControl = fitControl,
                         verbose = FALSE)

8.5 Execute CV training on RF

# Run Training. SVM
fit1rf <- caret::train(x = predictors,
                       y = response, 
                       metric = "ROC",
                       method = "rf",
                       trControl = fitControl,
                       verbose = FALSE)

8.6 Evaluate, Compare Results

# SVM results summary
fit1svmL
## Support Vector Machines with Linear Kernel 
## 
## 50 samples
##  7 predictor
##  2 classes: 'Atibiotic', 'Control' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 45, 45, 45, 45, 45, 45, ... 
## Resampling results:
## 
##   ROC   Sens   Spec
##   0.58  0.965  0.01
## 
## Tuning parameter 'C' was held constant at a value of 1
# RF results summary
fit1rf
## Random Forest 
## 
## 50 samples
##  7 predictor
##  2 classes: 'Atibiotic', 'Control' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 45, 45, 45, 45, 45, 45, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC     Sens    Spec
##   2     0.6950  0.9400  0.23
##   4     0.7375  0.9100  0.40
##   7     0.7675  0.8925  0.46
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.

9 Addressing class imbalance (caret)

caret package supports methods to address class imbalance within the CV framework via the sampling parameter in the trainControl() definition function.

See the sampling help page for more details.

fitControl2 <- trainControl(
  ## 5-fold CV
  method = "repeatedcv",
  number = 5,
  classProbs = TRUE,
  ## repeated ten times
  repeats = 10,
  summaryFunction = twoClassSummary,
  allowParallel = runInParallel,
  sampling = "up")

9.1 Re-run CV training using both methods

# Run Training. SVM
fit2svmL <- caret::train(x = predictors,
                         y = response, 
                         method = "svmLinear",
                         metric = "ROC",
                         trControl = fitControl2,
                         verbose = FALSE)
# RF
fit2rf <- caret::train(x = predictors,
                       y = response, 
                       method = "rf",
                       metric = "ROC",
                       trControl = fitControl2,
                       verbose = FALSE)

Compare (again)

fit2svmL
## Support Vector Machines with Linear Kernel 
## 
## 50 samples
##  7 predictor
##  2 classes: 'Atibiotic', 'Control' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 40, 40, 40, 40, 40, 40, ... 
## Addtional sampling using up-sampling
## 
## Resampling results:
## 
##   ROC    Sens    Spec
##   0.705  0.7275  0.61
## 
## Tuning parameter 'C' was held constant at a value of 1
fit2rf
## Random Forest 
## 
## 50 samples
##  7 predictor
##  2 classes: 'Atibiotic', 'Control' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 40, 40, 40, 40, 40, 40, ... 
## Addtional sampling using up-sampling
## 
## Resampling results across tuning parameters:
## 
##   mtry  ROC       Sens    Spec
##   2     0.723750  0.9075  0.43
##   4     0.735625  0.8900  0.51
##   7     0.745625  0.8725  0.58
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.