There are a lot of rushing to conclusions about the meaning of sparse (many zeroes) count tables in microbiome data. The goal of this short note is to demonstrate that commonly-observed levels of zero-sparity can often be generated by models that are not zero-inflated.
At the moment I’m only trying to show that we can generate data that “has lots of zeroes”, without resorting to zero-inflation. A more complete treatment would include a comparison in which we also go the other direction; that is, start from simulated data, inferring model parameters and evaluating model appropriateness. The philosophy being that we should know how our methods behave when applied to the “wrong” distribution, including whether / how much it affects our inferences.
rpois()
function simulated counts from a Poisson distribution, given a choice of its only parameter, lambda
.
# Ten different trials of poisson(0.3)
rpois(n = 20, lambda = 0.1)
## [1] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0
rpois(n = 20, lambda = 0.2)
## [1] 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
rpois(n = 20, lambda = 0.3)
## [1] 0 0 0 1 0 0 0 1 0 0 1 0 0 1 0 2 0 2 1 0
rpois(n = 20, lambda = 0.6)
## [1] 2 0 1 0 1 2 0 0 0 1 0 1 0 0 2 1 1 1 1 0
rpois(n = 20, lambda = 1)
## [1] 2 2 1 1 1 1 0 1 2 1 1 1 1 1 4 4 0 0 0 1
rpois(n = 20, lambda = 1.5)
## [1] 1 1 5 2 2 1 3 0 0 0 4 1 4 2 2 0 1 3 1 2
rpois(n = 20, lambda = 2)
## [1] 1 1 3 0 4 4 1 1 2 1 1 2 4 0 2 1 4 2 2 1
rpois(n = 20, lambda = 3)
## [1] 1 2 1 4 5 1 4 4 2 3 4 6 4 1 0 5 2 2 5 4
rpois(n = 20, lambda = 5)
## [1] 8 4 5 6 6 6 9 6 4 5 5 2 5 5 2 0 5 5 9 5
Let’s do this a lot of times, count the zeroes, and summarize in a plot.
nObs = 30
(lambdaValues <- 10^seq(-3, 3, 0.25))
## [1] 1.000000e-03 1.778279e-03 3.162278e-03 5.623413e-03 1.000000e-02
## [6] 1.778279e-02 3.162278e-02 5.623413e-02 1.000000e-01 1.778279e-01
## [11] 3.162278e-01 5.623413e-01 1.000000e+00 1.778279e+00 3.162278e+00
## [16] 5.623413e+00 1.000000e+01 1.778279e+01 3.162278e+01 5.623413e+01
## [21] 1.000000e+02 1.778279e+02 3.162278e+02 5.623413e+02 1.000000e+03
tabPoisson <- data.table(lambda = lambdaValues)
# Simulate
tabPoisson <-
tabPoisson[, .(Count = rpois(n = nObs, lambda = lambda)), by = lambda]
head(tabPoisson)
## lambda Count
## 1: 0.001 0
## 2: 0.001 0
## 3: 0.001 0
## 4: 0.001 0
## 5: 0.001 0
## 6: 0.001 0
tail(tabPoisson)
## lambda Count
## 1: 1000 992
## 2: 1000 1005
## 3: 1000 1024
## 4: 1000 983
## 5: 1000 977
## 6: 1000 1056
# Count zeroes
tabPoisson <-
tabPoisson[, .(nZero = sum(Count == 0)), by = lambda]
head(tabPoisson)
## lambda nZero
## 1: 0.001000000 30
## 2: 0.001778279 30
## 3: 0.003162278 30
## 4: 0.005623413 30
## 5: 0.010000000 30
## 6: 0.017782794 30
tail(tabPoisson)
## lambda nZero
## 1: 56.23413 0
## 2: 100.00000 0
## 3: 177.82794 0
## 4: 316.22777 0
## 5: 562.34133 0
## 6: 1000.00000 0
# Plot
ggplot(tabPoisson, aes(lambda, nZero)) +
geom_point() +
# scale_y_sqrt() +
scale_x_log10() +
ggtitle("Number of zeroes observed v. mean for Poisson distribution")
rnbinom(n = 10, size = 0.4, prob = 0.9)
## [1] 0 0 0 1 0 0 0 0 0 0
nObs = 50
(sizeValues <- 10^seq(-3, 3, 0.25))
## [1] 1.000000e-03 1.778279e-03 3.162278e-03 5.623413e-03 1.000000e-02
## [6] 1.778279e-02 3.162278e-02 5.623413e-02 1.000000e-01 1.778279e-01
## [11] 3.162278e-01 5.623413e-01 1.000000e+00 1.778279e+00 3.162278e+00
## [16] 5.623413e+00 1.000000e+01 1.778279e+01 3.162278e+01 5.623413e+01
## [21] 1.000000e+02 1.778279e+02 3.162278e+02 5.623413e+02 1.000000e+03
(probValues <- 10^seq(-3, 0, 0.25))
## [1] 0.001000000 0.001778279 0.003162278 0.005623413 0.010000000
## [6] 0.017782794 0.031622777 0.056234133 0.100000000 0.177827941
## [11] 0.316227766 0.562341325 1.000000000
tabNegBinom <-
data.table(expand.grid(size = sizeValues, prob = probValues))
head(tabNegBinom)
## size prob
## 1: 0.001000000 0.001
## 2: 0.001778279 0.001
## 3: 0.003162278 0.001
## 4: 0.005623413 0.001
## 5: 0.010000000 0.001
## 6: 0.017782794 0.001
tail(tabNegBinom)
## size prob
## 1: 56.23413 1
## 2: 100.00000 1
## 3: 177.82794 1
## 4: 316.22777 1
## 5: 562.34133 1
## 6: 1000.00000 1
# Simulate
tabNegBinom <-
tabNegBinom[, .(Count = rnbinom(n = nObs, size = size, prob = prob)),
by = .(size, prob)]
head(tabNegBinom)
## size prob Count
## 1: 0.001 0.001 0
## 2: 0.001 0.001 0
## 3: 0.001 0.001 0
## 4: 0.001 0.001 0
## 5: 0.001 0.001 0
## 6: 0.001 0.001 0
tail(tabNegBinom)
## size prob Count
## 1: 1000 1 0
## 2: 1000 1 0
## 3: 1000 1 0
## 4: 1000 1 0
## 5: 1000 1 0
## 6: 1000 1 0
# Count zeroes
tabNegBinom <-
tabNegBinom[, .(nZero = sum(Count == 0)), by = .(size, prob)]
head(tabNegBinom)
## size prob nZero
## 1: 0.001000000 0.001 50
## 2: 0.001778279 0.001 49
## 3: 0.003162278 0.001 50
## 4: 0.005623413 0.001 47
## 5: 0.010000000 0.001 43
## 6: 0.017782794 0.001 47
tail(tabNegBinom)
## size prob nZero
## 1: 56.23413 1 50
## 2: 100.00000 1 50
## 3: 177.82794 1 50
## 4: 316.22777 1 50
## 5: 562.34133 1 50
## 6: 1000.00000 1 50
# Plot
ggplot(tabNegBinom, aes(size, prob, z = nZero)) +
geom_contour() +
# scale_y_sqrt() +
scale_x_log10() +
ggtitle("Number of zeroes observed v. mean",
"Negative Binomial distribution")