##---- SLIDE 3 --------------------------------------------------------
# these packages are needed to execute the demo
pkgs <- c("SuperLearner", "quadprog", "nloptr", "MASS", "ggplot2", "nnls")
# see what packages are currently installed
installed_packages <- row.names(installed.packages())
# loop over the needed packages
for(p in pkgs){
  # check if package is installed
  already_installed <- p %in% installed_packages
  # if not already installed, install it
  if(!already_installed){
    install.packages(p)
  }
  # and load package
  library(p, character.only = TRUE)
}

##---- SLIDE 4 --------------------------------------------------------
# set a seed for reproducibility
set.seed(212)
# sample size
n <- 100
# W1 has Normal distribution, W2 has Uniform distribution
W1 <- rnorm(n = n); W2 <- runif(n = n)
# pr(A = 1 | W) is logistic linear in W
g1W <- plogis(-1 + W1 - W2 + W1*W2)
# generate binary treatment
A <- rbinom(n = n, size = 1, prob = g1W)
# E[Y | A, W] is linear in A, W
QAW <- W1 - W2 + A
# generate outcome by adding random error with N(0,1) distribution
Y <- QAW + rnorm(n = n)

##---- SLIDE 5 --------------------------------------------------------
# fit Q1 on full data
Q1_fit <- glm(Y ~ W1 + W2 + A)
# get predicted value for each observation
Q1n <- as.numeric(predict(Q1_fit))
# compute mean squared-error
risk_Q1 <- mean((Y - Q1n)^2)
# fit Q2 on full data
Q2_fit <- glm(Y ~ I(W1^2) + I(W2^2) + W1*W2 + W1*A + W2*A)
# get predicted value for each observation
Q2n <- as.numeric(predict(Q2_fit))
# compute mean squared-error
risk_Q2 <- mean((Y - Q2n)^2)

##---- SLIDE 6 --------------------------------------------------------
# first 50 observations go in split 1, second 50 in split 2
split <- c(rep(1, n/2), rep(2, n/2))
# make a data.frame of all the data
full_data <- data.frame(W1 = W1, W2 = W2, A = A, Y = Y)
# data.frame of only split 1 observations
split1 <- subset(full_data, split == 1)
# data.frame of only split 2 observations
split2 <- subset(full_data, split == 2)
# fit Q1 in split 1
Q1_fit_split1 <- glm(Y ~ W1 + W2 + A, data = split1)
# predict from split 1 fit in split 2
Q1n_split2 <- predict(Q1_fit_split1, newdata = split2)
# estimate of MSE based on split 2
risk_Q1_split2 <- mean((Y[split == 2] - Q1n_split2)^2)
# fit Q1 in split 2
Q1_fit_split2 <- glm(Y ~ W1 + W2 + A, data = split2)
# predict from split 2 fit in split 1
Q1n_split1 <- predict(Q1_fit_split2, newdata = split1)
# estimate of MSE based on split 1
risk_Q1_split1 <- mean((Y[split == 1] - Q1n_split1)^2)
# average the two estimates
cv_risk_Q1 <- (risk_Q1_split1 + risk_Q1_split2) / 2

##---- SLIDE 7 --------------------------------------------------------
# fit Q2 in split 1
Q2_fit_split1 <- glm(Y ~ I(W1^2) + I(W2^2) + W1*W2 + W1*A + W2*A,
                     data = split1)
# predict from split 1 fit in split 2
Q2n_split2 <- predict(Q2_fit_split1, newdata = split2)
# estimate of MSE based on split 2
risk_Q2_split2 <- mean((Y[split == 2] - Q2n_split2)^2)
# fit Q2 in split 2
Q2_fit_split2 <- glm(Y ~ I(W1^2) + I(W2^2) + W1*W2 + W1*A + W2*A,
                     data = split2)
# predict from split 2 fit in split 1
Q2n_split1 <- predict(Q2_fit_split2, newdata = split1)
# estimate of MSE based on split 1
risk_Q2_split1 <- mean((Y[split == 1] - Q2n_split1)^2)
# average the two estimates
cv_risk_Q2 <- (risk_Q2_split1 + risk_Q2_split2) / 2

##---- SLIDE 10 -------------------------------------------------------
# a function to generate a prediction from an ensemble of Q1 and Q2
# Q1_fit and Q2_fit are fitted glm objects
# newdata is a data.frame that you want to get predictions on
# Q1_weight is the weight given to the prediction from Q1_fit
# Q2_weight is 1 - Q1_weight
ensemble_predict <- function(Q1_fit, Q2_fit, newdata, Q1_weight,
                             Q2_weight = 1 - Q1_weight){
  # make sure weights approximately sum to 1
  stopifnot(abs(1 - Q1_weight - Q2_weight) < 1e-5)
  # prediction from Q1_fit on newdata
  Q1n <- predict(Q1_fit, newdata = newdata)
  # prediction from Q2_fit on newdata
  Q2n <- predict(Q2_fit, newdata = newdata)
  # weighted combination
  ensemble_prediction <- Q1_weight * Q1n + Q2_weight * Q2n
  return(as.numeric(ensemble_prediction))
}
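##---- ASIDE ----------------------------------------------------------
# Aside (not on the slides): the discrete super learner simply selects
# whichever candidate has the smallest cross-validated risk, so with the
# quantities computed on slides 6 and 7 it is a one-liner.
c(Q1 = cv_risk_Q1, Q2 = cv_risk_Q2)
# name of the candidate chosen by the cross-validation selector
names(which.min(c(Q1 = cv_risk_Q1, Q2 = cv_risk_Q2)))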
##---- SLIDE 11 -------------------------------------------------------
# just get data for the first observation
obs1 <- full_data[1, ]
obs1
# get ensemble prediction for first observation using equal weights
ensemble_predict(Q1_fit = Q1_fit, Q2_fit = Q2_fit, newdata = obs1,
                 Q1_weight = 0.5)
# should be the same as
Q1n[1] * 0.5 + Q2n[1] * 0.5

##---- SLIDE 12 -------------------------------------------------------
# a grid of 500 weights between 0 and 1
weight_sequence <- seq(from = 0, to = 1, length = 500)
# ensemble prediction for observation 1 at each weight
obs1_predict <- sapply(weight_sequence, ensemble_predict,
                       Q1_fit = Q1_fit, Q2_fit = Q2_fit, newdata = obs1)
# set graphical parameters
par(mar = c(4.1, 4.1, 0.5, 0.5), mgp = c(1.3, 0.45, 0))
plot(obs1_predict ~ weight_sequence, type = "l", lwd = 2, bty = "n",
     xlab = expression(bar(Q)[list("n", 1)] * " weight"),
     ylab = "Prediction", cex.lab = 0.65, cex.axis = 0.65,
     ylim = c(-0.6, -0.35))
points(y = obs1_predict[1], x = 0, pch = 19, col = "red", cex = 1.2)
points(y = obs1_predict[500], x = 1, pch = 19, col = "blue", cex = 1.2)

##---- SLIDE 14 -------------------------------------------------------
# get ensemble predictions on split 2 from fits in split 1
enspred_split2 <- ensemble_predict(Q1_fit = Q1_fit_split1,
                                   Q2_fit = Q2_fit_split1,
                                   Q1_weight = 0.5, newdata = split2)
# estimate of MSE based on split 2
risk_ens_split2 <- mean((Y[split == 2] - enspred_split2)^2)
# get ensemble predictions on split 1 from fits in split 2
enspred_split1 <- ensemble_predict(Q1_fit = Q1_fit_split2,
                                   Q2_fit = Q2_fit_split2,
                                   Q1_weight = 0.5, newdata = split1)
# estimate of MSE based on split 1
risk_ens_split1 <- mean((Y[split == 1] - enspred_split1)^2)
# average the two estimates
cv_risk_ens <- (risk_ens_split1 + risk_ens_split2) / 2

##---- SLIDE 15 -------------------------------------------------------
# define an empty vector of cv risk values
cv_risks <- rep(NA, length(weight_sequence))
# for each value of weights compute the cv risk
for(i in seq_along(weight_sequence)){
  # get ensemble predictions on split 2 from fits in split 1
  enspred_split2 <- ensemble_predict(Q1_fit = Q1_fit_split1,
                                     Q2_fit = Q2_fit_split1,
                                     Q1_weight = weight_sequence[i],
                                     newdata = split2)
  # estimate of MSE based on split 2
  risk_ens_split2 <- mean((Y[split == 2] - enspred_split2)^2)
  # get ensemble predictions on split 1 from fits in split 2
  enspred_split1 <- ensemble_predict(Q1_fit = Q1_fit_split2,
                                     Q2_fit = Q2_fit_split2,
                                     Q1_weight = weight_sequence[i],
                                     newdata = split1)
  # estimate of MSE based on split 1
  risk_ens_split1 <- mean((Y[split == 1] - enspred_split1)^2)
  # save cv risk in cv_risks vector
  cv_risks[i] <- (risk_ens_split1 + risk_ens_split2) / 2
}

##---- SLIDE 16 -------------------------------------------------------
# set some graphical parameters
par(mar = c(4.1, 4.1, 0.5, 0.5), mgp = c(1.3, 0.45, 0))
# plot all risks
plot(cv_risks ~ weight_sequence, bty = "n",
     xlab = expression(bar(Q)[list("n", 1)] * " weight"),
     ylab = "CV-risk of ensemble", cex.lab = 0.65, cex.axis = 0.65)
# add solid point where risk is lowest
min_idx <- which.min(cv_risks)
points(y = cv_risks[min_idx], x = weight_sequence[min_idx],
       col = "blue", pch = 19, cex = 1.5)
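##---- ASIDE ----------------------------------------------------------
# Aside (not on the slides): the grid search above only approximates the
# best weight. The same CV risk can be wrapped in a function of the weight
# (cv_risk_of_weight is a name introduced here for illustration) and
# minimized directly with base R's optimize().
cv_risk_of_weight <- function(w){
  pred2 <- ensemble_predict(Q1_fit = Q1_fit_split1, Q2_fit = Q2_fit_split1,
                            Q1_weight = w, newdata = split2)
  pred1 <- ensemble_predict(Q1_fit = Q1_fit_split2, Q2_fit = Q2_fit_split2,
                            Q1_weight = w, newdata = split1)
  (mean((Y[split == 2] - pred2)^2) + mean((Y[split == 1] - pred1)^2)) / 2
}
# should land close to weight_sequence[min_idx] from slide 16
optimize(cv_risk_of_weight, interval = c(0, 1))$minimum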
##---- SLIDE 21 -------------------------------------------------------
# set a seed for reproducibility
set.seed(123)
# basic call to SuperLearner to estimate E[Y | A, W]
sl_fit1 <- SuperLearner(Y = Y, X = data.frame(W1 = W1, W2 = W2, A = A),
                        SL.library = c("SL.glm", "SL.mean"),
                        family = gaussian(), method = "method.CC_LS",
                        cvControl = list(V = 2), verbose = FALSE)
# see what the printed output contains
sl_fit1

##---- SLIDE 23 -------------------------------------------------------
# what objects are in the output
names(sl_fit1)

##---- SLIDE 25 -------------------------------------------------------
# a new observation
new_obs <- data.frame(A = 1, W1 = 0, W2 = 0)
# call predict using sl_fit1
new_predictions <- predict(sl_fit1, newdata = new_obs)
# super learner prediction is in $pred
new_predictions$pred
# the prediction made by each estimator is in $library.predict
new_predictions$library.predict

##---- SLIDE 27 -------------------------------------------------------
SL.glm

##---- SLIDE 29 -------------------------------------------------------
SL.mean

##---- SLIDE 30 -------------------------------------------------------
SL.quadglm <- function(Y, X, newX, family, ...){
  # this line assumes X will have columns named W1, W2, and A
  # we are also ignoring the family input, assuming that we will be
  # using this function for linear regression only
  quadglm_fit <- glm(Y ~ W1 + I(W1^2) + W2 + I(W2^2) + A, data = X)
  # get predictions on newX
  pred <- predict(quadglm_fit, newdata = newX)
  # format the output as a named list
  fit <- list(fitted_model = quadglm_fit, message = "Hello world!")
  out <- list(fit = fit, pred = pred)
  # give the object a class
  class(out$fit) <- "SL.quadglm"
  # return the output
  return(out)
}

##---- SLIDE 31 -------------------------------------------------------
# set a seed for reproducibility
set.seed(123)
# call to SuperLearner that includes our own wrapper
sl_fit2 <- SuperLearner(Y = Y, X = data.frame(W1 = W1, W2 = W2, A = A),
                        SL.library = c("SL.glm", "SL.mean", "SL.quadglm"),
                        family = gaussian(), method = "method.CC_LS",
                        cvControl = list(V = 2), verbose = FALSE)

##---- SLIDE 32 -------------------------------------------------------
sl_fit2$fitLibrary$SL.quadglm$message
predict(sl_fit2, newdata = new_obs)

##---- SLIDE 33 -------------------------------------------------------
predict.SL.quadglm <- function(object, newdata, ...){
  # object will be the $fit entry of the output from SL.quadglm
  # since class(object$fitted_model) = "glm", this will in turn call
  # predict.glm to obtain predictions
  pred <- predict(object$fitted_model, newdata = newdata)
  # return predictions
  return(pred)
}

##---- SLIDE 34 -------------------------------------------------------
# this calls predict.SuperLearner(object = sl_fit2, ...)
sl2_pred <- predict(sl_fit2, newdata = new_obs)
# super learner prediction
sl2_pred$pred
# library predictions
sl2_pred$library.predict
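##---- ASIDE ----------------------------------------------------------
# Aside (not on the slides): each wrapper's cross-validated risk and the
# weight it receives in the convex combination are stored on the fit
# object, which shows why the ensemble looks the way it does.
cbind(cvRisk = sl_fit2$cvRisk, weight = sl_fit2$coef)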
##---- SLIDE 35 -------------------------------------------------------
# define wrapper function; the body below is one possible completion of
# the slide's exercise, mirroring SL.quadglm -- here Y is the treatment
# and X is assumed to have columns named W1 and W2
SL.truepropensity <- function(Y, X, newX, family, ...){
  # fit logistic regression model with interaction between W1 & W2
  truepropensity_fit <- glm(Y ~ W1*W2, data = X, family = binomial())
  # get predictions on newX (on the probability scale)
  pred <- predict(truepropensity_fit, newdata = newX, type = "response")
  # format output
  fit <- list(fitted_model = truepropensity_fit)
  out <- list(fit = fit, pred = pred)
  class(out$fit) <- "SL.truepropensity"
  # return output
  return(out)
}
# define predict method
predict.SL.truepropensity <- function(object, newdata, ...){
  # predict from object
  pred <- predict(object$fitted_model, newdata = newdata, type = "response")
  # return predictions
  return(pred)
}

##---- SLIDE 36 -------------------------------------------------------
# one possible completion of the slide's exercise: estimate the propensity
# score pr(A = 1 | W) using the wrapper defined on slide 35
sl_fit3 <- SuperLearner(Y = A, X = data.frame(W1 = W1, W2 = W2),
                        family = binomial(),
                        SL.library = c("SL.glm", "SL.mean",
                                       "SL.truepropensity"),
                        # use negative log-likelihood loss
                        method = "method.CC_nloglik",
                        cvControl = list(V = 2), verbose = FALSE)

##---- SLIDE 43 -------------------------------------------------------
screen.corP

##---- SLIDE 44 -------------------------------------------------------
sl_fit4 <- SuperLearner(Y = Y, X = data.frame(W1 = W1, W2 = W2, A = A),
                        SL.library = list(c("SL.glm", "All"),
                                          c("SL.glm", "screen.corP"),
                                          c("SL.mean", "All")),
                        family = gaussian(), method = "method.CC_LS",
                        cvControl = list(V = 2), verbose = FALSE)

##---- SLIDE 45 -------------------------------------------------------
sl_fit4

##---- SLIDE 46 -------------------------------------------------------
screen_confounders <- function(Y, X, family, trt_name = "A",
                               change = 0.1, ...){
  # fit treatment-only model & get coefficient for treatment
  fit_init <- glm(as.formula(paste0("Y ~ ", trt_name)), data = X,
                  family = family)
  trt_coef <- fit_init$coef[2]
  # identify which column of X is the trt variable
  trt_col <- which(colnames(X) == trt_name)
  # set all variables except trt to not be included initially
  include <- rep(FALSE, ncol(X)); include[trt_col] <- TRUE
  # loop over variables in X other than trt
  for(j in seq_len(ncol(X))[-trt_col]){
    # find variable name
    var_name <- colnames(X)[j]
    # fit trt + variable model, get trt coefficient
    fit <- glm(as.formula(paste0("Y ~ ", trt_name, " + ", var_name)),
               data = X, family = family)
    new_trt_coef <- fit$coef[2]
    # check if trt coefficient changed more than the specified amount
    include[j] <- abs((new_trt_coef - trt_coef) / trt_coef) > change
  }
  return(include)
}

##---- SLIDE 47 -------------------------------------------------------
sl_fit5 <- SuperLearner(Y = Y, X = data.frame(W1 = W1, W2 = W2, A = A),
                        SL.library = list(c("SL.glm", "screen_confounders"),
                                          c("SL.mean", "All")),
                        family = gaussian(), method = "method.CC_LS",
                        cvControl = list(V = 2), verbose = FALSE)

##---- SLIDE 48 -------------------------------------------------------
# set seed for reproducibility
set.seed(1234)
# two-fold cross-validated evaluation of two-fold super learner
cv_sl <- CV.SuperLearner(Y = Y, X = data.frame(W1 = W1, W2 = W2, A = A),
                         family = gaussian(), method = "method.CC_LS",
                         SL.library = c("SL.glm", "SL.mean", "SL.quadglm"),
                         cvControl = list(V = 2),
                         innerCvControl = list(list(V = 2), list(V = 2)))
# plot the results
plot(cv_sl)
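##---- ASIDE ----------------------------------------------------------
# Aside (not on the slides): summary() on a CV.SuperLearner object tabulates
# the cross-validated risk of the super learner, the discrete super learner,
# and each candidate, complementing the plot above.
summary(cv_sl)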