## Author: Eric Morenz
## Date: 2020/07/23
## Fitting parametric models, plotting different summaries, kernel methods

library(dplyr) ## If you get an error. Please run: install.packages("&package_name")
library(readxl) ## To read .xls files
library(survival) ## Classic and well known
library(flexsurv) ## mostly an extension of survival package.
library(biostat3) ### One package we can use for kernel estimate of hazard
## library(kernhaz) ### Another package for kernel estimates of the hazard

setwd(here::here())

### Reading into memory part of the data
df <- read_excel("PS_Datasets/pbc.xls") %>% 
  select(N, X, D, Z1) %>% 
  rename(ID = N,
         Time = X,
         Death = D,
         Trt = Z1) %>% 
  filter(ID < 200) 

#### Creating a survival object. This is one way to do it.
surv_obj <- df %>% 
  with(Surv(time=Time, event=Death, type="right"))

#### Same Kaplan-Meier code as before but with two groups
survfit_obj <- survfit(surv_obj ~ Trt, data=df) ## 
## Kaplan-Meier plot without CIs
plot(survfit_obj, col=c( 'pink', 'blue'), conf.int=FALSE,
     lty=c("solid", "dashed"), 
     main="Kaplan-Meier Survival Estimate",
     ylab="Survival Probability", xlab="Time")
legend("topright", 
       c("Treatment Group 1", "Treatment Group 2"), 
       col=c("pink", "blue"), 
       lty=c("solid", "solid"), cex=0.9, bty="n")

#################### What about the cumulative hazard #########
plot(survfit_obj, 
     fun="cumhaz",
     col=c( 'pink', 'blue'), lwd=2, 
     xlab="Days", ylab="Cumulative Hazard", main="Nelson−Aalen cumulative hazard estimates")
### Without a legend

################## ################## ################## 
########### Let's fit a few parametric models ##########
################## ################## ################## 
exponential_fit <- flexsurvreg(surv_obj ~ 1, dist="exponential") ## from flexsurv
weibull_fit <- flexsurvreg(surv_obj ~ 1, dist="weibull")
## splinefit <- flexsurvspline(surv_obj ~ 1) ## flexsurv has a spline fit aswell

########### Let's compare the model fit for the survival function

plot(survfit(surv_obj ~ 1, data=df), conf.int=FALSE, xlab="Time", 
     ylab="Survival probability", col="black", lty="solid", lwd=2) ## nonparametric
lines(exponential_fit, ci=FALSE,  col="darkgreen", lty="dotted", lwd=2)
lines(weibull_fit,  ci=FALSE, col="orange", lty="dashed", lwd=2)
legend(x=3300, y=1, bty="n",
       c("Nonparametric", "Exponential", "Weibull"), 
       col=c("black", "darkgreen", "orange"), 
       lty=c("solid", "dotted", "dashed"), lwd=rep(3, 4), cex=0.9)

########## For parametric models we can plot the hazard estimates
########## This is not possible for the nonparametric KM estimator (why?)
########## We will use smoothing techniques to get an estimate of the hazard fxn

### Ignoring groups
survfit_obj2 <- survfit(surv_obj ~ 1, data=df) 
kernelfit_haz_bias <- smoothHaz(survfit_obj2, n.grid=3) ## randomly chosen
kernelfit_haz_variance <- smoothHaz(survfit_obj2, n.grid=50) ## randomly chosen

### Tuning of the bandiwdth is a little beyond the course. 
### But I would be happy to discuss techniques in Slack

### We now plot the 'nonparametric' estimate of the hazard with parametric hazards
time_seq <- seq(1, 5000, by=10)
shape <- exp(weibull_fit$coef[1])
scale <- exp(weibull_fit$coef[2])

plot(kernelfit_haz_variance)### Plot of the hazard with respect to the epanechnikov kernel
lines(kernelfit_haz_bias, lwd=3)
lines(time_seq, rep(exp(exponential_fit$coeff), length(time_seq)),
      col='darkgreen', lty="dotted", lwd=2) ## exponential
lines(time_seq, hweibull(time_seq, shape, scale, log = FALSE),
      col='orange', lty="dashed", lwd=2) ## weibull
legend(x=0, y=0.00038, bty="n", lwd=c(3,1,1,1),
       c("Larger Bias", "Larger Variance", "Exponential", "Weibull"), 
       col=c("black", "black", "darkgreen", "orange"), 
       lty=c("solid", "solid", "dotted", "dashed"), cex=0.9)