#################################################################################
#################################################################################

# stat 206, winter 2021
# R code for likelihood and bayesian analyses
#   in the length of stay (LoS) case study
# (lecture notes part 6 pp. 77+)
# dd 17-18-19 feb 2021

#################################################################################

# unbuffer the output (if relevant)

# set your working directory to a location
#   where you may, if you wish, export data
#   and/or plots from R

# for me on my desktop at home this is

# setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2021/LoS-Analysis' )

#################################################################################

##### the 7 pillars of statistical data science

##### (1) probability calculations

# this involves two distributions (below), Poisson and Gamma

#################################################################################

##### (2) design of data-gathering activities

# already accomplished in this case study

#################################################################################

##### (3) data curation

# define the data set and sample size

y <- c( 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 6 )

print( n <- length( y ) )

# [1] 14

##### no fancy data curation needed here

#################################################################################

##### (4) graphical and numerical descriptive summaries of the data set

# a histogram: note the use of the 'breaks' option
#   to get the bars centered on the data values

-1.5 + 1:8

# [1] -0.5  0.5  1.5  2.5  3.5  4.5  5.5  6.5

hist( y, prob = T, breaks = -1.5 + 1:8, main = 'LoS Case Study',
  xlab = 'Length of Stay', col = 'darkcyan', ylab = 'PMF' )

# a density trace

plot( density( y, from = 0 ), main = '', xlab = 'Length of Stay',
  col = 'darkcyan', lwd = 3 )

# both plots lie to us by not highlighting the discreteness

# spike plots are a bit of a pain to make,
#   but they clearly show the discreteness

plot( 0, 0, type = 'n', xlim = c( 0, 6 ), xlab = 'Length of Stay',
  ylim = c( 0, 5 / 14 ), ylab = 'PMF' )

table( y ) / n

# y
#          0          1          2          3          4          6
# 0.07142857 0.35714286 0.28571429 0.14285714 0.07142857 0.07142857

y.probabilities <- unname( table( y ) / n )

print( y.probabilities <- c( y.probabilities[ 1:5 ], 0, y.probabilities[ 6 ] ) )

# [1] 0.07142857 0.35714286 0.28571429 0.14285714 0.07142857 0.00000000 0.07142857

for ( i in 1:7 ) {

  segments( i - 1, 0, i - 1, y.probabilities[ i ], lwd = 3, col = 'darkcyan' )

}

par( mfrow = c( 3, 1 ) )

hist( y, prob = T, breaks = -1.5 + 1:8, main = 'LoS Case Study',
  xlab = 'Length of Stay', col = 'darkcyan', ylab = 'PMF',
  xlim = c( -0.5, 8 ) )

plot( density( y, from = 0 ), main = '', xlab = 'Length of Stay',
  col = 'darkcyan', lwd = 3, xlim = c( -0.5, 8 ) )

plot( 0, 0, type = 'n', xlim = c( -0.5, 8 ), xlab = 'Length of Stay',
  ylim = c( 0, 5 / 14 ), ylab = 'PMF' )

for ( i in 1:7 ) {

  segments( i - 1, 0, i - 1, y.probabilities[ i ], lwd = 3, col = 'darkcyan' )

}

par( mfrow = c( 1, 1 ) )

# it seems that (using the cheating approach) a Poisson
#   sampling distribution would be reasonable here,
#   at least as a starting point in the modeling
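# (an optional aside, not from the lecture notes) a quicker route to a spike
#   plot is to let R's 'table' function do the counting and then plot the
#   resulting PMF with vertical lines; the one wrinkle is that values with
#   zero observed frequency (here y = 5) are simply omitted from the axis

plot( table( y ) / n, type = 'h', lwd = 3, col = 'darkcyan',
  xlab = 'Length of Stay', ylab = 'PMF', main = 'LoS Case Study' )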
# numerical descriptive summaries:
#   the sample size n was already defined above

print( c( sd( y ), var( y ), summary( y ) ) )

#       sd variance     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
# 1.542440 2.379121 0.000000 1.000000 2.000000 2.071429 2.750000 6.000000

##### sanity check on the Poisson assumption:

# in the Poisson sampling model the mean and variance are both equal
#   to lambda

# the *variance-to-mean-ratio* for the Poisson is therefore 1

# here y.bar = 2.07 and the sample variance s^2 = 2.38 (decently close)

print( vtmr <- var( y ) / mean( y ) )

# [1] 1.148541

#################################################################################

##### (5) likelihood and bayesian inference

##### let's explore the family of sampling distributions first

y.grid <- 0:20

# the PMF in R for the Poisson family is called 'dpois'

plot( y.grid, dpois( y.grid, lambda = 1 ), type = 'l' )

# not beautiful

M <- 1000000

set.seed( 17 )

y.star.1 <- rpois( M, lambda = 1 )

par( mfrow = c( 3, 1 ) )

hist( y.star.1, breaks = -1.5 + 1:20, main = 'Poisson Family', xlab = 'y',
  ylab = 'PMF', prob = T, col = 'cornflowerblue', ylim = c( 0, 0.35 ) )

text( 10, 0.2, 'lambda = 1', col = 'cornflowerblue', cex = 1.25 )

y.star.2 <- rpois( M, lambda = 2 )

hist( y.star.2, breaks = -1.5 + 1:20, main = 'Poisson Family', xlab = 'y',
  ylab = 'PMF', prob = T, col = 'orange', ylim = c( 0, 0.35 ) )

text( 10, 0.2, 'lambda = 2', col = 'orange', cex = 1.25 )

y.star.3 <- rpois( M, lambda = 5 )

hist( y.star.3[ y.star.3 <= 15 ], breaks = -1.5 + 1:20, main = 'Poisson Family',
  xlab = 'y', ylab = 'PMF', prob = T, col = 'red', ylim = c( 0, 0.35 ) )

text( 10, 0.2, 'lambda = 5', col = 'red', cex = 1.25 )

par( mfrow = c( 1, 1 ) )

# so as lambda increases, the center (mean) of the PMF increases,
#   the spread (variance or SD) also increases, and the
#   long right-hand-tail (skewness) decreases; the PMF looks
#   more and more like a Normal distribution on the non-negative integers

#################################################################################

##### next the likelihood inferential story

# compute lambda.hat.mle to see where the likelihood is concentrated

print( s <- sum( y ) )

# [1] 29

print( lambda.hat.mle <- s / n )

# [1] 2.071429

# visualize the likelihood and log likelihood functions

# let's set the grid by trial and error

lambda.grid.1 <- seq( 0.1, 2 * lambda.hat.mle, length = 500 )

poisson.likelihood <- function( lambda, y ) {

  s <- sum( y )
  n <- length( y )

  likelihood <- lambda^s * exp( - n * lambda )

  return( likelihood )

}

# this could produce underflow or even overflow with large s and n values,
#   but let's see if we're lucky here

poisson.log.likelihood <- function( lambda, y ) {

  s <- sum( y )
  n <- length( y )

  log.likelihood <- s * log( lambda ) - n * lambda

  return( log.likelihood )

}

plot( lambda.grid.1, poisson.likelihood( lambda.grid.1, y ), type = 'l',
  lwd = 3, col = 'cornflowerblue', xlab = 'lambda', ylab = 'Likelihood',
  main = 'LoS Case Study' )

# no underflow here, with s = 29 and n = 14
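# (an optional aside, not from the lecture notes) if s and n were large
#   enough for 'poisson.likelihood' to underflow or overflow, a standard
#   workaround is to stay on the log scale and exponentiate only after
#   subtracting the maximum of the log likelihood over the grid; this
#   rescales the curve to a maximum of 1 without changing its shape
#   (the function name below is just for illustration)

scaled.poisson.likelihood <- function( lambda, y ) {

  log.likelihood <- sum( y ) * log( lambda ) - length( y ) * lambda

  return( exp( log.likelihood - max( log.likelihood ) ) )

}

plot( lambda.grid.1, scaled.poisson.likelihood( lambda.grid.1, y ), type = 'l',
  lwd = 3, col = 'cornflowerblue', xlab = 'lambda', ylab = 'Scaled Likelihood',
  main = 'LoS Case Study' )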
# we can tighten the grid to get more visual information
#   and look at the log likelihood function at the same time

lambda.grid.2 <- seq( 1, 3.5, length = 500 )

par( mfrow = c( 2, 1 ) )

plot( lambda.grid.2, poisson.likelihood( lambda.grid.2, y ), type = 'l',
  lwd = 3, col = 'cornflowerblue', xlab = 'lambda', ylab = 'Likelihood',
  main = 'LoS Case Study' )

abline( v = lambda.hat.mle, lwd = 3, lty = 3, col = 'red' )

legend( 2.75, 3e-04, legend = c( 'Likelihood', 'MLE' ),
  col = c( 'cornflowerblue', 'red' ), lty = c( 1, 3 ), lwd = 3 )

plot( lambda.grid.2, poisson.log.likelihood( lambda.grid.2, y ), type = 'l',
  lwd = 3, col = 'cornflowerblue', xlab = 'lambda', ylab = 'Log Likelihood',
  main = '' )

abline( v = lambda.hat.mle, lwd = 3, lty = 3, col = 'red' )

legend( 2.25, -11, legend = c( 'Log Likelihood', 'MLE' ),
  col = c( 'cornflowerblue', 'red' ), lty = c( 1, 3 ), lwd = 3 )

par( mfrow = c( 1, 1 ) )

# already with a sample size of only n = 14 the likelihood function
#   (via the bayesian central limit theorem) looks pretty close to gaussian

# get the observed information and build a 99.9% CI for lambda

# (document camera notes) the estimated standard error
#   for the MLE turns out to be

print( se.hat.lambda.hat.mle <- sqrt( lambda.hat.mle / n ) )

# [1] 0.3846546

alpha <- 0.001

print( z.alpha <- qnorm( 1 - alpha / 2 ) )

# [1] 3.290527

print( likelihood.confidence.interval <- c(
  lambda.hat.mle - z.alpha * se.hat.lambda.hat.mle,
  lambda.hat.mle + z.alpha * se.hat.lambda.hat.mle ) )

# [1] 0.8057122 3.3371449

#################################################################################

##### now the bayesian inferential story

# the conjugate prior family in this sampling model is the
#   Gamma( alpha, beta ) distributions for alpha > 0 and beta > 0;
#   the PDF is available in R with the built-in 'dgamma' function

#################################################################################

# let's get to know this family graphically

lambda.grid.3 <- seq( 0, 10, length = 500 )

# start with ( alpha, beta ) = ( 1, 1 )

plot( lambda.grid.3, dgamma( lambda.grid.3, 1, 1 ), type = 'l', lwd = 3,
  col = 'black', xlab = 'lambda', ylab = 'PDF',
  main = 'Poisson Conjugate Prior Family' )

# now compare with ( alpha, beta ) = ( 2, 1 )

lines( lambda.grid.3, dgamma( lambda.grid.3, 2, 1 ), lwd = 3, col = 'red' )

# interesting; what does increasing alpha, holding beta constant,
#   seem to do?

# let's re-grid

lambda.grid.4 <- seq( 0, 20, length = 500 )

plot( lambda.grid.4, dgamma( lambda.grid.4, 1, 1 ), type = 'l', lwd = 3,
  col = 'black', xlab = 'lambda', ylab = 'PDF',
  main = 'Gamma Conjugate Prior Family For Poisson', lty = 1 )

lines( lambda.grid.4, dgamma( lambda.grid.4, 2, 1 ), lwd = 3, col = 'red',
  lty = 2 )

lines( lambda.grid.4, dgamma( lambda.grid.4, 5, 1 ), lwd = 3, col = 'darkcyan',
  lty = 3 )

lines( lambda.grid.4, dgamma( lambda.grid.4, 10, 1 ), lwd = 3, col = 'maroon',
  lty = 4 )

legend( 10, 0.6, legend = c( '( alpha, beta ) = ( 1, 1 )',
  '( alpha, beta ) = ( 2, 1 )', '( alpha, beta ) = ( 5, 1 )',
  '( alpha, beta ) = ( 10, 1 )' ),
  col = c( 'black', 'red', 'darkcyan', 'maroon' ),
  lty = c( 1, 2, 3, 4 ), lwd = 3 )

# alpha definitely controls shape and center

# now let's hold alpha = 5 (say) and make beta grow

lambda.grid.5 <- seq( 0, 12.5, length = 500 )

plot( lambda.grid.5, dgamma( lambda.grid.5, 5, 1 ), type = 'l', lwd = 3,
  col = 'black', xlab = 'lambda', ylab = 'PDF',
  main = 'Gamma Conjugate Prior Family For Poisson', lty = 1 )

lines( lambda.grid.5, dgamma( lambda.grid.5, 5, 2 ), lwd = 3, col = 'red',
  lty = 2 )

lines( lambda.grid.5, dgamma( lambda.grid.5, 5, 5 ), lwd = 3, col = 'darkcyan',
  lty = 3 )

lines( lambda.grid.5, dgamma( lambda.grid.5, 5, 10 ), lwd = 3, col = 'maroon',
  lty = 4 )

# oops; need to redefine 'ylim'

plot( lambda.grid.5, dgamma( lambda.grid.5, 5, 1 ), type = 'l', lwd = 3,
  col = 'black', xlab = 'lambda', ylab = 'PDF',
  main = 'Gamma Conjugate Prior Family For Poisson', lty = 1,
  ylim = c( 0, 2 ) )

lines( lambda.grid.5, dgamma( lambda.grid.5, 5, 2 ), lwd = 3, col = 'red',
  lty = 2 )

lines( lambda.grid.5, dgamma( lambda.grid.5, 5, 5 ), lwd = 3, col = 'darkcyan',
  lty = 3 )

lines( lambda.grid.5, dgamma( lambda.grid.5, 5, 10 ), lwd = 3, col = 'maroon',
  lty = 4 )

legend( 6, 1, legend = c( '( alpha, beta ) = ( 5, 1 )',
  '( alpha, beta ) = ( 5, 2 )', '( alpha, beta ) = ( 5, 5 )',
  '( alpha, beta ) = ( 5, 10 )' ),
  col = c( 'black', 'red', 'darkcyan', 'maroon' ),
  lty = c( 1, 2, 3, 4 ), lwd = 3 )

# beta clearly controls *spread*, and in an inverse way:
#   as beta goes up, the spread goes down
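# (an optional numerical check, not from the lecture notes) the visual
#   impressions above match the Gamma( alpha, beta ) moment formulas
#   mean = alpha / beta and SD = sqrt( alpha ) / beta: increasing alpha
#   with beta fixed moves the center up, while increasing beta with alpha
#   fixed pulls both the center and the spread down; a quick Monte Carlo
#   confirmation for ( alpha, beta ) = ( 5, 2 ) (the seed and object name
#   below are arbitrary)

set.seed( 206 )

lambda.draws <- rgamma( 100000, shape = 5, rate = 2 )

print( c( mean( lambda.draws ), 5 / 2 ) )        # simulated versus theoretical mean

print( c( sd( lambda.draws ), sqrt( 5 ) / 2 ) )  # simulated versus theoretical SD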
#################################################################################

##### returning to the bayesian inferential analysis

# LI prior: choose beta = epsilon = 0.0001 and alpha / beta = 5 (prior mean)
#   initially, based on class discussion, so alpha = 0.0005;
#   the 'utah' prior we found on the web would instead set
#   alpha / beta = 16 (prior mean)

epsilon <- 0.0001

alpha.prior.class <- 5 * epsilon
beta.prior <- epsilon

alpha.prior.utah <- 16 * epsilon

alpha.likelihood <- s + 1
beta.likelihood <- n

alpha.posterior.class <- alpha.prior.class + s
beta.posterior <- beta.prior + n

alpha.posterior.utah <- alpha.prior.utah + s

plot( lambda.grid.2, dgamma( lambda.grid.2, shape = alpha.likelihood,
  rate = beta.likelihood ), type = 'l', lwd = 3, col = 'black',
  xlab = 'lambda', ylab = 'PDF', lty = 1 )

lines( lambda.grid.2, dgamma( lambda.grid.2, shape = alpha.prior.class,
  rate = beta.prior ), lwd = 3, col = 'red', lty = 2 )

lines( lambda.grid.2, dgamma( lambda.grid.2, shape = alpha.posterior.class,
  rate = beta.posterior ), lwd = 3, col = 'chocolate1', lty = 3 )

lines( lambda.grid.2, dgamma( lambda.grid.2, shape = alpha.prior.utah,
  rate = beta.prior ), lwd = 3, col = 'darkcyan', lty = 4 )

lines( lambda.grid.2, dgamma( lambda.grid.2, shape = alpha.posterior.utah,
  rate = beta.posterior ), lwd = 3, col = 'cornflowerblue', lty = 5 )

# the utah prior has essentially the same effect as the class prior;
#   both are low-information (LI) choices with virtually identical
#   posterior consequences

# the usual bayesian point estimate would be the posterior mean:

print( posterior.mean <- alpha.posterior.class / beta.posterior )

# [1] 2.071449

# almost identical to the sample mean
#   y.bar = lambda.hat.mle = 2.071429

# the two should be really close under the LI prior

# the usual bayesian give-or-take for the bayesian point estimate
#   would be the posterior sd:

print( posterior.sd <- sqrt( alpha.posterior.class ) / beta.posterior )

# [1] 0.3846552

# almost identical to the likelihood standard error
#   se.hat.lambda.hat.mle = 0.3846546

# again the two should be really close under the LI prior

# the bayesian 99.9% interval for lambda is obtained
#   from the quantiles of the gamma posterior distribution:

print( bayesian.interval <- qgamma( c( alpha / 2, 1 - alpha / 2 ),
  alpha.posterior.class, beta.posterior ) )

# [1] 1.033688 3.574667

# this is quite different from the large-sample likelihood interval
#   0.8057122 3.3371449

# the reason is that the bayes interval is correctly asymmetric here
#   (see the skewness in the posterior and likelihood PDFs),
#   while the likelihood interval is symmetric about the MLE,
#   relying as it does on the frequentist CLT (which doesn't
#   yet apply here with an n of only 14)

# the likelihood differs visually from the posterior in this example
#   because of that pesky ( -1 ) in the PDF exponent for alpha,
#   but the likelihood and bayes analyses are almost identical anyway,
#   apart from mr. fisher's reliance on the frequentist CLT,
#   which produced an inferior interval
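# (an optional aside, not from the lecture notes) the conjugate updating rule
#   Gamma( alpha, beta ) prior + Poisson sample ( s, n )
#     --> Gamma( alpha + s, beta + n ) posterior
#   makes the posterior mean a weighted average of the prior mean and the MLE:
#   ( alpha + s ) / ( beta + n ) =
#     ( beta / ( beta + n ) ) * ( alpha / beta ) + ( n / ( beta + n ) ) * ( s / n )
#   with beta = epsilon = 0.0001 the prior weight is essentially 0, which is
#   why the posterior mean above nearly coincides with y.bar; a quick check
#   (the object name below is arbitrary):

prior.weight <- beta.prior / ( beta.prior + n )

print( prior.weight * ( alpha.prior.class / beta.prior ) +
  ( 1 - prior.weight ) * ( s / n ) )

# this should reproduce the posterior mean 2.071449 computed above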
#################################################################################

##### frequentist and bayesian nonparametric inferential analysis
#####   with the bootstrap

number.of.bootstrap.replications <- 1000000

bootstrap.mle.values <- rep( NA, number.of.bootstrap.replications )

set.seed( 1 )

system.time( {

  for ( i in 1:number.of.bootstrap.replications ) {

    y.star <- sample( y, replace = T )

    bootstrap.mle.values[ i ] <- mean( y.star )

    if ( ( i %% 100000 ) == 0 ) print( i )

  }

} )

hist( bootstrap.mle.values, main = 'LoS Case Study', xlab = 'lambda',
  ylab = 'Approximate Posterior PDF', col = 'darkcyan', prob = T )

print( c( mean( bootstrap.mle.values ), sd( bootstrap.mle.values ) ) )

# [1] 2.0717881 0.3973817

print( bootstrap.confidence.interval <- quantile( bootstrap.mle.values,
  probs = c( alpha / 2, 1 - alpha / 2 ) ) )

#    0.05%   99.95%
# 1.000000 3.571429

# look at how closely the bootstrap results match
#   the bayesian results with a LI prior

#                            mean/                    99.9% interval
#   method                   estimate    SD/SE        left        right
#
#   large-sample likelihood  2.071429    0.3846546    0.8057122   3.3371449
#
#   bayes with LI prior      2.071449    0.3846552    1.033688    3.574667
#
#   bootstrap                2.071788    0.3973817    1.000000    3.571429

#################################################################################

# visualization of the Dirichlet Process posterior
#   implemented by the bootstrap with a simulated Gaussian data set

gaussian.mean <- 5
gaussian.sd <- 2
gaussian.sample.size <- 100

set.seed( 2 )

print( gaussian.sample <- sort( rnorm( gaussian.sample.size, gaussian.mean,
  gaussian.sd ) ) )

number.of.bootstrap.replications <- 200

plot( 0, 0, type = 'n', xlab = 'Simulated Gaussian Data', ylab = 'CDF',
  xlim = c( 0, 9.5 ), ylim = c( 0, 1 ) )

system.time( {

  for ( i in 1:number.of.bootstrap.replications ) {

    y.star <- sample( gaussian.sample, replace = T )

    lines( ecdf( y.star ), lty = 3, lwd = 1, col = 'gray', pch = 20 )

  }

} )

lines( ecdf( gaussian.sample ), lty = 1, lwd = 3, col = 'cornflowerblue' )

#################################################################################

##### (6) bayesian predictive analysis

# see the document camera notes from fri 19 feb 2021
#   and lecture notes part 6 pages 105-115

# the following function computes the posterior predictive pmf:

poisson.gamma.posterior.predictive.pmf <- function( y, alpha, beta ) {

  log.pmf <- lgamma( alpha + y ) + alpha * log( beta / ( beta + 1 ) ) -
    y * log( beta + 1 ) - lgamma( alpha ) - lgamma( y + 1 )

  return( exp( log.pmf ) )

}
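# (an optional cross-check, not from the lecture notes) this posterior
#   predictive distribution is negative binomial with size = alpha and
#   success probability beta / ( beta + 1 ), so R's built-in 'dnbinom'
#   should agree with the function above up to numerical tolerance

print( all.equal(
  poisson.gamma.posterior.predictive.pmf( 0:9, alpha.posterior.class,
    beta.posterior ),
  dnbinom( 0:9, size = alpha.posterior.class,
    prob = beta.posterior / ( beta.posterior + 1 ) ) ) )

# this should print TRUE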
# let's visualize the posterior predictive distribution,
#   based on the entire data set, for a future observation

y.star.maximum.value <- 9

y.star <- 0:y.star.maximum.value

print( predictive.pmf <- poisson.gamma.posterior.predictive.pmf( y.star,
  alpha.posterior.class, beta.posterior ) )

plot( 0:y.star.maximum.value, predictive.pmf, type = 'n', xlab = 'y',
  ylab = 'Predictive PMF', main = 'Predictive PMF in the LoS Case Study' )

for ( i in 0:y.star.maximum.value ) {

  segments( i, 0, i, predictive.pmf[ i + 1 ], lwd = 3, lty = 1,
    col = 'darkcyan' )

}

par( mfrow = c( 2, 1 ) )

plot( 0:y.star.maximum.value, predictive.pmf, type = 'n',
  xlab = 'Length of Stay', ylim = c( 0, 0.35 ), ylab = 'Predictive PMF',
  main = 'Predictive PMF in the LoS Case Study' )

for ( i in 0:y.star.maximum.value ) {

  segments( i, 0, i, predictive.pmf[ i + 1 ], lwd = 3, lty = 1,
    col = 'darkcyan' )

}

plot( 0, 0, type = 'n', xlim = c( 0, y.star.maximum.value ),
  xlab = 'Length of Stay', main = 'Data Spike Plot', ylab = 'PMF',
  ylim = c( 0, 0.35 ) )

for ( i in 1:length( y.probabilities ) ) {

  segments( i - 1, 0, i - 1, y.probabilities[ i ], lwd = 3, col = 'red' )

}

par( mfrow = c( 1, 1 ) )

# and the following code loops through the data set,
#   setting aside one observation at a time ("jack-knifing"),
#   and calculating posterior predictive z-scores
#   for model-checking

z.model.checking <- rep( NA, n )

for ( i in 1:n ) {

  y.current <- y[ -i ]

  n.current <- length( y.current )
  s.current <- sum( y.current )

  alpha.star.posterior <- alpha.prior.class + s.current
  beta.star.posterior <- beta.prior + n.current

  predictive.mean.current <- alpha.star.posterior / beta.star.posterior

  predictive.SD.current <- sqrt( ( alpha.star.posterior /
    beta.star.posterior ) * ( 1 + 1 / beta.star.posterior ) )

  z.model.checking[ i ] <- ( y[ i ] - predictive.mean.current ) /
    predictive.SD.current

}

c( mean( z.model.checking ), sd( z.model.checking ) )

# [1] 0.03126664 1.15504934

qqnorm( z.model.checking, main = 'LoS Case Study', xlim = c( -1.75, 3.1 ),
  ylab = 'Predictive z Scores', type = 'l', lwd = 3, col = 'cornflowerblue',
  ylim = c( -1.75, 3.1 ) )

abline( 0, 1, lwd = 3, col = 'chocolate1', lty = 2 )

legend( 1, 0, legend = c( 'Predictive z Scores', 'Target Behavior' ),
  col = c( 'cornflowerblue', 'chocolate1' ), lty = c( 1, 2 ), lwd = 3 )

#################################################################################
#################################################################################