#################################################################################
#################################################################################

# stat 206, winter 2021
# R code for likelihood and bayesian analyses
#   in the length of stay (LoS) case study
# (lecture notes part 6 pp. 77+)
# dd 17-18-19 feb 2021

#################################################################################

# unbuffer the output (if relevant)

# set your working directory to a location
#   where you may, if you wish, export data
#   and/or plots from R

# for me on my desktop at home this is

# setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2021/LoS-Analysis' )

#################################################################################

##### the 7 pillars of statistical data science

##### (1) probability calculations

# this involves two distributions (below), Poisson and Gamma

#################################################################################

##### (2) design of data-gathering activities

# already accomplished in this case study

#################################################################################

##### (3) data curation

# define the data set and sample size

y <- c( 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 6 )

print( n <- length( y ) )

# [1] 14

##### no fancy data curation needed here

#################################################################################

##### (4) graphical and numerical descriptive summaries of the data set

# a histogram: note the use of the 'breaks' option
#   to get the bars centered on the data values

-1.5 + 1:8

# [1] -0.5  0.5  1.5  2.5  3.5  4.5  5.5  6.5

hist( y, prob = T, breaks = -1.5 + 1:8, main = 'LoS Case Study',
  xlab = 'Length of Stay', col = 'darkcyan', ylab = 'PMF' )

# a density trace

plot( density( y, from = 0 ), main = '', xlab = 'Length of Stay',
  col = 'darkcyan', lwd = 3 )

# both plots lie to us by not highlighting the discreteness

# spike plots are a bit of a pain to make,
#   but they clearly show the discreteness

plot( 0, 0, type = 'n', xlim = c( 0, 6 ), xlab = 'Length of Stay',
  ylim = c( 0, 5 / 14 ), ylab = 'PMF' )

table( y ) / n

# y
#          0          1          2          3          4          6
# 0.07142857 0.35714286 0.28571429 0.14285714 0.07142857 0.07142857

y.probabilities <- unname( table( y ) / n )

print( y.probabilities <- c( y.probabilities[ 1:5 ], 0, y.probabilities[ 6 ] ) )

# [1] 0.07142857 0.35714286 0.28571429 0.14285714 0.07142857 0.00000000 0.07142857

for ( i in 1:7 ) {

  segments( i - 1, 0, i - 1, y.probabilities[ i ], lwd = 3, col = 'darkcyan' )

}

par( mfrow = c( 3, 1 ) )

hist( y, prob = T, breaks = -1.5 + 1:8, main = 'LoS Case Study',
  xlab = 'Length of Stay', col = 'darkcyan', ylab = 'PMF',
  xlim = c( -0.5, 8 ) )

plot( density( y, from = 0 ), main = '', xlab = 'Length of Stay',
  col = 'darkcyan', lwd = 3, xlim = c( -0.5, 8 ) )

plot( 0, 0, type = 'n', xlim = c( -0.5, 8 ), xlab = 'Length of Stay',
  ylim = c( 0, 5 / 14 ), ylab = 'PMF' )

for ( i in 1:7 ) {

  segments( i - 1, 0, i - 1, y.probabilities[ i ], lwd = 3, col = 'darkcyan' )

}

par( mfrow = c( 1, 1 ) )

# it seems that (using the cheating approach) a Poisson
#   sampling distribution would be reasonable here,
#   at least as a starting point in the modeling
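# (an optional aside, not from the lecture notes) a quicker route to a spike
#   plot is to let R's 'table' function do the counting and then plot the
#   resulting PMF with vertical lines; the one wrinkle is that values with
#   zero observed frequency (here y = 5) are simply omitted from the axis

plot( table( y ) / n, type = 'h', lwd = 3, col = 'darkcyan',
  xlab = 'Length of Stay', ylab = 'PMF', main = 'LoS Case Study' )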
# numerical descriptive summaries:
#   the sample size n was already defined above

print( c( sd( y ), var( y ), summary( y ) ) )

#       sd variance     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
# 1.542440 2.379121 0.000000 1.000000 2.000000 2.071429 2.750000 6.000000

##### sanity check on the Poisson assumption:

# in the Poisson sampling model the mean and variance are both equal
#   to lambda

# the *variance-to-mean-ratio* for the Poisson is therefore 1

# here y.bar = 2.07 and the sample variance s^2 = 2.38 (decently close)

print( vtmr <- var( y ) / mean( y ) )

# [1] 1.148541

#################################################################################

##### (5) likelihood and bayesian inference

##### let's explore the family of sampling distributions first

y.grid <- 0:20

# the PMF in R for the Poisson family is called 'dpois'

plot( y.grid, dpois( y.grid, lambda = 1 ), type = 'l' )

# not beautiful

M <- 1000000

set.seed( 17 )

y.star.1 <- rpois( M, lambda = 1 )

par( mfrow = c( 3, 1 ) )

hist( y.star.1, breaks = -1.5 + 1:20, main = 'Poisson Family', xlab = 'y',
  ylab = 'PMF', prob = T, col = 'cornflowerblue', ylim = c( 0, 0.35 ) )

text( 10, 0.2, 'lambda = 1', col = 'cornflowerblue', cex = 1.25 )

y.star.2 <- rpois( M, lambda = 2 )

hist( y.star.2, breaks = -1.5 + 1:20, main = 'Poisson Family', xlab = 'y',
  ylab = 'PMF', prob = T, col = 'orange', ylim = c( 0, 0.35 ) )

text( 10, 0.2, 'lambda = 2', col = 'orange', cex = 1.25 )

y.star.3 <- rpois( M, lambda = 5 )

hist( y.star.3[ y.star.3 <= 15 ], breaks = -1.5 + 1:20, main = 'Poisson Family',
  xlab = 'y', ylab = 'PMF', prob = T, col = 'red', ylim = c( 0, 0.35 ) )

text( 10, 0.2, 'lambda = 5', col = 'red', cex = 1.25 )

par( mfrow = c( 1, 1 ) )

# so as lambda increases, the center (mean) of the PMF increases,
#   the spread (variance or SD) also increases, and the
#   long right-hand-tail (skewness) decreases; the PMF looks
#   more and more like a Normal distribution on the non-negative integers

#################################################################################

##### next the likelihood inferential story

# compute lambda.hat.mle to see where the likelihood is concentrated

print( s <- sum( y ) )

# [1] 29

print( lambda.hat.mle <- s / n )

# [1] 2.071429

# visualize the likelihood and log likelihood functions

# let's set the grid by trial and error

lambda.grid.1 <- seq( 0.1, 2 * lambda.hat.mle, length = 500 )

poisson.likelihood <- function( lambda, y ) {

  s <- sum( y )
  n <- length( y )

  likelihood <- lambda^s * exp( - n * lambda )

  return( likelihood )

}

# this could produce underflow or even overflow with large s and n values,
#   but let's see if we're lucky here

poisson.log.likelihood <- function( lambda, y ) {

  s <- sum( y )
  n <- length( y )

  log.likelihood <- s * log( lambda ) - n * lambda

  return( log.likelihood )

}

plot( lambda.grid.1, poisson.likelihood( lambda.grid.1, y ), type = 'l',
  lwd = 3, col = 'cornflowerblue', xlab = 'lambda', ylab = 'Likelihood',
  main = 'LoS Case Study' )

# no underflow here, with s = 29 and n = 14
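# (an optional aside, not from the lecture notes) if s and n were large
#   enough for 'poisson.likelihood' to underflow or overflow, a standard
#   workaround is to stay on the log scale and exponentiate only after
#   subtracting the maximum of the log likelihood over the grid; this
#   rescales the curve to a maximum of 1 without changing its shape
#   (the function name below is just for illustration)

scaled.poisson.likelihood <- function( lambda, y ) {

  log.likelihood <- sum( y ) * log( lambda ) - length( y ) * lambda

  return( exp( log.likelihood - max( log.likelihood ) ) )

}

plot( lambda.grid.1, scaled.poisson.likelihood( lambda.grid.1, y ), type = 'l',
  lwd = 3, col = 'cornflowerblue', xlab = 'lambda', ylab = 'Scaled Likelihood',
  main = 'LoS Case Study' )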
# we can tighten the grid to get more visual information
#   and look at the log likelihood function at the same time

lambda.grid.2 <- seq( 1, 3.5, length = 500 )

par( mfrow = c( 2, 1 ) )

plot( lambda.grid.2, poisson.likelihood( lambda.grid.2, y ), type = 'l',
  lwd = 3, col = 'cornflowerblue', xlab = 'lambda', ylab = 'Likelihood',
  main = 'LoS Case Study' )

abline( v = lambda.hat.mle, lwd = 3, lty = 3, col = 'red' )

legend( 2.75, 3e-04, legend = c( 'Likelihood', 'MLE' ),
  col = c( 'cornflowerblue', 'red' ), lty = c( 1, 3 ), lwd = 3 )

plot( lambda.grid.2, poisson.log.likelihood( lambda.grid.2, y ), type = 'l',
  lwd = 3, col = 'cornflowerblue', xlab = 'lambda', ylab = 'Log Likelihood',
  main = '' )

abline( v = lambda.hat.mle, lwd = 3, lty = 3, col = 'red' )

legend( 2.25, -11, legend = c( 'Log Likelihood', 'MLE' ),
  col = c( 'cornflowerblue', 'red' ), lty = c( 1, 3 ), lwd = 3 )

par( mfrow = c( 1, 1 ) )

# already with a sample size of only n = 14 the likelihood function
#   (via the bayesian central limit theorem) looks pretty close to gaussian

# get the observed information and build a 99.9% CI for lambda

# (document camera notes) the estimated standard error
#   for the MLE turns out to be

print( se.hat.lambda.hat.mle <- sqrt( lambda.hat.mle / n ) )

# [1] 0.3846546

alpha <- 0.001

print( z.alpha <- qnorm( 1 - alpha / 2 ) )

# [1] 3.290527

print( likelihood.confidence.interval <- c(
  lambda.hat.mle - z.alpha * se.hat.lambda.hat.mle,
  lambda.hat.mle + z.alpha * se.hat.lambda.hat.mle ) )

# [1] 0.8057122 3.3371449

#################################################################################

##### now the bayesian inferential story

# the conjugate prior family in this sampling model is the
#   Gamma( alpha, beta ) distributions for alpha > 0 and beta > 0;
#   the PDF is available in R with the built-in 'dgamma' function

#################################################################################

# let's get to know this family graphically

lambda.grid.3 <- seq( 0, 10, length = 500 )

# start with ( alpha, beta ) = ( 1, 1 )

plot( lambda.grid.3, dgamma( lambda.grid.3, 1, 1 ), type = 'l', lwd = 3,
  col = 'black', xlab = 'lambda', ylab = 'PDF',
  main = 'Poisson Conjugate Prior Family' )

# now compare with ( alpha, beta ) = ( 2, 1 )

lines( lambda.grid.3, dgamma( lambda.grid.3, 2, 1 ), lwd = 3, col = 'red' )

# interesting; what does increasing alpha, holding beta constant,
#   seem to do?

# let's re-grid

lambda.grid.4 <- seq( 0, 20, length = 500 )

plot( lambda.grid.4, dgamma( lambda.grid.4, 1, 1 ), type = 'l', lwd = 3,
  col = 'black', xlab = 'lambda', ylab = 'PDF',
  main = 'Gamma Conjugate Prior Family For Poisson', lty = 1 )

lines( lambda.grid.4, dgamma( lambda.grid.4, 2, 1 ), lwd = 3, col = 'red',
  lty = 2 )

lines( lambda.grid.4, dgamma( lambda.grid.4, 5, 1 ), lwd = 3, col = 'darkcyan',
  lty = 3 )

lines( lambda.grid.4, dgamma( lambda.grid.4, 10, 1 ), lwd = 3, col = 'maroon',
  lty = 4 )

legend( 10, 0.6, legend = c( '( alpha, beta ) = ( 1, 1 )',
  '( alpha, beta ) = ( 2, 1 )', '( alpha, beta ) = ( 5, 1 )',
  '( alpha, beta ) = ( 10, 1 )' ),
  col = c( 'black', 'red', 'darkcyan', 'maroon' ),
  lty = c( 1, 2, 3, 4 ), lwd = 3 )

# alpha definitely controls shape and center

# now let's hold alpha = 5 (say) and make beta grow

lambda.grid.5 <- seq( 0, 12.5, length = 500 )

plot( lambda.grid.5, dgamma( lambda.grid.5, 5, 1 ), type = 'l', lwd = 3,
  col = 'black', xlab = 'lambda', ylab = 'PDF',
  main = 'Gamma Conjugate Prior Family For Poisson', lty = 1 )

lines( lambda.grid.5, dgamma( lambda.grid.5, 5, 2 ), lwd = 3, col = 'red',
  lty = 2 )

lines( lambda.grid.5, dgamma( lambda.grid.5, 5, 5 ), lwd = 3, col = 'darkcyan',
  lty = 3 )

lines( lambda.grid.5, dgamma( lambda.grid.5, 5, 10 ), lwd = 3, col = 'maroon',
  lty = 4 )

# oops; need to redefine 'ylim'

plot( lambda.grid.5, dgamma( lambda.grid.5, 5, 1 ), type = 'l', lwd = 3,
  col = 'black', xlab = 'lambda', ylab = 'PDF',
  main = 'Gamma Conjugate Prior Family For Poisson', lty = 1,
  ylim = c( 0, 2 ) )

lines( lambda.grid.5, dgamma( lambda.grid.5, 5, 2 ), lwd = 3, col = 'red',
  lty = 2 )

lines( lambda.grid.5, dgamma( lambda.grid.5, 5, 5 ), lwd = 3, col = 'darkcyan',
  lty = 3 )

lines( lambda.grid.5, dgamma( lambda.grid.5, 5, 10 ), lwd = 3, col = 'maroon',
  lty = 4 )

legend( 6, 1, legend = c( '( alpha, beta ) = ( 5, 1 )',
  '( alpha, beta ) = ( 5, 2 )', '( alpha, beta ) = ( 5, 5 )',
  '( alpha, beta ) = ( 5, 10 )' ),
  col = c( 'black', 'red', 'darkcyan', 'maroon' ),
  lty = c( 1, 2, 3, 4 ), lwd = 3 )

# beta clearly controls *spread*, and in an inverse way:
#   as beta goes up, the spread goes down
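# (an optional numerical check, not from the lecture notes) the visual
#   impressions above match the Gamma( alpha, beta ) moment formulas
#   mean = alpha / beta and SD = sqrt( alpha ) / beta: increasing alpha
#   with beta fixed moves the center up, while increasing beta with alpha
#   fixed pulls both the center and the spread down; a quick Monte Carlo
#   confirmation for ( alpha, beta ) = ( 5, 2 ) (the seed and object name
#   below are arbitrary)

set.seed( 206 )

lambda.draws <- rgamma( 100000, shape = 5, rate = 2 )

print( c( mean( lambda.draws ), 5 / 2 ) )        # simulated versus theoretical mean

print( c( sd( lambda.draws ), sqrt( 5 ) / 2 ) )  # simulated versus theoretical SD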
#################################################################################

##### returning to the bayesian inferential analysis

# LI prior: choose beta = epsilon = 0.0001 and alpha / beta = 5 (prior mean)
#   initially, based on class discussion, so alpha = 0.0005;
#   the 'utah' prior we found on the web would instead set
#   alpha / beta = 16 (prior mean)

epsilon <- 0.0001

alpha.prior.class <- 5 * epsilon
beta.prior <- epsilon

alpha.prior.utah <- 16 * epsilon

alpha.likelihood <- s + 1
beta.likelihood <- n

alpha.posterior.class <- alpha.prior.class + s
beta.posterior <- beta.prior + n

alpha.posterior.utah <- alpha.prior.utah + s

plot( lambda.grid.2, dgamma( lambda.grid.2, shape = alpha.likelihood,
  rate = beta.likelihood ), type = 'l', lwd = 3, col = 'black',
  xlab = 'lambda', ylab = 'PDF', lty = 1 )

lines( lambda.grid.2, dgamma( lambda.grid.2, shape = alpha.prior.class,
  rate = beta.prior ), lwd = 3, col = 'red', lty = 2 )

lines( lambda.grid.2, dgamma( lambda.grid.2, shape = alpha.posterior.class,
  rate = beta.posterior ), lwd = 3, col = 'chocolate1', lty = 3 )

lines( lambda.grid.2, dgamma( lambda.grid.2, shape = alpha.prior.utah,
  rate = beta.prior ), lwd = 3, col = 'darkcyan', lty = 4 )

lines( lambda.grid.2, dgamma( lambda.grid.2, shape = alpha.posterior.utah,
  rate = beta.posterior ), lwd = 3, col = 'cornflowerblue', lty = 5 )

# the utah prior has essentially the same effect as the class prior;
#   both are low-information (LI) choices with virtually identical
#   posterior consequences

# the usual bayesian point estimate would be the posterior mean:

print( posterior.mean <- alpha.posterior.class / beta.posterior )

# [1] 2.071449

# almost identical to the sample mean
#   y.bar = lambda.hat.mle = 2.071429

# the two should be really close under the LI prior

# the usual bayesian give-or-take for the bayesian point estimate
#   would be the posterior sd:

print( posterior.sd <- sqrt( alpha.posterior.class ) / beta.posterior )

# [1] 0.3846552

# almost identical to the likelihood standard error
#   se.hat.lambda.hat.mle = 0.3846546

# again the two should be really close under the LI prior

# the bayesian 99.9% interval for lambda is obtained
#   from the quantiles of the gamma posterior distribution:

print( bayesian.interval <- qgamma( c( alpha / 2, 1 - alpha / 2 ),
  alpha.posterior.class, beta.posterior ) )

# [1] 1.033688 3.574667

# this is quite different from the large-sample likelihood interval
#   0.8057122 3.3371449

# the reason is that the bayes interval is correctly asymmetric here
#   (see the skewness in the posterior and likelihood PDFs),
#   while the likelihood interval is symmetric about the MLE,
#   relying as it does on the frequentist CLT (which doesn't
#   yet apply here with an n of only 14)

# the likelihood differs visually from the posterior in this example
#   because of that pesky ( -1 ) in the PDF exponent for alpha,
#   but the likelihood and bayes analyses are almost identical anyway,
#   apart from mr. fisher's reliance on the frequentist CLT,
#   which produced an inferior interval
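# (an optional aside, not from the lecture notes) the conjugate updating rule
#   Gamma( alpha, beta ) prior + Poisson sample ( s, n )
#     --> Gamma( alpha + s, beta + n ) posterior
#   makes the posterior mean a weighted average of the prior mean and the MLE:
#   ( alpha + s ) / ( beta + n ) =
#     ( beta / ( beta + n ) ) * ( alpha / beta ) + ( n / ( beta + n ) ) * ( s / n )
#   with beta = epsilon = 0.0001 the prior weight is essentially 0, which is
#   why the posterior mean above nearly coincides with y.bar; a quick check
#   (the object name below is arbitrary):

prior.weight <- beta.prior / ( beta.prior + n )

print( prior.weight * ( alpha.prior.class / beta.prior ) +
  ( 1 - prior.weight ) * ( s / n ) )

# this should reproduce the posterior mean 2.071449 computed above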
#################################################################################

##### frequentist and bayesian nonparametric inferential analysis
#####   with the bootstrap

number.of.bootstrap.replications <- 1000000

bootstrap.mle.values <- rep( NA, number.of.bootstrap.replications )

set.seed( 1 )

system.time( {

  for ( i in 1:number.of.bootstrap.replications ) {

    y.star <- sample( y, replace = T )

    bootstrap.mle.values[ i ] <- mean( y.star )

    if ( ( i %% 100000 ) == 0 ) print( i )

  }

} )

hist( bootstrap.mle.values, main = 'LoS Case Study', xlab = 'lambda',
  ylab = 'Approximate Posterior PDF', col = 'darkcyan', prob = T )

print( c( mean( bootstrap.mle.values ), sd( bootstrap.mle.values ) ) )

# [1] 2.0717881 0.3973817

print( bootstrap.confidence.interval <- quantile( bootstrap.mle.values,
  probs = c( alpha / 2, 1 - alpha / 2 ) ) )

#    0.05%   99.95%
# 1.000000 3.571429

# look at how closely the bootstrap results match
#   the bayesian results with a LI prior

#                            mean/                    99.9% interval
#   method                   estimate    SD/SE        left        right
#
#   large-sample likelihood  2.071429    0.3846546    0.8057122   3.3371449
#
#   bayes with LI prior      2.071449    0.3846552    1.033688    3.574667
#
#   bootstrap                2.071788    0.3973817    1.000000    3.571429

#################################################################################

# visualization of the Dirichlet Process posterior
#   implemented by the bootstrap with a simulated Gaussian data set

gaussian.mean <- 5
gaussian.sd <- 2
gaussian.sample.size <- 100

set.seed( 2 )

print( gaussian.sample <- sort( rnorm( gaussian.sample.size, gaussian.mean,
  gaussian.sd ) ) )

number.of.bootstrap.replications <- 200

plot( 0, 0, type = 'n', xlab = 'Simulated Gaussian Data', ylab = 'CDF',
  xlim = c( 0, 9.5 ), ylim = c( 0, 1 ) )

system.time( {

  for ( i in 1:number.of.bootstrap.replications ) {

    y.star <- sample( gaussian.sample, replace = T )

    lines( ecdf( y.star ), lty = 3, lwd = 1, col = 'gray', pch = 20 )

  }

} )

lines( ecdf( gaussian.sample ), lty = 1, lwd = 3, col = 'cornflowerblue' )

#################################################################################

##### (6) bayesian predictive analysis

# see the document camera notes from fri 19 feb 2021
#   and lecture notes part 6 pages 105-115

# the following function computes the posterior predictive pmf:

poisson.gamma.posterior.predictive.pmf <- function( y, alpha, beta ) {

  log.pmf <- lgamma( alpha + y ) + alpha * log( beta / ( beta + 1 ) ) -
    y * log( beta + 1 ) - lgamma( alpha ) - lgamma( y + 1 )

  return( exp( log.pmf ) )

}
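# (an optional cross-check, not from the lecture notes) this posterior
#   predictive distribution is negative binomial with size = alpha and
#   success probability beta / ( beta + 1 ), so R's built-in 'dnbinom'
#   should agree with the function above up to numerical tolerance

print( all.equal(
  poisson.gamma.posterior.predictive.pmf( 0:9, alpha.posterior.class,
    beta.posterior ),
  dnbinom( 0:9, size = alpha.posterior.class,
    prob = beta.posterior / ( beta.posterior + 1 ) ) ) )

# this should print TRUE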
# let's visualize the posterior predictive distribution,
#   based on the entire data set, for a future observation

y.star.maximum.value <- 9

y.star <- 0:y.star.maximum.value

print( predictive.pmf <- poisson.gamma.posterior.predictive.pmf( y.star,
  alpha.posterior.class, beta.posterior ) )

plot( 0:y.star.maximum.value, predictive.pmf, type = 'n', xlab = 'y',
  ylab = 'Predictive PMF', main = 'Predictive PMF in the LoS Case Study' )

for ( i in 0:y.star.maximum.value ) {

  segments( i, 0, i, predictive.pmf[ i + 1 ], lwd = 3, lty = 1,
    col = 'darkcyan' )

}

par( mfrow = c( 2, 1 ) )

plot( 0:y.star.maximum.value, predictive.pmf, type = 'n',
  xlab = 'Length of Stay', ylim = c( 0, 0.35 ), ylab = 'Predictive PMF',
  main = 'Predictive PMF in the LoS Case Study' )

for ( i in 0:y.star.maximum.value ) {

  segments( i, 0, i, predictive.pmf[ i + 1 ], lwd = 3, lty = 1,
    col = 'darkcyan' )

}

plot( 0, 0, type = 'n', xlim = c( 0, y.star.maximum.value ),
  xlab = 'Length of Stay', main = 'Data Spike Plot', ylab = 'PMF',
  ylim = c( 0, 0.35 ) )

for ( i in 1:length( y.probabilities ) ) {

  segments( i - 1, 0, i - 1, y.probabilities[ i ], lwd = 3, col = 'red' )

}

par( mfrow = c( 1, 1 ) )

# and the following code loops through the data set,
#   setting aside one observation at a time ("jack-knifing"),
#   and calculating posterior predictive z-scores
#   for model-checking

z.model.checking <- rep( NA, n )

for ( i in 1:n ) {

  y.current <- y[ -i ]

  n.current <- length( y.current )
  s.current <- sum( y.current )

  alpha.star.posterior <- alpha.prior.class + s.current
  beta.star.posterior <- beta.prior + n.current

  predictive.mean.current <- alpha.star.posterior / beta.star.posterior

  predictive.SD.current <- sqrt( ( alpha.star.posterior /
    beta.star.posterior ) * ( 1 + 1 / beta.star.posterior ) )

  z.model.checking[ i ] <- ( y[ i ] - predictive.mean.current ) /
    predictive.SD.current

}

c( mean( z.model.checking ), sd( z.model.checking ) )

# [1] 0.03126664 1.15504934

qqnorm( z.model.checking, main = 'LoS Case Study', xlim = c( -1.75, 3.1 ),
  ylab = 'Predictive z Scores', type = 'l', lwd = 3, col = 'cornflowerblue',
  ylim = c( -1.75, 3.1 ) )

abline( 0, 1, lwd = 3, col = 'chocolate1', lty = 2 )

legend( 1, 0, legend = c( 'Predictive z Scores', 'Target Behavior' ),
  col = c( 'cornflowerblue', 'chocolate1' ), lty = c( 1, 2 ), lwd = 3 )

#################################################################################
#################################################################################