#################################################################################
#################################################################################

# exploration in R of the bernoulli bayesian story
# with the full-employment data of quiz 2

# dd ( 10 feb 2021 )

##### preliminaries:

# (1) unbuffer the output (if relevant to your R environment)

# (2) set the working directory to the place where you'd like
#     PDF versions of plots to be stored

#     on my home desktop this is

# setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2021/Bernoulli-Bayesian-Story' )

# (3) ensure that we start with a completely empty R workspace:

rm( list = ls( ) )

#################################################################################

# see the paper-and-pen work on the document camera
# that defines the posterior distribution in the bernoulli sampling model
# with a Beta ( alpha, beta ) prior

# let's explore the Beta family as alpha > 0 and beta > 0 change

# the Beta ( alpha, beta ) PDF is

# c * theta^( alpha - 1 ) * ( 1 - theta )^( beta - 1 )

# the relevant R built-in functions are, as usual,

# dbeta( x, alpha, beta )   gives the Beta ( alpha, beta ) PDF
# pbeta( q, alpha, beta )                                   CDF
# qbeta( p, alpha, beta )                                   inverse CDF (quantile)
# rbeta( n, alpha, beta )   takes an IID random sample of size n
#                           from the Beta ( alpha, beta ) PDF

# as usual, somewhat strange things may happen at the boundaries of
# the parameter space Theta = ( 0, 1 ), so let's use

theta.grid.1 <- seq( 0.001, 0.999, length = 500 )

# let's start with ( alpha, beta ) = ( 1, 1 )

plot( theta.grid.1, dbeta( theta.grid.1, 1, 1 ), type = 'l', lwd = 3,
  lty = 1, col = 'darkcyan', xlab = 'theta', ylab = 'Beta PDF' )

# R has chosen a silly lower y-limit for this plot; it remains to be seen
# what we'll eventually need for a good upper y-limit

plot( theta.grid.1, dbeta( theta.grid.1, 1, 1 ), type = 'l', lwd = 3,
  lty = 1, col = 'darkcyan', xlab = 'theta', ylab = 'Beta PDF',
  ylim = c( 0, 2 ) )

# let's try holding alpha at 1 and decreasing beta to 0.5

lines( theta.grid.1, dbeta( theta.grid.1, 1, 0.5 ), lwd = 3, lty = 2,
  col = 'red' )

# so we can get what are called J-shaped PDFs in the Beta family
# (negatively skewed; long left-hand tail)

# let's try reversing the roles of alpha and beta

# question: look at the PDF formula; what should happen
# when alpha and beta are reversed?

# answer: the graph will be reflected around the vertical line theta = 1 / 2

lines( theta.grid.1, dbeta( theta.grid.1, 0.5, 1 ), lwd = 3, lty = 3,
  col = 'black' )

# this is called a reverse-J shape (positive skew, long right-hand tail)

# let's look at ( alpha, beta ) = ( 0.1, 0.1 )

lines( theta.grid.1, dbeta( theta.grid.1, 0.1, 0.1 ), lwd = 3, lty = 4,
  col = 'cornflowerblue' )

# question: what's going on when either alpha or beta is less than 1?

# answer: the PDF goes to ( + infinity ) at one end or the other,
# but it's still a proper PDF as long as alpha and beta are both > 0

# let's ask wolfram alpha for help

# integrate theta^( 0.1 - 1 ) * ( 1 - theta )^( 0.1 - 1 ) for theta from 0 to 1

# the result is about 19.7, a finite number, so this PDF really is proper;
# but in the limit, as alpha goes to 0 or beta goes to 0 (or both),
# the integral diverges and the Beta ( alpha, beta ) 'PDF' becomes
# an *improper* distribution, which integrates to ( + infinity )

# how about ( alpha, beta ) = ( 2, 2 ) ?

lines( theta.grid.1, dbeta( theta.grid.1, 2, 2 ), lwd = 3, lty = 5,
  col = 'chocolate1' )

# look back at ( alpha, beta ) = ( 1, 1 ) and now ( 2, 2 )

# question: what happens when alpha = beta?

# answer: the Beta ( alpha, beta ) distributions are symmetric
# about theta = 1 / 2
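# a quick side check (a sketch added here, not part of the original
# analysis) of two claims above: (a) swapping alpha and beta reflects
# the PDF around theta = 1 / 2, and (b) the ( 0.1, 0.1 ) PDF is proper;
# for (b) we can use R's built-in beta( ) function, which computes the
# normalizing integral directly

print( all.equal( dbeta( theta.grid.1, 0.5, 1 ),
  dbeta( 1 - theta.grid.1, 1, 0.5 ) ) )
# [1] TRUE
# (the Beta ( 0.5, 1 ) PDF at theta equals the Beta ( 1, 0.5 ) PDF
# at 1 - theta, which is the reflection property)

print( beta( 0.1, 0.1 ) )
# [1] 19.71464
# (finite, so the Beta ( 0.1, 0.1 ) PDF is indeed proper)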
# let's try ( alpha, beta ) = ( 2, 3 )

lines( theta.grid.1, dbeta( theta.grid.1, 2, 3 ), lwd = 3, lty = 6,
  col = 'brown' )

# now let's start making alpha and beta bigger: ( alpha, beta ) = ( 5, 5 )

# (integer line types in R only run from 0 to 6, so from here on we
# recycle the lty values and let the colors tell the curves apart)

lines( theta.grid.1, dbeta( theta.grid.1, 5, 5 ), lwd = 3, lty = 1,
  col = 'orange' )

# okay, so now we need to adjust the upper 'ylim':

plot( theta.grid.1, dbeta( theta.grid.1, 1, 1 ), type = 'l', lwd = 3,
  lty = 1, col = 'darkcyan', xlab = 'theta', ylab = 'Beta PDF',
  ylim = c( 0, 3.5 ) )
lines( theta.grid.1, dbeta( theta.grid.1, 1, 0.5 ), lwd = 3, lty = 2,
  col = 'red' )
lines( theta.grid.1, dbeta( theta.grid.1, 0.5, 1 ), lwd = 3, lty = 3,
  col = 'blue' )
lines( theta.grid.1, dbeta( theta.grid.1, 0.1, 0.1 ), lwd = 3, lty = 4,
  col = 'black' )
lines( theta.grid.1, dbeta( theta.grid.1, 2, 2 ), lwd = 3, lty = 5,
  col = 'purple' )
lines( theta.grid.1, dbeta( theta.grid.1, 2, 3 ), lwd = 3, lty = 6,
  col = 'brown' )
lines( theta.grid.1, dbeta( theta.grid.1, 5, 5 ), lwd = 3, lty = 1,
  col = 'orange' )

# how about ( alpha, beta ) = ( 5, 10 ) ?

lines( theta.grid.1, dbeta( theta.grid.1, 5, 10 ), lwd = 3, lty = 2,
  col = 'green' )

# and then ( alpha, beta ) = ( 20, 20 ):

lines( theta.grid.1, dbeta( theta.grid.1, 20, 20 ), lwd = 3, lty = 3,
  col = 'steelblue4' )

# now we need to adjust the upper 'ylim' again:

plot( theta.grid.1, dbeta( theta.grid.1, 1, 1 ), type = 'l', lwd = 3,
  lty = 1, col = 'darkcyan', xlab = 'theta', ylab = 'Beta PDF',
  ylim = c( 0, 5.0 ) )
lines( theta.grid.1, dbeta( theta.grid.1, 1, 0.5 ), lwd = 3, lty = 2,
  col = 'red' )
lines( theta.grid.1, dbeta( theta.grid.1, 0.5, 1 ), lwd = 3, lty = 3,
  col = 'blue' )
lines( theta.grid.1, dbeta( theta.grid.1, 0.1, 0.1 ), lwd = 3, lty = 4,
  col = 'black' )
lines( theta.grid.1, dbeta( theta.grid.1, 2, 2 ), lwd = 3, lty = 5,
  col = 'purple' )
lines( theta.grid.1, dbeta( theta.grid.1, 2, 3 ), lwd = 3, lty = 6,
  col = 'brown' )
lines( theta.grid.1, dbeta( theta.grid.1, 5, 5 ), lwd = 3, lty = 1,
  col = 'orange' )
lines( theta.grid.1, dbeta( theta.grid.1, 5, 10 ), lwd = 3, lty = 2,
  col = 'green' )
lines( theta.grid.1, dbeta( theta.grid.1, 20, 20 ), lwd = 3, lty = 3,
  col = 'steelblue4' )

# question: what's happening to the Beta PDFs as alpha and beta grow?

# answer: when alpha = beta = x and x increases, the Beta PDFs
# look more and more like Normal PDFs constrained to live on ( 0, 1 )
# (we check this numerically just below)

# question: what types of shapes are available
# in the Beta ( alpha, beta ) family?

# answer: symmetric?        yes
#         positive skew?    yes
#         negative skew?    yes
#         J-shaped?         yes
#         reverse-J-shaped? yes
#         bimodal?          no
#         U-shaped?         yes
#         flat?             yes
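# here's the promised check of the Normal-limit claim (a sketch added
# here, not in the original script): overlay the Beta ( 20, 20 ) PDF
# with the Normal PDF that has the same mean and SD; the mean and
# variance formulas are the standard Beta ones,
# mean = alpha / ( alpha + beta ) and
# variance = alpha * beta / ( ( alpha + beta )^2 * ( alpha + beta + 1 ) );
# the variable names mu.20 and sigma.20 are just made up for this check

mu.20 <- 20 / ( 20 + 20 )
sigma.20 <- sqrt( 20 * 20 / ( ( 20 + 20 )^2 * ( 20 + 20 + 1 ) ) )

plot( theta.grid.1, dbeta( theta.grid.1, 20, 20 ), type = 'l', lwd = 3,
  lty = 1, col = 'steelblue4', xlab = 'theta', ylab = 'PDF',
  main = 'Beta ( 20, 20 ) Versus the Matching Normal' )
lines( theta.grid.1, dnorm( theta.grid.1, mu.20, sigma.20 ), lwd = 3,
  lty = 2, col = 'red' )

# the two curves are already very close, which previews the
# bernstein-von mises story later in this script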
# the only way to get *multimodality* in the Beta family is by
# creating a *mixture* of 2 or more Beta PDFs:

# extremely useful fact: given *any* two PDFs

# p_1 ( theta ) and p_2 ( theta )

# that share the same support, you can *always* create a new PDF
# by taking a *mixture* (weighted average) of them: pick any
# number 0 < A < 1; then define

# p_{ mixture } ( theta ) = A * p_1 ( theta ) + ( 1 - A ) * p_2 ( theta )

# example: let's mix together the Beta ( 10, 3 ) and Beta ( 3, 10 ) PDFs,
# with mixing weight A = 2 / 3:

par( mfrow = c( 2, 1 ) )

plot( theta.grid.1, dbeta( theta.grid.1, 10, 3 ), type = 'l', lwd = 3,
  lty = 1, col = 'darkcyan', xlab = 'theta', ylab = 'PDF',
  ylim = c( 0, 4 ), main = '2 Beta PDFs' )
lines( theta.grid.1, dbeta( theta.grid.1, 3, 10 ), lwd = 3, lty = 2,
  col = 'red' )

A <- 2 / 3

plot( theta.grid.1, A * dbeta( theta.grid.1, 10, 3 ) +
  ( 1 - A ) * dbeta( theta.grid.1, 3, 10 ), type = 'l', lwd = 3,
  lty = 3, col = 'blue', xlab = 'theta', ylab = 'PDF', ylim = c( 0, 4 ),
  main = 'Mixture of the 2 Beta PDFs Above' )

par( mfrow = c( 1, 1 ) )
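# a brief sketch (an addition, not in the original script) of how to
# *sample* from this mixture: for each draw, first flip a coin with
# P ( heads ) = A to choose a component, then draw from the chosen
# Beta PDF; the seed value below is arbitrary, just for reproducibility

set.seed( 206 )   # arbitrary seed

m <- 100000
component <- rbinom( m, 1, A )   # 1 --> Beta ( 10, 3 ), 0 --> Beta ( 3, 10 )
theta.draws <- ifelse( component == 1, rbeta( m, 10, 3 ),
  rbeta( m, 3, 10 ) )

hist( theta.draws, breaks = 100, probability = TRUE, xlab = 'theta',
  main = 'Draws From the Beta Mixture' )
lines( theta.grid.1, A * dbeta( theta.grid.1, 10, 3 ) +
  ( 1 - A ) * dbeta( theta.grid.1, 3, 10 ), lwd = 3, col = 'blue' )

# the histogram of the draws should closely match the bimodal
# mixture PDF plotted earlier

# covered to here as of fri morning 5 feb 2021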
#################################################################################

# in the quiz 2 IID Bernoulli ( theta ) case study,
# let's see what happens with a low-information
# Beta ( 1, 1 ) = Uniform( 0, 1 ) prior:

# recall that the Beta ( alpha, beta ) PDF is of the form

# c * theta^( alpha - 1 ) * ( 1 - theta )^( beta - 1 )

# the prior is Beta ( alpha*, beta* ) = Beta ( 1, 1 )

# with ( s, n ) = ( 830, 921 ), the likelihood is

# ell ( theta | y, B ) = c * theta^s * ( 1 - theta )^( n - s )
#   = c * theta^[ ( s + 1 ) - 1 ] * ( 1 - theta )^[ ( n - s + 1 ) - 1 ]
#   = Beta ( s + 1, n - s + 1 ) = Beta ( 831, 92 )

# and the posterior is therefore

# p ( theta | y, B ) = c * theta^( alpha* + s - 1 ) * ( 1 - theta )^( beta* + n - s - 1 )
#   = Beta ( alpha* + s, beta* + n - s )
#   = Beta ( 1 + s, 1 + n - s )
#   = Beta ( 831, 92 )

# in other words, with the Uniform( 0, 1 ) prior,
# the posterior and likelihood PDFs coincide

# of course this must be true:

# p ( theta | y, B ) = c * p ( theta | B ) * ell ( theta | y, B )

# if the prior is of the form

# theta ~ Uniform( 0, 1 ) ,

# then

# p ( theta | B ) is proportional to the constant 1,

# and the updating rule becomes

# p ( theta | y, B ) = c * ell ( theta | y, B )

# let's plot the { prior, likelihood, posterior } on the same graph:

s <- 830
n <- 921

alpha.star <- 1
beta.star <- 1

plot( theta.grid.1, dbeta( theta.grid.1, alpha.star + s, beta.star + n - s ),
  type = 'l', lwd = 3, col = 'darkcyan', xlab = 'theta', ylab = 'PDF' )

# let's focus on the interesting theta values between 0.86 and 0.93:

theta.grid.2 <- seq( 0.86, 0.93, length = 500 )

plot( theta.grid.2, dbeta( theta.grid.2, alpha.star + s, beta.star + n - s ),
  type = 'l', lwd = 3, col = 'darkcyan', xlab = 'theta', ylab = 'PDF',
  xlim = c( 0.86, 0.93 ),
  main = 'Quiz 2 Case Study: Bernoulli Bayesian Analysis' )
lines( theta.grid.2, dbeta( theta.grid.2, alpha.star, beta.star ), lwd = 3,
  lty = 2, col = 'red' )
lines( theta.grid.2, dbeta( theta.grid.2, s + 1, n - s + 1 ), lwd = 3,
  lty = 3, col = 'blue' )
legend( 0.865, 35, legend = c( 'prior', 'likelihood', 'posterior' ),
  col = c( 'red', 'blue', 'darkcyan' ), lty = c( 2, 3, 1 ), lwd = 3 )

# with a low-information Uniform( 0, 1 ) prior and a large
# sample size n, the likelihood PDF and the posterior PDF coincide,
# and both are close to Gaussian

# this means that likelihood and bayesian inferences should
# be extremely similar:

# mr. fisher maximizes his likelihood function to get theta.hat.mle;
# in other words, from a bayesian viewpoint he has chosen
# the *mode* of the likelihood PDF as his point estimate;
# but here the likelihood and posterior PDFs are identical,
# so he has also *maximized the posterior* to get his MLE

# this means that what mr. fisher actually calculated, from a
# bayesian point of view, is

# theta.hat.mle = the *maximum a posteriori (MAP)* point estimate

# MAP estimates are popular these days in machine learning,
# in settings in which all you want to compute is a point estimate

# in this situation bayesians would summarize the posterior PDF
# by computing its mean; but with the Normal PDF,

# --> the mode and mean coincide, so

# bayesian posterior mean point estimate = MLE = MAP estimate

# this situation is an instance of an important theoretical result
# called the *bernstein-von mises theorem*

# informally this theorem says that

# when the sample size n is large and context implies the use of
# a low-information (LI) prior,

# --> likelihood and bayesian inferences will approximately coincide;

# but the bayesian story may well involve nasty integrals,
# whereas the likelihood story is based on differentiation;
# so we can 'eat our cake and have it too': when this theorem applies,
# we can (a) make likelihood calculations and (b) interpret
# the results in a bayesian way

#################################################################################

# let's look at the numbers in the quiz 2 case study

# first the likelihood numerical analysis:

print( c( s, n ) )
# [1] 830 921

print( theta.hat.mle <- s / n )
# [1] 0.9011944

plot( theta.grid.2, dbeta( theta.grid.2, alpha.star + s, beta.star + n - s ),
  type = 'l', lwd = 3, col = 'darkcyan', xlab = 'theta', ylab = 'PDF',
  xlim = c( 0.86, 0.93 ),
  main = 'Quiz 2 Case Study: Bernoulli Bayesian Analysis' )
lines( theta.grid.2, dbeta( theta.grid.2, alpha.star, beta.star ), lwd = 3,
  lty = 2, col = 'red' )
lines( theta.grid.2, dbeta( theta.grid.2, s + 1, n - s + 1 ), lwd = 3,
  lty = 3, col = 'blue' )
abline( v = theta.hat.mle, lwd = 2, lty = 4, col = 'black' )
legend( 0.865, 35, legend = c( 'prior', 'likelihood', 'posterior', 'MLE' ),
  col = c( 'red', 'blue', 'darkcyan', 'black' ), lty = c( 2, 3, 1, 4 ),
  lwd = 3 )

print( se.hat.theta.hat.mle <- sqrt( theta.hat.mle * ( 1 - theta.hat.mle ) / n ) )
# [1] 0.009832644

alpha <- 0.001

print( z.alpha <- qnorm( 1 - alpha / 2 ) )
# [1] 3.290527

print( likelihood.confidence.interval <- c(
  theta.hat.mle - z.alpha * se.hat.theta.hat.mle,
  theta.hat.mle + z.alpha * se.hat.theta.hat.mle ) )
# [1] 0.8688398 0.9335489

# let's superimpose the Normal curve with the same mean and SD
# as the ( likelihood, posterior ) PDF

plot( theta.grid.2, dbeta( theta.grid.2, alpha.star + s, beta.star + n - s ),
  type = 'l', lwd = 3, col = 'darkcyan', xlab = 'theta', ylab = 'PDF',
  xlim = c( 0.86, 0.93 ),
  main = 'Quiz 2 Case Study: Bernoulli Bayesian Analysis' )
lines( theta.grid.2, dbeta( theta.grid.2, alpha.star, beta.star ), lwd = 3,
  lty = 2, col = 'red' )
lines( theta.grid.2, dbeta( theta.grid.2, s + 1, n - s + 1 ), lwd = 3,
  lty = 3, col = 'blue' )
abline( v = theta.hat.mle, lwd = 2, lty = 4, col = 'black' )
lines( theta.grid.2, dnorm( theta.grid.2, theta.hat.mle, se.hat.theta.hat.mle ),
  lwd = 3, lty = 5, col = 'orange' )
legend( 0.865, 35,
  legend = c( 'prior', 'likelihood', 'posterior', 'MLE', 'Normal' ),
  col = c( 'red', 'blue', 'darkcyan', 'black', 'orange' ),
  lty = c( 2, 3, 1, 4, 5 ), lwd = 3 )
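# a monte carlo sketch (an addition to the original analysis): we can
# also summarize the posterior by simulation with rbeta( ), which is
# handy in problems where closed-form formulas are not available; here
# it should simply confirm the conjugate calculations that follow; the
# seed value is arbitrary, just for reproducibility

set.seed( 921 )   # arbitrary seed

theta.posterior.draws <- rbeta( 1000000, alpha.star + s, beta.star + n - s )

print( mean( theta.posterior.draws ) )
# should closely match the exact posterior mean computed below

print( sd( theta.posterior.draws ) )
# should closely match the exact posterior SD computed below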
# now the bayesian numerical analysis

# watch out for the notational collision with alpha, between
# the 100 ( 1 - alpha )% confidence level and the first parameter
# of the Beta distribution

alpha.prior <- 1
beta.prior <- 1

alpha.posterior <- alpha.prior + s
beta.posterior <- beta.prior + ( n - s )

print( posterior.mean <- alpha.posterior / ( alpha.posterior + beta.posterior ) )
# [1] 0.900325

# almost identical to theta.hat.mle -- the difference is
# that the Uniform( 0, 1 ) prior added 1 vote for 'yes' and 1 vote
# for 'other' into the posterior, to go along with the data

print( posterior.sd <- sqrt( alpha.posterior * beta.posterior /
  ( ( alpha.posterior + beta.posterior )^2 *
    ( alpha.posterior + beta.posterior + 1 ) ) ) )
# [1] 0.009855003

# again, almost identical to the information-based se.hat.theta.hat.mle

# bayesians typically create a 100 ( 1 - alpha )%
# posterior (*credible*) interval by computing the ( alpha / 2 )
# and ( 1 - alpha / 2 ) quantiles of the posterior distribution:

print( bayesian.credible.interval <- c(
  qbeta( alpha / 2, alpha.posterior, beta.posterior ),
  qbeta( 1 - alpha / 2, alpha.posterior, beta.posterior ) ) )
# [1] 0.8651070 0.9298801

# again, almost identical

# a summary:

#   method       estimate    SD/SE        interval
#
#   likelihood   0.9011944   0.009832644  ( 0.8688398, 0.9335489 )
#   bayes with
#     LI prior   0.900325    0.009855003  ( 0.8651070, 0.9298801 )

# this is the bernstein-von mises theorem in action

# finally, what's the posterior probability that theta > 0.95, given
# the ( data, prior, background ) information?

print( 1 - pbeta( 0.95, alpha.posterior, beta.posterior ) )
# [1] 5.100148e-10

#################################################################################
#################################################################################
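# a closing sensitivity sketch (an addition, not part of the original
# analysis): re-run the bayesian calculations with a different
# low-information prior -- the jeffreys prior for the bernoulli model,
# which is Beta ( 0.5, 0.5 ) -- to see how little the conclusions
# depend on the choice of LI prior when n is large

alpha.posterior.jeffreys <- 0.5 + s
beta.posterior.jeffreys <- 0.5 + ( n - s )

print( alpha.posterior.jeffreys /
  ( alpha.posterior.jeffreys + beta.posterior.jeffreys ) )
# posterior mean of about 0.9008, versus 0.900325 with the
# Uniform( 0, 1 ) prior

print( c( qbeta( alpha / 2, alpha.posterior.jeffreys, beta.posterior.jeffreys ),
  qbeta( 1 - alpha / 2, alpha.posterior.jeffreys, beta.posterior.jeffreys ) ) )
# nearly identical to the credible interval above: with n = 921
# observations, one prior 'observation' more or less barely matters

#################################################################################
#################################################################################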