#################################################################################
#################################################################################

# exploration in R of the bernoulli bayesian story
# with the full-employment data of quiz 2

# dd ( 10 feb 2021 )

##### preliminaries:

# (1) unbuffer the output (if relevant to your R environment)

# (2) set the working directory to the place where you'd like
#     PDF versions of plots to be stored

#     on my home desktop this is

# setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2021/Bernoulli-Bayesian-Story' )

# (3) ensure that we start with a completely empty R workspace:

rm( list = ls( ) )

#################################################################################

# see the paper-and-pen work on the document camera
# that defines the posterior distribution in the bernoulli sampling model
# with a Beta ( alpha, beta ) prior

# let's explore the Beta family as alpha > 0 and beta > 0 change

# the Beta ( alpha, beta ) PDF is

# c * theta^( alpha - 1 ) * ( 1 - theta )^( beta - 1 )

# the relevant R built-in functions are, as usual,

# dbeta( x, alpha, beta )   gives the Beta ( alpha, beta ) PDF
# pbeta( q, alpha, beta )                                   CDF
# qbeta( p, alpha, beta )                                   inverse CDF (quantile)
# rbeta( n, alpha, beta )   takes an IID random sample of size n
#                           from the Beta ( alpha, beta ) PDF

# as usual, somewhat strange things may happen at the boundaries of
# the parameter space Theta = ( 0, 1 ), so let's use

theta.grid.1 <- seq( 0.001, 0.999, length = 500 )

# let's start with ( alpha, beta ) = ( 1, 1 )

plot( theta.grid.1, dbeta( theta.grid.1, 1, 1 ), type = 'l', lwd = 3,
  lty = 1, col = 'darkcyan', xlab = 'theta', ylab = 'Beta PDF' )

# R has chosen a silly lower y-limit for this plot; it remains to be seen
# what we'll eventually need for a good upper y-limit

plot( theta.grid.1, dbeta( theta.grid.1, 1, 1 ), type = 'l', lwd = 3,
  lty = 1, col = 'darkcyan', xlab = 'theta', ylab = 'Beta PDF',
  ylim = c( 0, 2 ) )

# let's try holding alpha at 1 and decreasing beta to 0.5

lines( theta.grid.1, dbeta( theta.grid.1, 1, 0.5 ), lwd = 3, lty = 2,
  col = 'red' )

# so we can get what are called J-shaped PDFs in the Beta family
# (negatively skewed; long left-hand tail)

# let's try reversing the roles of alpha and beta

# question: look at the PDF formula; what should happen
# when alpha and beta are reversed?

# answer: the graph will be reflected around the vertical line theta = 1 / 2

lines( theta.grid.1, dbeta( theta.grid.1, 0.5, 1 ), lwd = 3, lty = 3,
  col = 'black' )

# this is called a reverse-J shape (positive skew, long right-hand tail)

# let's look at ( alpha, beta ) = ( 0.1, 0.1 )

lines( theta.grid.1, dbeta( theta.grid.1, 0.1, 0.1 ), lwd = 3, lty = 4,
  col = 'cornflowerblue' )

# question: what's going on when either alpha or beta is less than 1?

# answer: the PDF goes to ( + infinity ) at one end or the other,
# but it's still a proper PDF as long as alpha and beta are both > 0

# let's ask wolfram alpha for help

# integrate theta^( 0.1 - 1 ) * ( 1 - theta )^( 0.1 - 1 ) for theta from 0 to 1

# the result is about 19.7, a finite number, so this PDF really is proper;
# but in the limit, as alpha goes to 0 or beta goes to 0 (or both),
# the integral diverges and the Beta ( alpha, beta ) 'PDF' becomes
# an *improper* distribution, which integrates to ( + infinity )

# how about ( alpha, beta ) = ( 2, 2 ) ?

lines( theta.grid.1, dbeta( theta.grid.1, 2, 2 ), lwd = 3, lty = 5,
  col = 'chocolate1' )

# look back at ( alpha, beta ) = ( 1, 1 ) and now ( 2, 2 )

# question: what happens when alpha = beta?

# answer: the Beta ( alpha, beta ) distributions are symmetric
# about theta = 1 / 2
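# a quick side check (a sketch added here, not part of the original
# analysis) of two claims above: (a) swapping alpha and beta reflects
# the PDF around theta = 1 / 2, and (b) the ( 0.1, 0.1 ) PDF is proper;
# for (b) we can use R's built-in beta( ) function, which computes the
# normalizing integral directly

print( all.equal( dbeta( theta.grid.1, 0.5, 1 ),
  dbeta( 1 - theta.grid.1, 1, 0.5 ) ) )
# [1] TRUE
# (the Beta ( 0.5, 1 ) PDF at theta equals the Beta ( 1, 0.5 ) PDF
# at 1 - theta, which is the reflection property)

print( beta( 0.1, 0.1 ) )
# [1] 19.71464
# (finite, so the Beta ( 0.1, 0.1 ) PDF is indeed proper)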
# let's try ( alpha, beta ) = ( 2, 3 )

lines( theta.grid.1, dbeta( theta.grid.1, 2, 3 ), lwd = 3, lty = 6,
  col = 'brown' )

# now let's start making alpha and beta bigger: ( alpha, beta ) = ( 5, 5 )

# (integer line types in R only run from 0 to 6, so from here on we
# recycle the lty values and let the colors tell the curves apart)

lines( theta.grid.1, dbeta( theta.grid.1, 5, 5 ), lwd = 3, lty = 1,
  col = 'orange' )

# okay, so now we need to adjust the upper 'ylim':

plot( theta.grid.1, dbeta( theta.grid.1, 1, 1 ), type = 'l', lwd = 3,
  lty = 1, col = 'darkcyan', xlab = 'theta', ylab = 'Beta PDF',
  ylim = c( 0, 3.5 ) )
lines( theta.grid.1, dbeta( theta.grid.1, 1, 0.5 ), lwd = 3, lty = 2,
  col = 'red' )
lines( theta.grid.1, dbeta( theta.grid.1, 0.5, 1 ), lwd = 3, lty = 3,
  col = 'blue' )
lines( theta.grid.1, dbeta( theta.grid.1, 0.1, 0.1 ), lwd = 3, lty = 4,
  col = 'black' )
lines( theta.grid.1, dbeta( theta.grid.1, 2, 2 ), lwd = 3, lty = 5,
  col = 'purple' )
lines( theta.grid.1, dbeta( theta.grid.1, 2, 3 ), lwd = 3, lty = 6,
  col = 'brown' )
lines( theta.grid.1, dbeta( theta.grid.1, 5, 5 ), lwd = 3, lty = 1,
  col = 'orange' )

# how about ( alpha, beta ) = ( 5, 10 ) ?

lines( theta.grid.1, dbeta( theta.grid.1, 5, 10 ), lwd = 3, lty = 2,
  col = 'green' )

# and then ( alpha, beta ) = ( 20, 20 ):

lines( theta.grid.1, dbeta( theta.grid.1, 20, 20 ), lwd = 3, lty = 3,
  col = 'steelblue4' )

# now we need to adjust the upper 'ylim' again:

plot( theta.grid.1, dbeta( theta.grid.1, 1, 1 ), type = 'l', lwd = 3,
  lty = 1, col = 'darkcyan', xlab = 'theta', ylab = 'Beta PDF',
  ylim = c( 0, 5.0 ) )
lines( theta.grid.1, dbeta( theta.grid.1, 1, 0.5 ), lwd = 3, lty = 2,
  col = 'red' )
lines( theta.grid.1, dbeta( theta.grid.1, 0.5, 1 ), lwd = 3, lty = 3,
  col = 'blue' )
lines( theta.grid.1, dbeta( theta.grid.1, 0.1, 0.1 ), lwd = 3, lty = 4,
  col = 'black' )
lines( theta.grid.1, dbeta( theta.grid.1, 2, 2 ), lwd = 3, lty = 5,
  col = 'purple' )
lines( theta.grid.1, dbeta( theta.grid.1, 2, 3 ), lwd = 3, lty = 6,
  col = 'brown' )
lines( theta.grid.1, dbeta( theta.grid.1, 5, 5 ), lwd = 3, lty = 1,
  col = 'orange' )
lines( theta.grid.1, dbeta( theta.grid.1, 5, 10 ), lwd = 3, lty = 2,
  col = 'green' )
lines( theta.grid.1, dbeta( theta.grid.1, 20, 20 ), lwd = 3, lty = 3,
  col = 'steelblue4' )

# question: what's happening to the Beta PDFs as alpha and beta grow?

# answer: when alpha = beta = x and x increases, the Beta PDFs
# look more and more like Normal PDFs constrained to live on ( 0, 1 )
# (we check this numerically just below)

# question: what types of shapes are available
# in the Beta ( alpha, beta ) family?

# answer: symmetric?        yes
#         positive skew?    yes
#         negative skew?    yes
#         J-shaped?         yes
#         reverse-J-shaped? yes
#         bimodal?          no
#         U-shaped?         yes
#         flat?             yes
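# here's the promised check of the Normal-limit claim (a sketch added
# here, not in the original script): overlay the Beta ( 20, 20 ) PDF
# with the Normal PDF that has the same mean and SD; the mean and
# variance formulas are the standard Beta ones,
# mean = alpha / ( alpha + beta ) and
# variance = alpha * beta / ( ( alpha + beta )^2 * ( alpha + beta + 1 ) );
# the variable names mu.20 and sigma.20 are just made up for this check

mu.20 <- 20 / ( 20 + 20 )
sigma.20 <- sqrt( 20 * 20 / ( ( 20 + 20 )^2 * ( 20 + 20 + 1 ) ) )

plot( theta.grid.1, dbeta( theta.grid.1, 20, 20 ), type = 'l', lwd = 3,
  lty = 1, col = 'steelblue4', xlab = 'theta', ylab = 'PDF',
  main = 'Beta ( 20, 20 ) Versus the Matching Normal' )
lines( theta.grid.1, dnorm( theta.grid.1, mu.20, sigma.20 ), lwd = 3,
  lty = 2, col = 'red' )

# the two curves are already very close, which previews the
# bernstein-von mises story later in this script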
# the only way to get *multimodality* in the Beta family is by
# creating a *mixture* of 2 or more Beta PDFs:

# extremely useful fact: given *any* two PDFs

# p_1 ( theta ) and p_2 ( theta )

# that share the same support, you can *always* create a new PDF
# by taking a *mixture* (weighted average) of them: pick any
# number 0 < A < 1; then define

# p_{ mixture } ( theta ) = A * p_1 ( theta ) + ( 1 - A ) * p_2 ( theta )

# example: let's mix together the Beta ( 10, 3 ) and Beta ( 3, 10 ) PDFs,
# with mixing weight A = 2 / 3:

par( mfrow = c( 2, 1 ) )

plot( theta.grid.1, dbeta( theta.grid.1, 10, 3 ), type = 'l', lwd = 3,
  lty = 1, col = 'darkcyan', xlab = 'theta', ylab = 'PDF',
  ylim = c( 0, 4 ), main = '2 Beta PDFs' )
lines( theta.grid.1, dbeta( theta.grid.1, 3, 10 ), lwd = 3, lty = 2,
  col = 'red' )

A <- 2 / 3

plot( theta.grid.1, A * dbeta( theta.grid.1, 10, 3 ) +
  ( 1 - A ) * dbeta( theta.grid.1, 3, 10 ), type = 'l', lwd = 3,
  lty = 3, col = 'blue', xlab = 'theta', ylab = 'PDF', ylim = c( 0, 4 ),
  main = 'Mixture of the 2 Beta PDFs Above' )

par( mfrow = c( 1, 1 ) )
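# a brief sketch (an addition, not in the original script) of how to
# *sample* from this mixture: for each draw, first flip a coin with
# P ( heads ) = A to choose a component, then draw from the chosen
# Beta PDF; the seed value below is arbitrary, just for reproducibility

set.seed( 206 )   # arbitrary seed

m <- 100000
component <- rbinom( m, 1, A )   # 1 --> Beta ( 10, 3 ), 0 --> Beta ( 3, 10 )
theta.draws <- ifelse( component == 1, rbeta( m, 10, 3 ),
  rbeta( m, 3, 10 ) )

hist( theta.draws, breaks = 100, probability = TRUE, xlab = 'theta',
  main = 'Draws From the Beta Mixture' )
lines( theta.grid.1, A * dbeta( theta.grid.1, 10, 3 ) +
  ( 1 - A ) * dbeta( theta.grid.1, 3, 10 ), lwd = 3, col = 'blue' )

# the histogram of the draws should closely match the bimodal
# mixture PDF plotted earlier

# covered to here as of fri morning 5 feb 2021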
#################################################################################

# in the quiz 2 IID Bernoulli ( theta ) case study,
# let's see what happens with a low-information
# Beta ( 1, 1 ) = Uniform( 0, 1 ) prior:

# recall that the Beta ( alpha, beta ) PDF is of the form

# c * theta^( alpha - 1 ) * ( 1 - theta )^( beta - 1 )

# the prior is Beta ( alpha*, beta* ) = Beta ( 1, 1 )

# with ( s, n ) = ( 830, 921 ), the likelihood is

# ell ( theta | y, B ) = c * theta^s * ( 1 - theta )^( n - s )
#   = c * theta^[ ( s + 1 ) - 1 ] * ( 1 - theta )^[ ( n - s + 1 ) - 1 ]
#   = Beta ( s + 1, n - s + 1 ) = Beta ( 831, 92 )

# and the posterior is therefore

# p ( theta | y, B ) = c * theta^( alpha* + s - 1 ) * ( 1 - theta )^( beta* + n - s - 1 )
#   = Beta ( alpha* + s, beta* + n - s )
#   = Beta ( 1 + s, 1 + n - s )
#   = Beta ( 831, 92 )

# in other words, with the Uniform( 0, 1 ) prior,
# the posterior and likelihood PDFs coincide

# of course this must be true:

# p ( theta | y, B ) = c * p ( theta | B ) * ell ( theta | y, B )

# if the prior is of the form

# theta ~ Uniform( 0, 1 ) ,

# then

# p ( theta | B ) is proportional to the constant 1,

# and the updating rule becomes

# p ( theta | y, B ) = c * ell ( theta | y, B )

# let's plot the { prior, likelihood, posterior } on the same graph:

s <- 830
n <- 921

alpha.star <- 1
beta.star <- 1

plot( theta.grid.1, dbeta( theta.grid.1, alpha.star + s, beta.star + n - s ),
  type = 'l', lwd = 3, col = 'darkcyan', xlab = 'theta', ylab = 'PDF' )

# let's focus on the interesting theta values between 0.86 and 0.93:

theta.grid.2 <- seq( 0.86, 0.93, length = 500 )

plot( theta.grid.2, dbeta( theta.grid.2, alpha.star + s, beta.star + n - s ),
  type = 'l', lwd = 3, col = 'darkcyan', xlab = 'theta', ylab = 'PDF',
  xlim = c( 0.86, 0.93 ),
  main = 'Quiz 2 Case Study: Bernoulli Bayesian Analysis' )
lines( theta.grid.2, dbeta( theta.grid.2, alpha.star, beta.star ), lwd = 3,
  lty = 2, col = 'red' )
lines( theta.grid.2, dbeta( theta.grid.2, s + 1, n - s + 1 ), lwd = 3,
  lty = 3, col = 'blue' )
legend( 0.865, 35, legend = c( 'prior', 'likelihood', 'posterior' ),
  col = c( 'red', 'blue', 'darkcyan' ), lty = c( 2, 3, 1 ), lwd = 3 )

# with a low-information Uniform( 0, 1 ) prior and a large
# sample size n, the likelihood PDF and the posterior PDF coincide,
# and both are close to Gaussian

# this means that likelihood and bayesian inferences should
# be extremely similar:

# mr. fisher maximizes his likelihood function to get theta.hat.mle;
# in other words, from a bayesian viewpoint he has chosen
# the *mode* of the likelihood PDF as his point estimate;
# but here the likelihood and posterior PDFs are identical,
# so he has also *maximized the posterior* to get his MLE

# this means that what mr. fisher actually calculated, from a
# bayesian point of view, is

# theta.hat.mle = the *maximum a posteriori (MAP)* point estimate

# MAP estimates are popular these days in machine learning,
# in settings in which all you want to compute is a point estimate

# in this situation bayesians would summarize the posterior PDF
# by computing its mean; but with the Normal PDF,

# --> the mode and mean coincide, so

# bayesian posterior mean point estimate = MLE = MAP estimate

# this situation is an instance of an important theoretical result
# called the *bernstein-von mises theorem*

# informally this theorem says that

# when the sample size n is large and context implies the use of
# a low-information (LI) prior,

# --> likelihood and bayesian inferences will approximately coincide;

# but the bayesian story may well involve nasty integrals,
# whereas the likelihood story is based on differentiation;
# so we can 'eat our cake and have it too': when this theorem applies,
# we can (a) make likelihood calculations and (b) interpret
# the results in a bayesian way

#################################################################################

# let's look at the numbers in the quiz 2 case study

# first the likelihood numerical analysis:

print( c( s, n ) )
# [1] 830 921

print( theta.hat.mle <- s / n )
# [1] 0.9011944

plot( theta.grid.2, dbeta( theta.grid.2, alpha.star + s, beta.star + n - s ),
  type = 'l', lwd = 3, col = 'darkcyan', xlab = 'theta', ylab = 'PDF',
  xlim = c( 0.86, 0.93 ),
  main = 'Quiz 2 Case Study: Bernoulli Bayesian Analysis' )
lines( theta.grid.2, dbeta( theta.grid.2, alpha.star, beta.star ), lwd = 3,
  lty = 2, col = 'red' )
lines( theta.grid.2, dbeta( theta.grid.2, s + 1, n - s + 1 ), lwd = 3,
  lty = 3, col = 'blue' )
abline( v = theta.hat.mle, lwd = 2, lty = 4, col = 'black' )
legend( 0.865, 35, legend = c( 'prior', 'likelihood', 'posterior', 'MLE' ),
  col = c( 'red', 'blue', 'darkcyan', 'black' ), lty = c( 2, 3, 1, 4 ),
  lwd = 3 )

print( se.hat.theta.hat.mle <- sqrt( theta.hat.mle * ( 1 - theta.hat.mle ) / n ) )
# [1] 0.009832644

alpha <- 0.001

print( z.alpha <- qnorm( 1 - alpha / 2 ) )
# [1] 3.290527

print( likelihood.confidence.interval <- c(
  theta.hat.mle - z.alpha * se.hat.theta.hat.mle,
  theta.hat.mle + z.alpha * se.hat.theta.hat.mle ) )
# [1] 0.8688398 0.9335489

# let's superimpose the Normal curve with the same mean and SD
# as the ( likelihood, posterior ) PDF

plot( theta.grid.2, dbeta( theta.grid.2, alpha.star + s, beta.star + n - s ),
  type = 'l', lwd = 3, col = 'darkcyan', xlab = 'theta', ylab = 'PDF',
  xlim = c( 0.86, 0.93 ),
  main = 'Quiz 2 Case Study: Bernoulli Bayesian Analysis' )
lines( theta.grid.2, dbeta( theta.grid.2, alpha.star, beta.star ), lwd = 3,
  lty = 2, col = 'red' )
lines( theta.grid.2, dbeta( theta.grid.2, s + 1, n - s + 1 ), lwd = 3,
  lty = 3, col = 'blue' )
abline( v = theta.hat.mle, lwd = 2, lty = 4, col = 'black' )
lines( theta.grid.2, dnorm( theta.grid.2, theta.hat.mle, se.hat.theta.hat.mle ),
  lwd = 3, lty = 5, col = 'orange' )
legend( 0.865, 35,
  legend = c( 'prior', 'likelihood', 'posterior', 'MLE', 'Normal' ),
  col = c( 'red', 'blue', 'darkcyan', 'black', 'orange' ),
  lty = c( 2, 3, 1, 4, 5 ), lwd = 3 )
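# a monte carlo sketch (an addition to the original analysis): we can
# also summarize the posterior by simulation with rbeta( ), which is
# handy in problems where closed-form formulas are not available; here
# it should simply confirm the conjugate calculations that follow; the
# seed value is arbitrary, just for reproducibility

set.seed( 921 )   # arbitrary seed

theta.posterior.draws <- rbeta( 1000000, alpha.star + s, beta.star + n - s )

print( mean( theta.posterior.draws ) )
# should closely match the exact posterior mean computed below

print( sd( theta.posterior.draws ) )
# should closely match the exact posterior SD computed below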
# now the bayesian numerical analysis

# watch out for the notational collision with alpha, between
# the 100 ( 1 - alpha )% confidence level and the first parameter
# of the Beta distribution

alpha.prior <- 1
beta.prior <- 1

alpha.posterior <- alpha.prior + s
beta.posterior <- beta.prior + ( n - s )

print( posterior.mean <- alpha.posterior / ( alpha.posterior + beta.posterior ) )
# [1] 0.900325

# almost identical to theta.hat.mle -- the difference is
# that the Uniform( 0, 1 ) prior added 1 vote for 'yes' and 1 vote
# for 'other' into the posterior, to go along with the data

print( posterior.sd <- sqrt( alpha.posterior * beta.posterior /
  ( ( alpha.posterior + beta.posterior )^2 *
    ( alpha.posterior + beta.posterior + 1 ) ) ) )
# [1] 0.009855003

# again, almost identical to the information-based se.hat.theta.hat.mle

# bayesians typically create a 100 ( 1 - alpha )%
# posterior (*credible*) interval by computing the ( alpha / 2 )
# and ( 1 - alpha / 2 ) quantiles of the posterior distribution:

print( bayesian.credible.interval <- c(
  qbeta( alpha / 2, alpha.posterior, beta.posterior ),
  qbeta( 1 - alpha / 2, alpha.posterior, beta.posterior ) ) )
# [1] 0.8651070 0.9298801

# again, almost identical

# a summary:

#   method       estimate    SD/SE        interval
#
#   likelihood   0.9011944   0.009832644  ( 0.8688398, 0.9335489 )
#   bayes with
#     LI prior   0.900325    0.009855003  ( 0.8651070, 0.9298801 )

# this is the bernstein-von mises theorem in action

# finally, what's the posterior probability that theta > 0.95, given
# the ( data, prior, background ) information?

print( 1 - pbeta( 0.95, alpha.posterior, beta.posterior ) )
# [1] 5.100148e-10

#################################################################################
#################################################################################
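# a closing sensitivity sketch (an addition, not part of the original
# analysis): re-run the bayesian calculations with a different
# low-information prior -- the jeffreys prior for the bernoulli model,
# which is Beta ( 0.5, 0.5 ) -- to see how little the conclusions
# depend on the choice of LI prior when n is large

alpha.posterior.jeffreys <- 0.5 + s
beta.posterior.jeffreys <- 0.5 + ( n - s )

print( alpha.posterior.jeffreys /
  ( alpha.posterior.jeffreys + beta.posterior.jeffreys ) )
# posterior mean of about 0.9008, versus 0.900325 with the
# Uniform( 0, 1 ) prior

print( c( qbeta( alpha / 2, alpha.posterior.jeffreys, beta.posterior.jeffreys ),
  qbeta( 1 - alpha / 2, alpha.posterior.jeffreys, beta.posterior.jeffreys ) ) )
# nearly identical to the credible interval above: with n = 921
# observations, one prior 'observation' more or less barely matters

#################################################################################
#################################################################################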