#############################################################################
# Exploration in R of sampling-model (SM) uncertainty with the NB10
# case study (lecture notes part 6, pp. 123+); dd (23 Feb 2021).
#
# Preliminaries:
#   (1) unbuffer the output (if relevant to your R environment)
#   (2) set the working directory to the place where you'd like PDF versions
#       of plots to be stored; on my home desktop this is
#       setwd('C:/DD/Teaching/AMS-STAT-206/Winter-2021/Sampling-Distribution-Uncertainty')
#   (3) ensure that we start with a completely empty R workspace.
#       NOTE(review): rm(list = ls()) is deliberate in this standalone
#       teaching script, but it wipes the user's entire workspace and is
#       discouraged in shared/production code.
rm(list = ls())

#############################################################################
# Using the nonparametric bootstrap to illustrate sampling-distribution
# uncertainty on the CDF scale.

# The raw NB10 data set: n = 100 repeated weighings of a 10-gram
# check-weight (values in micrograms below 10 g), sorted smallest
# to largest.
y <- c(409, 400, 406, 399, 402, 406, 401, 403, 401, 403,
       398, 403, 407, 402, 401, 399, 400, 401, 405, 402,
       408, 399, 399, 402, 399, 397, 407, 401, 399, 401,
       403, 400, 410, 401, 407, 423, 406, 406, 402, 405,
       405, 409, 399, 402, 407, 406, 413, 409, 404, 402,
       404, 406, 407, 405, 411, 410, 410, 410, 401, 402,
       404, 405, 392, 407, 406, 404, 403, 408, 404, 407,
       412, 406, 409, 400, 408, 404, 401, 404, 408, 406,
       408, 406, 401, 412, 393, 437, 418, 415, 404, 401,
       401, 407, 412, 375, 409, 406, 398, 406, 403, 404)
y <- sort(y)
print(n <- length(y))
# [1] 100

# Plot the empirical CDF of the data vector.
plot(ecdf(y), main = 'NB10 Case Study', xlab = 'y', ylab = 'CDF',
     lwd = 3, col = 'darkcyan')

# The jumps of unequal size in the empirical CDF arise because there are
# a lot of tied values in the data set; the built-in function 'table'
# creates a raw frequency-distribution summary of a data vector.
table(y)
# y
# 375 392 393 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 415 418 423 437
#   1   1   1   1   2   7   4  12   8   6   9   5  12   8   5   5   4   1   3   1   1   1   1   1

number.of.bootstrap.replications <- 1000
set.seed( 4391557 )

# With the nonparametric bootstrap we take IID samples *from the sample*
# (the data vector y) to get an idea of what a new data set (also of size
# n = 100) would look like when generated by the same data-generating
# process that gave us y.
for ( i in 1:number.of.bootstrap.replications ) {
  y.star <- sample( y, replace = TRUE )
  lines( ecdf( y.star ), lwd = 1, lty = 3, col = 'grey', pch = 1 )
}

# Re-draw the data ECDF on top of the grey bootstrap cloud, then
# superimpose the Normal CDF with the same mean and SD as y.
lines( ecdf( y ), lwd = 3, col = 'darkcyan' )
y.grid <- seq( min( y ), max( y ), length.out = 500 )
lines( y.grid, pnorm( y.grid, mean( y ), sd( y ) ),
       lty = 2, lwd = 3, col = 'red' )
legend( 370, 0.8, legend = c( 'Data ECDF', 'Bootstrap ECDFs', 'Normal' ),
        col = c( 'darkcyan', 'grey', 'red' ), lty = c( 1, 3, 2 ), lwd = 3 )

# Conclusion from this plot: the Normal CDF with the same mean and SD as
# our data vector y is inconsistent with the bootstrap uncertainty band
# around the underlying CDF F of the data-generating process.

#############################################################################
# Using the nonparametric bootstrap to illustrate sampling-distribution
# uncertainty on the PDF scale.

set.seed( 4391557 )

# We can do something similar on the density scale: kernel density
# estimates of the data and of each bootstrap replicate data set.
plot( density( y ), main = 'NB10 Case Study', xlab = 'y', ylab = 'PDF',
      lwd = 3, col = 'darkcyan', ylim = c( 0, 0.135 ) )

for ( i in 1:number.of.bootstrap.replications ) {
  y.star <- sample( y, replace = TRUE )
  lines( density( y.star ), lwd = 1, lty = 3, col = 'grey', pch = 1 )
}

lines( density( y ), lwd = 3, col = 'darkcyan' )
lines( y.grid, dnorm( y.grid, mean( y ), sd( y ) ),
       lty = 2, lwd = 3, col = 'red' )
legend( 415, 0.12, legend = c( 'Data PDF', 'Bootstrap PDFs', 'Normal' ),
        col = c( 'darkcyan', 'grey', 'red' ), lty = c( 1, 3, 2 ), lwd = 3 )

# Again, the Normal PDF with the same mean and SD as our data vector y is
# inconsistent with the bootstrap uncertainty around the underlying PDF
# of the data-generating process.

#############################################################################
# Using the *parametric* bootstrap to illustrate sampling-distribution
# uncertainty on the Normal qqplot scale.

qqnorm.data.results <- qqnorm( y, plot.it = FALSE )

# Set up an empty plotting frame sized to hold the qqplots.
plot( 0, 0, type = 'n', main = 'NB10 Case Study',
      xlab = 'Normal Quantiles', ylab = 'Data Quantiles',
      xlim = c( -3, 3 ), ylim = c( min( y ), max( y ) ) )

set.seed( 4391557 )

# Here I'm generating many random samples from the Normal distribution
# with the same mean and SD as the observed data vector, and superimposing
# Normal qqplots based on those random samples; this provides an
# uncertainty envelope around the Normal distribution with an n of 100.
for ( i in 1:number.of.bootstrap.replications ) {
  y.star <- sort( rnorm( n, mean( y ), sd( y ) ) )
  qqnorm.bootstrap.results <- qqnorm( y.star, plot.it = FALSE )
  lines( qqnorm.bootstrap.results$x, qqnorm.bootstrap.results$y,
         lty = 3, lwd = 1, col = 'grey' )
}

# The target Normal shape is the straight line with intercept mean(y)
# and slope sd(y).
abline( mean( y ), sd( y ), lwd = 3, col = 'blue', lty = 2 )

# Now superimpose the Normal qqplot of our actual data set.
lines( qqnorm.data.results$x, qqnorm.data.results$y,
       lty = 1, lwd = 5, col = 'darkcyan' )
legend( -2.5, 430,
        legend = c( 'Data Normal QQplot', 'Bootstrap Normal QQplots',
                    'Target Normal Shape' ),
        col = c( 'darkcyan', 'grey', 'blue' ), lty = c( 1, 3, 2 ), lwd = 3 )

# Again, the Normal distribution with the same mean and SD as our data
# vector y is inconsistent with the parametric-bootstrap uncertainty
# envelope: the data qqplot escapes the envelope in both tails.

#############################################################################
# Conclusion: from all three plots, this data set is not consistent with
# an IID Normal sampling-distribution assumption.
#############################################################################
#############################################################################