#############################################################################
# Exploration in R of sampling-model (SM) uncertainty with the NB10
# case study (lecture notes part 6, pp. 123+); dd (23 Feb 2021).
#
# Preliminaries:
#   (1) unbuffer the output (if relevant to your R environment)
#   (2) set the working directory to the place where you'd like PDF versions
#       of plots to be stored; on my home desktop this is
#       setwd('C:/DD/Teaching/AMS-STAT-206/Winter-2021/Sampling-Distribution-Uncertainty')
#   (3) ensure that we start with a completely empty R workspace.
#       NOTE(review): rm(list = ls()) is deliberate in this standalone
#       teaching script, but it wipes the user's entire workspace and is
#       discouraged in shared/production code.
rm(list = ls())

#############################################################################
# Using the nonparametric bootstrap to illustrate sampling-distribution
# uncertainty on the CDF scale.

# The raw NB10 data set: n = 100 repeated weighings of a 10-gram
# check-weight (values in micrograms below 10 g), sorted smallest
# to largest.
y <- c(409, 400, 406, 399, 402, 406, 401, 403, 401, 403,
       398, 403, 407, 402, 401, 399, 400, 401, 405, 402,
       408, 399, 399, 402, 399, 397, 407, 401, 399, 401,
       403, 400, 410, 401, 407, 423, 406, 406, 402, 405,
       405, 409, 399, 402, 407, 406, 413, 409, 404, 402,
       404, 406, 407, 405, 411, 410, 410, 410, 401, 402,
       404, 405, 392, 407, 406, 404, 403, 408, 404, 407,
       412, 406, 409, 400, 408, 404, 401, 404, 408, 406,
       408, 406, 401, 412, 393, 437, 418, 415, 404, 401,
       401, 407, 412, 375, 409, 406, 398, 406, 403, 404)
y <- sort(y)
print(n <- length(y))
# [1] 100

# Plot the empirical CDF of the data vector.
plot(ecdf(y), main = 'NB10 Case Study', xlab = 'y', ylab = 'CDF',
     lwd = 3, col = 'darkcyan')

# The jumps of unequal size in the empirical CDF arise because there are
# a lot of tied values in the data set; the built-in function 'table'
# creates a raw frequency-distribution summary of a data vector.
table(y)
# y
# 375 392 393 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 415 418 423 437
#   1   1   1   1   2   7   4  12   8   6   9   5  12   8   5   5   4   1   3   1   1   1   1   1

number.of.bootstrap.replications <- 1000
set.seed( 4391557 )

# With the nonparametric bootstrap we take IID samples *from the sample*
# (the data vector y) to get an idea of what a new data set (also of size
# n = 100) would look like when generated by the same data-generating
# process that gave us y.
for ( i in 1:number.of.bootstrap.replications ) {
  y.star <- sample( y, replace = TRUE )
  lines( ecdf( y.star ), lwd = 1, lty = 3, col = 'grey', pch = 1 )
}

# Re-draw the data ECDF on top of the grey bootstrap cloud, then
# superimpose the Normal CDF with the same mean and SD as y.
lines( ecdf( y ), lwd = 3, col = 'darkcyan' )
y.grid <- seq( min( y ), max( y ), length.out = 500 )
lines( y.grid, pnorm( y.grid, mean( y ), sd( y ) ),
       lty = 2, lwd = 3, col = 'red' )
legend( 370, 0.8, legend = c( 'Data ECDF', 'Bootstrap ECDFs', 'Normal' ),
        col = c( 'darkcyan', 'grey', 'red' ), lty = c( 1, 3, 2 ), lwd = 3 )

# Conclusion from this plot: the Normal CDF with the same mean and SD as
# our data vector y is inconsistent with the bootstrap uncertainty band
# around the underlying CDF F of the data-generating process.

#############################################################################
# Using the nonparametric bootstrap to illustrate sampling-distribution
# uncertainty on the PDF scale.

set.seed( 4391557 )

# We can do something similar on the density scale: kernel density
# estimates of the data and of each bootstrap replicate data set.
plot( density( y ), main = 'NB10 Case Study', xlab = 'y', ylab = 'PDF',
      lwd = 3, col = 'darkcyan', ylim = c( 0, 0.135 ) )

for ( i in 1:number.of.bootstrap.replications ) {
  y.star <- sample( y, replace = TRUE )
  lines( density( y.star ), lwd = 1, lty = 3, col = 'grey', pch = 1 )
}

lines( density( y ), lwd = 3, col = 'darkcyan' )
lines( y.grid, dnorm( y.grid, mean( y ), sd( y ) ),
       lty = 2, lwd = 3, col = 'red' )
legend( 415, 0.12, legend = c( 'Data PDF', 'Bootstrap PDFs', 'Normal' ),
        col = c( 'darkcyan', 'grey', 'red' ), lty = c( 1, 3, 2 ), lwd = 3 )

# Again, the Normal PDF with the same mean and SD as our data vector y is
# inconsistent with the bootstrap uncertainty around the underlying PDF
# of the data-generating process.

#############################################################################
# Using the *parametric* bootstrap to illustrate sampling-distribution
# uncertainty on the Normal qqplot scale.

qqnorm.data.results <- qqnorm( y, plot.it = FALSE )

# Set up an empty plotting frame sized to hold the qqplots.
plot( 0, 0, type = 'n', main = 'NB10 Case Study',
      xlab = 'Normal Quantiles', ylab = 'Data Quantiles',
      xlim = c( -3, 3 ), ylim = c( min( y ), max( y ) ) )

set.seed( 4391557 )

# Here I'm generating many random samples from the Normal distribution
# with the same mean and SD as the observed data vector, and superimposing
# Normal qqplots based on those random samples; this provides an
# uncertainty envelope around the Normal distribution with an n of 100.
for ( i in 1:number.of.bootstrap.replications ) {
  y.star <- sort( rnorm( n, mean( y ), sd( y ) ) )
  qqnorm.bootstrap.results <- qqnorm( y.star, plot.it = FALSE )
  lines( qqnorm.bootstrap.results$x, qqnorm.bootstrap.results$y,
         lty = 3, lwd = 1, col = 'grey' )
}

# The target Normal shape is the straight line with intercept mean(y)
# and slope sd(y).
abline( mean( y ), sd( y ), lwd = 3, col = 'blue', lty = 2 )

# Now superimpose the Normal qqplot of our actual data set.
lines( qqnorm.data.results$x, qqnorm.data.results$y,
       lty = 1, lwd = 5, col = 'darkcyan' )
legend( -2.5, 430,
        legend = c( 'Data Normal QQplot', 'Bootstrap Normal QQplots',
                    'Target Normal Shape' ),
        col = c( 'darkcyan', 'grey', 'blue' ), lty = c( 1, 3, 2 ), lwd = 3 )

# Again, the Normal distribution with the same mean and SD as our data
# vector y is inconsistent with the parametric-bootstrap uncertainty
# envelope: the data qqplot escapes the envelope in both tails.

#############################################################################
# Conclusion: from all three plots, this data set is not consistent with
# an IID Normal sampling-distribution assumption.
#############################################################################
#############################################################################