##################################################################################
##################################################################################

# stat 206: inferential analysis in R of the captopril data
# dd (26 jan 2021)

# preamble

# unbuffer the output (if relevant in your R environment)

# change directory to where you downloaded the file 'captopril-data.txt'
# on my home desktop this may be accomplished with the function call

# setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2021' )

##################################################################################

# read in the within-patient systolic blood pressure differences

y <- scan( 'captopril-data.txt' )
# Read 12 items

# graphically examine the plausibility of
# the assumption that the population PDF is Normal

qqnorm( y, main = 'Captopril: Systolic Blood Pressure Differences' )
qqline( y, lwd = 3, col = 'darkcyan' )

# the points do approximately follow the line; the assumption is plausible

# calculate numerical descriptive summaries

print( Y.bar.n <- mean( y ) )
# [1] 18.58333

print( s.D <- sd( y ) )
# [1] 10.10363

# take the sample size from the data rather than hard-coding 12,
# so the same script works unchanged on other data sets

n <- length( y )

# calculate inferential summaries

print( estimated.SE.of.Y.bar.n <- s.D / sqrt( n ) )
# [1] 2.915619

alpha <- 0.001

print( z.alpha <- qnorm( 1 - alpha / 2 ) )
# [1] 3.290527

print( t.alpha.n.minus.1 <- qt( 1 - alpha / 2, n - 1 ) )
# [1] 4.436979

# compare the t and z critical values using the stored full-precision
# quantities, not retyped rounded values

t.alpha.n.minus.1 / z.alpha
# [1] 1.34841

# the correct t interval is 35% wider
# than the z interval, which pretends
# that we don't have any price to pay
# for { cheating by using the sample SD
# in place of the unknown population SD }

print( confidence.interval <-
  c( Y.bar.n - t.alpha.n.minus.1 * estimated.SE.of.Y.bar.n,
     Y.bar.n + t.alpha.n.minus.1 * estimated.SE.of.Y.bar.n ) )
# [1]  5.642144 31.524523

# another way to get the CI in R

t.test( y, conf.level = 1 - alpha )

#         One Sample t-test
#
# data:  y
# t = 6.3714, df = 11, p-value = 5.285e-05
# alternative hypothesis: true mean is not equal to 0
# 99.9 percent confidence interval:
#   5.642144 31.524523
# sample estimates:
# mean of x
#  18.58333

# using the bootstrap to check the uncertainty of the qqnorm plot
# with our n of only 12

# for this to work, y must first be sorted from smallest to largest

y <- sort( y )

qqnorm( y, main = 'Captopril: Systolic Blood Pressure Differences' )

str( qqnorm.data.results <- qqnorm( y, plot.it = FALSE ) )

# List of 2
#  $ x: num [1:12] -0.812 -1.15 0.105 -1.732 -0.105 ...
#  $ y: num [1:12] 9 4 21 3 20 31 17 26 26 10 ...

# set up an empty plotting region; the bootstrap replicate
# qqplots will be overlaid on it below

plot( 0, 0, type = 'n', xlab = 'Standard Normal Quantiles',
  ylab = 'Data Quantiles', xlim = c( -2, 2 ), ylim = c( 0, 35 ),
  main = 'Captopril: Systolic Blood Pressure Differences' )

number.of.bootstrap.replicates <- 1000

# fix the random number seed so the figure is reproducible

set.seed( 314159265 )

for ( i in seq_len( number.of.bootstrap.replicates ) ) {

  # a bootstrap sample is a with-replacement resample of y
  # of the same size as y

  y.star <- sort( sample( y, replace = TRUE ) )

  qqnorm.bootstrap.results <- qqnorm( y.star, plot.it = FALSE )

  lines( qqnorm.bootstrap.results$x, qqnorm.bootstrap.results$y,
    lty = 3, lwd = 1, col = 'gray40' )

}

# overlay the qqplot of the actual data on top of the bootstrap band

lines( qqnorm.data.results$x, qqnorm.data.results$y, lty = 1, lwd = 5,
  col = 'darkcyan' )

qqline( y, lwd = 3, col = 'blue', lty = 2 )

# this is a version of the normal qqplot in which the sorted data points
# have been connected with small line segments

# with only n = 12 observations, there's a lot of overlap
# in the bootstrap samples, but you can see pretty clearly
# that our actual data set is consistent with the
# assumption of Normality of the population PDF

# illustration of what happens to the bootstrap
# uncertainty bands with a larger value of n

n.illustration <- 100

# simulate captopril-like data: Normal with the sample mean and SD of y

y.illustration <- sort( rnorm( n.illustration, mean( y ), sd( y ) ) )

plot( 0, 0, type = 'n', xlab = 'Standard Normal Quantiles',
  ylab = 'Data Quantiles', xlim = c( -3, 3 ),
  ylim = c( min( y.illustration ), max( y.illustration ) ),
  main = paste0( 'Simulation With Captopril-Like Data With n = ',
    n.illustration ) )

number.of.bootstrap.replicates <- 1000

set.seed( 307157 )

for ( i in seq_len( number.of.bootstrap.replicates ) ) {

  y.star <- sort( sample( y.illustration, replace = TRUE ) )

  qqnorm.bootstrap.results <- qqnorm( y.star, plot.it = FALSE )

  lines( qqnorm.bootstrap.results$x, qqnorm.bootstrap.results$y,
    lty = 3, lwd = 1, col = 'gray' )

}

qqnorm.illustration.results <- qqnorm( y.illustration, plot.it = FALSE )

lines( qqnorm.illustration.results$x, qqnorm.illustration.results$y,
  lty = 1, lwd = 5, col = 'darkcyan' )

qqline( y.illustration, lwd = 3, col = 'blue', lty = 2 )

# with n = 100 observations the sampling uncertainty in
# the normal qqplot is much smaller than with n = 12,
# and you can also now see clearly that the uncertainty
# is largest at the left and right edges of the data
# on the horizontal axis, which makes sense

##################################################################################
##################################################################################