##################################################################################
##################################################################################

# stat 206: inferential analysis in R of the captopril data
# dd (26 jan 2021)

# preamble

# unbuffer the output (if relevant in your R environment)

# change directory to where you downloaded the file 'captopril-data.txt'
# on my home desktop this may be accomplished with the function call

# setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2021' )

##################################################################################

# read in the within-patient systolic blood pressure differences

y <- scan( 'captopril-data.txt' )
# Read 12 items

# graphically examine the plausibility of
# the assumption that the population PDF is Normal

qqnorm( y, main = 'Captopril: Systolic Blood Pressure Differences' )
qqline( y, lwd = 3, col = 'darkcyan' )

# the points do approximately follow the line; the assumption is plausible

# calculate numerical descriptive summaries

print( Y.bar.n <- mean( y ) )
# [1] 18.58333

print( s.D <- sd( y ) )
# [1] 10.10363

# take the sample size from the data rather than hard-coding 12,
# so the same script works unchanged on other data sets

n <- length( y )

# calculate inferential summaries

print( estimated.SE.of.Y.bar.n <- s.D / sqrt( n ) )
# [1] 2.915619

alpha <- 0.001

print( z.alpha <- qnorm( 1 - alpha / 2 ) )
# [1] 3.290527

print( t.alpha.n.minus.1 <- qt( 1 - alpha / 2, n - 1 ) )
# [1] 4.436979

# compare the t and z critical values using the stored full-precision
# quantities, not retyped rounded values

t.alpha.n.minus.1 / z.alpha
# [1] 1.34841

# the correct t interval is 35% wider
# than the z interval, which pretends
# that we don't have any price to pay
# for { cheating by using the sample SD
# in place of the unknown population SD }

print( confidence.interval <-
  c( Y.bar.n - t.alpha.n.minus.1 * estimated.SE.of.Y.bar.n,
     Y.bar.n + t.alpha.n.minus.1 * estimated.SE.of.Y.bar.n ) )
# [1]  5.642144 31.524523

# another way to get the CI in R

t.test( y, conf.level = 1 - alpha )

#         One Sample t-test
#
# data:  y
# t = 6.3714, df = 11, p-value = 5.285e-05
# alternative hypothesis: true mean is not equal to 0
# 99.9 percent confidence interval:
#   5.642144 31.524523
# sample estimates:
# mean of x
#  18.58333

# using the bootstrap to check the uncertainty of the qqnorm plot
# with our n of only 12

# for this to work, y must first be sorted from smallest to largest

y <- sort( y )

qqnorm( y, main = 'Captopril: Systolic Blood Pressure Differences' )

str( qqnorm.data.results <- qqnorm( y, plot.it = FALSE ) )

# List of 2
#  $ x: num [1:12] -0.812 -1.15 0.105 -1.732 -0.105 ...
#  $ y: num [1:12] 9 4 21 3 20 31 17 26 26 10 ...

# set up an empty plotting region; the bootstrap replicate
# qqplots will be overlaid on it below

plot( 0, 0, type = 'n', xlab = 'Standard Normal Quantiles',
  ylab = 'Data Quantiles', xlim = c( -2, 2 ), ylim = c( 0, 35 ),
  main = 'Captopril: Systolic Blood Pressure Differences' )

number.of.bootstrap.replicates <- 1000

# fix the random number seed so the figure is reproducible

set.seed( 314159265 )

for ( i in seq_len( number.of.bootstrap.replicates ) ) {

  # a bootstrap sample is a with-replacement resample of y
  # of the same size as y

  y.star <- sort( sample( y, replace = TRUE ) )

  qqnorm.bootstrap.results <- qqnorm( y.star, plot.it = FALSE )

  lines( qqnorm.bootstrap.results$x, qqnorm.bootstrap.results$y,
    lty = 3, lwd = 1, col = 'gray40' )

}

# overlay the qqplot of the actual data on top of the bootstrap band

lines( qqnorm.data.results$x, qqnorm.data.results$y, lty = 1, lwd = 5,
  col = 'darkcyan' )

qqline( y, lwd = 3, col = 'blue', lty = 2 )

# this is a version of the normal qqplot in which the sorted data points
# have been connected with small line segments

# with only n = 12 observations, there's a lot of overlap
# in the bootstrap samples, but you can see pretty clearly
# that our actual data set is consistent with the
# assumption of Normality of the population PDF

# illustration of what happens to the bootstrap
# uncertainty bands with a larger value of n

n.illustration <- 100

# simulate captopril-like data: Normal with the sample mean and SD of y

y.illustration <- sort( rnorm( n.illustration, mean( y ), sd( y ) ) )

plot( 0, 0, type = 'n', xlab = 'Standard Normal Quantiles',
  ylab = 'Data Quantiles', xlim = c( -3, 3 ),
  ylim = c( min( y.illustration ), max( y.illustration ) ),
  main = paste0( 'Simulation With Captopril-Like Data With n = ',
    n.illustration ) )

number.of.bootstrap.replicates <- 1000

set.seed( 307157 )

for ( i in seq_len( number.of.bootstrap.replicates ) ) {

  y.star <- sort( sample( y.illustration, replace = TRUE ) )

  qqnorm.bootstrap.results <- qqnorm( y.star, plot.it = FALSE )

  lines( qqnorm.bootstrap.results$x, qqnorm.bootstrap.results$y,
    lty = 3, lwd = 1, col = 'gray' )

}

qqnorm.illustration.results <- qqnorm( y.illustration, plot.it = FALSE )

lines( qqnorm.illustration.results$x, qqnorm.illustration.results$y,
  lty = 1, lwd = 5, col = 'darkcyan' )

qqline( y.illustration, lwd = 3, col = 'blue', lty = 2 )

# with n = 100 observations the sampling uncertainty in
# the normal qqplot is much smaller than with n = 12,
# and you can also now see clearly that the uncertainty
# is largest at the left and right edges of the data
# on the horizontal axis, which makes sense

##################################################################################
##################################################################################