############################################################################### ############################################################################### # stat 206, winter 2021 # dd 16 mar 2021 # R code for a partial analysis of the spam data set # in THT 3 problem 2(B) # fitting maximum-likelihood and bayesian logistic regression models # to predict a dichotomous outcome from some quantitative predictors # (0) each new task for you below begins with a number, e.g., (1); # please ensure that you don't miss any of these numbers # as you work sequentially through the analysis from the top # to the bottom of this file # also: there are a number of stop signs below; most of them # look like this: # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # i *highly* recommend that you do what the stop signs suggest, # each time that they appear; this will break the analysis down # into bite-sized pieces ################## the first block of code starts here ######################## ##### data set context and description ######################################## # (1) please begin this analysis by studying the file # 'spam-data-context.txt' # on the course web page, for important context and data description # we have 4601 email messages, each marked 'spam' or non-spam ('ham') # in a gold-standard manner, and 58 other variables: # 57 of them are potential predictors of the dichotomous (1/0) outcome # 'spam', and 1 is a dichotomy that records whether each email message # belongs either to the randomly-partitioned modeling or validation # subsets for cross-validation purposes # note: in machine learning the usual terminology is 'training' for # what i call 'modeling' and 'test' for what i call 'validation', # and where statisticians would say 'predictor variable', # machine learners would usually say 'feature' or 'attribute' # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # (2) make a new (empty) directory in which to store the input and output # files for this analysis, and download into this directory, from the # course website, all of the .txt files with the word 'spam' in their # names and/or descriptions # on my desktop at home this directory is called # 'C:/DD/Teaching/AMS-STAT-206/Winter-2021/Spam-Data-Analysis' # please stop here and ensure that everything so far has been # completed successfully ##### preamble ################################################################ # (3) start up a fresh copy of R, with no R objects in your current session; # verify this with the function call ls( ) # R should reply # character(0) # if it doesn't, issue the function calls rm( list = ls( ) ) ; ls( ) # now R should reply # character(0) # please stop here and ensure that everything so far has been # completed successfully # (4) if relevant on your machine (e.g., Windows people using R instead # of RStudio), unbuffer the output # please stop here and ensure that everything so far has been # completed successfully # (5) set your working directory to the place where you downloaded # the 'spam' files # as noted above, on my desktop at home this is # setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2021/Spam-Data-Analysis' ) # please stop here and ensure that everything so far has been # completed successfully # (6) the next command reads in code for some functions we'll use below source( 'spam-analysis-functions-R.txt' ) # please stop here and ensure that everything so far 
has been # completed successfully # (7) the next command calls a function defined in # 'spam-analysis-functions-R.txt' that automatically # installs a package (if it has not previously been installed) # and brings it into the current R session with 'library'; if it # *has* already been installed the function just invokes 'library' dynamic.require( 'gtools' ) # the output of this function should end with # "done" # please stop here and ensure that everything so far has been # completed successfully ##### read in the data set and examine its structure ########################## # (8) let's get the data set into R and see what we've got: raw.data <- read.table( 'spam.txt', header = T ) str( raw.data ) # 'data.frame': 4601 obs. of 59 variables: # $ spam : logi TRUE TRUE TRUE TRUE TRUE TRUE ... # $ testid : logi TRUE FALSE TRUE FALSE FALSE FALSE ... # $ make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ... # $ address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ... # $ all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ... # $ X3d : num 0 0 0 0 0 0 0 0 0 0 ... # $ our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ... # $ over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ... # $ remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ... # $ internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ... # $ order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ... # $ mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ... # $ receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ... # $ will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ... # $ people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ... # $ report : num 0 0.21 0 0 0 0 0 0 0 0 ... # $ addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ... # $ free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ... # $ business : num 0 0.07 0.06 0 0 0 0 0 0 0 ... # $ email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ... # $ you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ... # $ credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ... # $ your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ... # $ font : num 0 0 0 0 0 0 0 0 0 0 ... # $ X000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ... # $ money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ... # $ hp : num 0 0 0 0 0 0 0 0 0 0 ... # $ hpl : num 0 0 0 0 0 0 0 0 0 0 ... # $ george : num 0 0 0 0 0 0 0 0 0 0 ... # $ X650 : num 0 0 0 0 0 0 0 0 0 0 ... # $ lab : num 0 0 0 0 0 0 0 0 0 0 ... # $ labs : num 0 0 0 0 0 0 0 0 0 0 ... # $ telnet : num 0 0 0 0 0 0 0 0 0 0 ... # $ X857 : num 0 0 0 0 0 0 0 0 0 0 ... # $ data : num 0 0 0 0 0 0 0 0 0.15 0 ... # $ X415 : num 0 0 0 0 0 0 0 0 0 0 ... # $ X85 : num 0 0 0 0 0 0 0 0 0 0 ... # $ technology: num 0 0 0 0 0 0 0 0 0 0 ... # $ X1999 : num 0 0.07 0 0 0 0 0 0 0 0 ... # $ parts : num 0 0 0 0 0 0 0 0 0 0 ... # $ pm : num 0 0 0 0 0 0 0 0 0 0 ... # $ direct : num 0 0 0.06 0 0 0 0 0 0 0 ... # $ cs : num 0 0 0 0 0 0 0 0 0 0 ... # $ meeting : num 0 0 0 0 0 0 0 0 0 0 ... # $ original : num 0 0 0.12 0 0 0 0 0 0.3 0 ... # $ project : num 0 0 0 0 0 0 0 0 0 0.06 ... # $ re : num 0 0 0.06 0 0 0 0 0 0 0 ... # $ edu : num 0 0 0.06 0 0 0 0 0 0 0 ... # $ table : num 0 0 0 0 0 0 0 0 0 0 ... # $ conference: num 0 0 0 0 0 0 0 0 0 0 ... # $ ch. : num 0 0 0.01 0 0 0 0 0 0 0.04 ... # $ ch..1 : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ... # $ ch..2 : num 0 0 0 0 0 0 0 0 0 0 ... # $ ch..3 : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ... # $ ch..4 : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ... # $ ch..5 : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ... # $ crl.ave : num 3.76 5.11 9.82 3.54 3.54 ... 
# $ crl.long : int 61 101 485 40 40 15 4 11 445 43 ... # $ crl.tot : int 278 1028 2259 191 191 54 112 49 1257 749 ... # the data frame 'raw.data' behaves in this case study like a matrix # with 4601 rows (one for each email message) and 59 columns (one for # each variable measured on the email messages), except that it's not # the case that every entry in the matrix is a real number # the 54 predictors from 'make' to 'ch..5' are expressed as percentages # from 0 to 100 # to avoid variable names beginning with a number, which are not allowed # in R, note that the variables 'X000', 'X650', 'X857', 'X415','X85', # and 'X1999' all have names starting with 'X' # in 'crl.ave', 'crl.long', and 'crl.tot', the character string # 'crl' is short for 'capital run length'; in other words, 'crl.ave' # is the average length in each email message of consecutive runs # of capital letters, 'crl.long' is the longest run of consecutive # capital letters in the email message, and 'crl.tot' is the total # number of consecutive capital letters in each message # note: ch. = ; # ch..1 = ( # ch..2 = [ # ch..3 = ! # ch..4 = $ # ch..5 = # # note that the variables 'spam' and 'testid' are of type 'logi' # (short for 'logical'), with possible values 'TRUE' and 'FALSE'; # the variables 'crl.long' and 'crl.tot' are of type 'int' (short # for 'integer'), with possible values 0, 1, ...; and all of the other # variables are of type 'num' (short for 'numerical'), with possible # values of the form # any real number truncated to about 16 significant figures # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ##### data curation preceding the modeling, part (I) ########################## # (9) now let's begin getting the data ready for modeling # the next function call examines all 4601 * 59 = 271459 values in # the data frame 'raw.data' and counts the total number of missing values # in the data set (recall that 'NA' (not available) is the missing data # code in R, so the function that looks for missing values is called # 'is.na'; it returns a logical matrix of 271459 TRUE and FALSE values # (TRUE if the data frame entry is 'NA', FALSE otherwise), which are # converted to 1s (TRUE) and 0s (FALSE) by the 'sum' function: sum( is.na( raw.data ) ) # [1] 0 # the total number of missing values in 'raw.data' is 0; excellent # in a follow-on class to STAT 206, we would need several lectures # to discuss how to handle missingness in simple ways, and quite # a few more lectures to discuss advanced methods for coping with # missing data print( n <- nrow( raw.data ) ) # [1] 4601 # this is of course the number of documents (email messages) we have print( k <- ncol( raw.data ) - 2 ) # [1] 57 # and this is of course the number of predictors we have # the next command recodes the outcome variable to numeric from logical raw.data$spam <- ifelse( raw.data$spam == T, 1, 0 ) # the next command computes the relative frequency distribution of 'spam' with( raw.data, table( spam ) / n ) # spam # 0 1 # 0.6059552 0.3940448 # so we have about 39% 1s and 61% 0s; that should be enough of each # category to have a decent chance of predicting the 1s # in some of the modeling we'll do below, we'll need *standardized* # versions of all of the predictor variables: to standardize a predictor x # we compute # x.standardized <- ( x - mean( x ) ) / sd( x ) , # which makes x.standardized have mean 0 and SD 1 # since this is a linear transformation, it can't increase or decrease # the amount of 
signal for predicting an outcome y from x; what it *does* do # is put the regression coefficients on a common scale in the # *logistic regression* models we're going to fit scaled.X.data.frame <- as.data.frame( scale( raw.data[ , 3:( k + 2 ) ] ) ) # scaled.X.data.frame is now a data frame (not a matrix) consisting of # all of the predictor variables, scaled as above spam <- raw.data$spam scaled.data.frame <- cbind( spam, scaled.X.data.frame ) # and 'scaled.data.frame' now has all 57 standardized predictors # and the outcome # the next command computes the relative frequency distribution of 'testid' with( raw.data, table( testid ) / n ) # testid # FALSE TRUE # 0.6661595 0.3338405 # i use 'T' and 'F' as abbreviations for 'TRUE' and 'FALSE'; # R allows us to do this in our code as well as when describing our code # the people who created this data set put almost exactly 2/3 # of the data in the modeling subset ('testid' = F) and 1/3 # in the validation subset ('testid' = T); this is a reasonable choice # another reasonable choice would be 50/50; the research literature # is still unsettled over the question of the optimal modeling/validation # split # the next two commands extract the modeling and validation subsets; # our plan in this analysis is to use 2-way cross-validation, i.e., # to build predictive models on the 'modeling' data and see how well # they perform on the 'validation' data raw.data.modeling <- raw.data[ ! raw.data$testid, ] print( n.modeling <- dim( raw.data.modeling )[ 1 ] ) # [1] 3065 raw.data.validation <- raw.data[ raw.data$testid, ] print( n.validation <- dim( raw.data.validation )[ 1 ] ) # [1] 1536 scaled.data.modeling <- scaled.data.frame[ ! raw.data$testid, ] scaled.data.validation <- scaled.data.frame[ raw.data$testid, ] # from now on, for quite awhile, we're only going to work with # the modeling subset, leaving the validation subset untouched # to find out how good our modeling is on data not used in # the fitting process # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ##### data curation preceding the modeling, part (II) ######################### ##### initial model-building on the 'modeling' subset ######################### # (10) now we can begin looking at the distributions of # our predictor variables # at this point in careful statistical data science work, we would # examine each predictor variable one by one, to see if # *non-linearly transformed* versions of any of the variables might have # more predictive signal in them than the raw versions do # since this is quite time-consuming here with 57 predictors, # i'm only going to ask you to examine several of them, to give you # a sense of how this process goes # we want to make numerical and graphical descriptive summaries # (a) of all of the predictors univariately and (b) of the # bivariate relationships between each of the predictors and the outcome # the built-in function 'summary' is a good place to start # please examine the file # 'spam-data-analysis-variable-summary.txt' # on the course web page, which includes a summary of all 59 variables # obtained with the command summary( raw.data.modeling ) # study this file; then run this code: for ( i in 1:57 ) { if ( mean( raw.data.modeling[ , i + 2 ] ) > median( raw.data.modeling[ , i + 2 ] ) ) { print( paste( 'yes, mean > median for predictor ', i, sep = '' ) ) } else { print( paste( 'no, mean <= median for predictor ', i, sep = '' ) ) } } for ( i in 1:57 ) { if ( min( 
raw.data.modeling[ , i + 2 ] ) >= 0 ) { print( paste( 'yes, min >= 0 for predictor ', i, sep = '' ) ) } else { print( paste( 'no, min < 0 for predictor ', i, sep = '' ) ) } } # note that *every one* of the 57 predictors has its mean bigger # than its median while at the same time being non-negative; # what does that tell you about the histograms shapes? # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ################## the first block of code ends here ######################## ################## the second block of code begins here ##################### # (11) to see all 57 histograms of the predictor variables below, # you may need to adjust the choice of 'n.histogram.matrix' # and fiddle with the rest of the histogram code, # if your plotting window is too small, or you could try # resizing your plotting window to make it bigger # now run only plotting code block 1 and examine the plot ##### plotting code block 1 begins on this line ############################# n.histogram.matrix <- 5 par( mfrow = c( n.histogram.matrix, n.histogram.matrix ) ) for ( i in 1:n.histogram.matrix^2 ) { hist( raw.data.modeling[ , i + 2 ], main = '', prob = T, xlab = names( raw.data )[ i + 2 ] ) } par( mfrow = c( 1, 1 ) ) ##### plotting code block 1 ends on this line ############################### # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # after examining the first set of plots, run plotting code block 2 ##### plotting code block 2 begins on this line ############################# par( mfrow = c( n.histogram.matrix, n.histogram.matrix ) ) for ( i in ( n.histogram.matrix^2 + 1 ):( 2 * n.histogram.matrix^2 ) ) { hist( raw.data.modeling[ , i + 2 ], main = '', prob = T, xlab = names( raw.data )[ i + 2 ] ) } par( mfrow = c( 1, 1 ) ) ##### plotting code block 2 ends on this line ############################## # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # and after examining the second set of plots, run plotting code block 3 ##### plotting code block 3 begins on this line ############################ # run this plotting code block twice: the first time as is, # the second time removing the # comment character at the beginning # of the pdf( ... 
) and dev.off( ) lines, to save this plot to include
# in your solutions

# pdf( 'stat-206-take-home-test-3-solutions-figure-for-2(B)(d).pdf' )

par( mfrow = c( 3, 3 ) )

for ( i in ( 2 * n.histogram.matrix^2 + 1 ):( ncol( raw.data ) - 2 ) ) {

  hist( raw.data.modeling[ , i + 2 ], main = '', prob = T,
    xlab = names( raw.data )[ i + 2 ] )

}

par( mfrow = c( 1, 1 ) )

# dev.off( )

##### plotting code block 3 ends on this line #############################

# please stop here, ensure that everything so far has been
# completed successfully, and study the most recent results

################## the second block of code ends here #####################

################## the third block of code begins here ####################

# this code block computes the correlations between the outcome
# variable 'spam' and all of the predictors and sorts them from largest
# to smallest in absolute value before printing them, to show us
# which predictors are most strongly correlated (univariately) with 'spam'

correlation.vector <- rep( NA, k )

for ( i in 1:k ) {

  correlation.vector[ i ] <- cor( raw.data.modeling$spam,
    raw.data.modeling[ , i + 2 ] )

}

correlation.table <- cbind( names( raw.data.modeling[ , 3:( k + 2 ) ] ),
  signif( correlation.vector, 4 ) )

correlation.table[ order( abs( correlation.vector ), decreasing = T ), ]

#       [,1]         [,2]
#  [1,] "your"       "0.4012"
#  [2,] "remove"     "0.3421"
#  [3,] "free"       "0.3381"
#  [4,] "X000"       "0.3369"
#  [5,] "ch..4"      "0.3271"
#  [6,] "you"        "0.2778"
#  [7,] "business"   "0.2724"
#  [8,] "hp"         "-0.2646"
#  [9,] "receive"    "0.2467"
# [10,] "over"       "0.2427"
#
#   .
#   .
#   .
#
# [48,] "ch..2"      "-0.06692"
# [49,] "ch..5"      "0.06583"
# [50,] "report"     "0.0648"
# [51,] "direct"     "-0.06262"
# [52,] "X3d"        "0.05758"
# [53,] "ch."        "-0.05623"
# [54,] "table"      "-0.04498"
# [55,] "address"    "-0.04102"
# [56,] "parts"      "-0.03357"
# [57,] "will"       "0.007804"

################## the third block of code ends here ######################

# please stop here, ensure that everything so far has been
# completed successfully, and study the most recent results

with( raw.data.modeling, plot( your, spam ) )

with( raw.data.modeling, lines( lowess( your, spam ), lty = 2, lwd = 3,
  col = 'maroon' ) )

with( raw.data.modeling, plot( will, spam ) )

with( raw.data.modeling, lines( lowess( will, spam ), lty = 2, lwd = 3,
  col = 'maroon' ) )

################## the fourth block of code begins here ###################

##### run this first sub-block, stop and examine the output ###############

# long-right-hand-tailed variables that take on only non-negative values
# can sometimes be transformed to Normality by taking logs: if the mode
# of such a variable is positive, the log transform may well work; if the
# mode is 0, the transformed variable will have a spike in the left tail,
# with the rest of the transformed values perhaps looking Normal

# this is an instance of method (I) in the problem statement
# for finding extra predictive signal

# let's look at the variable 'make'

with( raw.data.modeling, hist( make, breaks = 100, prob = T, main = '' ) )

with( raw.data.modeling, sum( make == 0 ) / n.modeling )

# [1] 0.7641109

# this shows that the mode of 'make' is 0

# if we were to try to compute log( make ), 76.4% of the values
# would be ( - infinity )

with( raw.data.modeling, min( make[ make > 0 ] ) )

# [1] 0.01

# a standard thing to try with a variable such as 'make'
# is the transformation
#   x -> log( x + C ),
# where C is the smallest positive value x takes on

log.make.modeling <- log( raw.data.modeling$make + 0.01 )

hist(
log.make.modeling, breaks = 100, prob = T, main = '', xlab = 'log( make + 0.01 )' ) par( mfrow = c( 2, 1 ) ) with( raw.data.modeling, hist( make[ make > 0 ], breaks = 100, prob = T, main = '' ) ) with( raw.data.modeling, hist( log( make[ make > 0 ] ), breaks = 20, prob = T, main = '' ) ) par( mfrow = c( 1, 1 ) ) # we see the anticipated shape # a good bit of the predictive signal from such a variable x # can be extracted by (a) making an indicator variable that's 1 # when x is 0 and (b) predicting y from both log( x + C ) and # the indicator make.is.0.modeling <- ifelse( raw.data.modeling$make == 0, 1, 0 ) tab.sum( make.is.0.modeling, raw.data.modeling$spam ) # make.is.0.modeling n mean sd # [1,] 0 723 0.593361 0.4915464 # [2,] 1 2342 0.3368915 0.4727484 # [3,] "Total" 3065 0.3973899 0.4894378 # you can see that the indicator variable for ( make = 0 ) # by itself has a lot of predictive signal # here are the results of a logistic regression of 'spam' # just on 'make' (note that it's always good form to store # the results of the regression fitting in an object): logistic.regression.of.spam.on.make <- glm( spam ~ make, family = binomial( link = 'logit' ), data = raw.data.modeling ) summary( logistic.regression.of.spam.on.make ) # Call: # glm(formula = raw.data.modeling$spam ~ raw.data.modeling$make, # family = binomial(link = 'logit')) # Deviance Residuals: # Min 1Q Median 3Q Max # -2.4152 -0.9729 -0.9729 1.3967 1.3967 # Coefficients: # Estimate Std. Error z value Pr(>|z|) # (Intercept) -0.50213 0.03967 -12.66 < 2e-16 *** # raw.data.modeling$make 0.77487 0.13201 5.87 4.36e-09 *** # --- # Signif. codes: 0 ?***? 0.001 ?**? 0.01 ?*? 0.05 ?.? 0.1 ? ? 1 # (Dispersion parameter for binomial family taken to be 1) # Null deviance: 4119 on 3064 degrees of freedom # Residual deviance: 4078 on 3063 degrees of freedom # AIC: 4082 # Number of Fisher Scoring iterations: 4 # there are logistic regression situations in which maximum-likelihood (ML) # fitting fails, because the log-likelihood function goes to + infinity # when one or more of the estimated beta coefficients tends to # infinity in absolute value; this may occur when some of the p.hat # values from the ML fit are identically 1 or 0 (or close enough to 1 or 0 # to be essentially 1 or 0 in practice), because # logit( 1 ) = + infinity and logit( 0 ) = - infinity # glm tries to cope with this by enforcing a condition of the form # - boundary < beta.hat < + boundary # where 'boundary' is a value that represents 'essentially infinite' # this means that we always need to follow a call to glm by # asking it two questions: do you think that the Fisher scoring algorithm # converged? and did it converge to a vector of beta.hat values # in which one or more of the beta.hats touched the boundary? # having previously stored the results of the glm fitting in an object, # (here 'logistic.regression.of.spam.on.make'), this can be done # with the following command: print( paste( 'converged?', logistic.regression.of.spam.on.make$converged, 'boundary?', logistic.regression.of.spam.on.make$boundary ) ) # [1] "converged? TRUE boundary? 
FALSE" # this means that everything is ok: convergence occurred, # and not to the boundary ############# the first sub-block ends here ################################ # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # when you've absorbed the output of the first sub-block, ##### run this second sub-block, stop and examine the output ############### # and here are the results of a logistic regression of 'spam' # on log( 'make' + 0.01 ) and 'make.is.0': logistic.regression.of.spam.on.log.make.and.make.is.0 <- glm( spam ~ log.make.modeling + make.is.0.modeling, family = binomial( link = 'logit' ), data = raw.data.modeling ) summary( logistic.regression.of.spam.on.log.make.and.make.is.0 ) # Call: # glm(formula = raw.data.modeling$spam ~ log.make.modeling + # make.is.0.modeling, family = binomial(link = 'logit')) # Deviance Residuals: # Min 1Q Median 3Q Max # -1.5085 -0.9064 -0.9064 1.4751 1.4751 # Coefficients: # Estimate Std. Error z value Pr(>|z|) # (Intercept) 0.54374 0.12340 4.406 1.05e-05 *** # log.make.modeling 0.14127 0.08209 1.721 0.0853 . # make.is.0.modeling -0.57034 0.29408 -1.939 0.0524 . # --- # Signif. codes: 0 ?***? 0.001 ?**? 0.01 ?*? 0.05 ?.? 0.1 ? ? 1 # (Dispersion parameter for binomial family taken to be 1) # Null deviance: 4119.0 on 3064 degrees of freedom # Residual deviance: 3966.8 on 3062 degrees of freedom # AIC: 3972.8 # Number of Fisher Scoring iterations: 4 print( paste( 'converged?', logistic.regression.of.spam.on.log.make.and.make.is.0$converged, 'boundary?', logistic.regression.of.spam.on.log.make.and.make.is.0$boundary ) ) # [1] "converged? TRUE boundary? FALSE" ################## the second sub-block ends here ########################### ################## the fourth block of code ends here ####################### # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ################## the fifth block of code begins here ###################### # a different method for extracting more predictive signal # that often works is to transform a continuous variable # such as 'make' into a set of categorical indicators, # by cutting it at (for example) its deciles (10th percentile, # 20th percentile, ..., 100th percentile) # this permits non-linear relationships between the predictor # and outcome to be explored without having to specify # the nature of the non-linearity: that makes it a # *nonparametric* way to look for extra signal # this is an instance of method (II) in the problem statement # for finding extra predictive signal # i've written a function called # 'logistic.regression.univariate.exploration' # that does this; i included it in the file 'spam-analysis-functions-R.txt' # that you 'source'd at the beginning of this analysis # the syntax of this function is as follows: # logistic.regression.univariate.exploration( x, y, name.x, # n.bootstrap.replicates, n.cutpoints, plot.pdf ) # 'x' is the predictor; 'y' is the binary outcome; name.x # is a character string giving the name of the variable x, # 'n.bootstrap.replicates' is the desired number of lowess bootstrap curves # in the plot; 'n.cutpoints' is the desired number of categories # defined by the quantiles of 'x' (if a single value of 'x' occurs with # high frequency, you will get fewer categories; for example, 'make' # is exactly 0 76% of the time, so when i ask for 10 groups cut at the # deciles of 'make' only 3 groups are defined: (0th to 80th percentile), # (80th to 
90th), (90th to 100th); and you say 'plot.pdf = T' if # you want to make a PDF copy of the plot generated by this command # (this will be stored in your current working directory) and # 'plot.pdf = F if you don't want the PDF copy made # a reminder: *lowess* (locally weighted scatterplot smoother) # is a nonparametric way to explore the relationship between a single # real-valued predictor x and a single real-valued outcome y; # it does this by estimating the conditional expectation of y given x # in a number of overlapping vertical strips in the scatterplot # and then smoothing the resulting estimates # now run only exploration code block 1 and examine the output ##### exploration code block 1 begins on this line ######################## # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # the next command examines the predictor 'make' with 10 requested # quantile cuts with( raw.data.modeling, logistic.regression.univariate.exploration( make, spam, 'make', 100, 10, plot.pdf = F ) ) ##### exploration code block 1 ends on this line ######################## # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # after studying the first set of outputs, run exploration code block 2 ##### exploration code block 2 begins on this line ######################## # the next command examines 'make' with 20 requested quantile cuts with( raw.data.modeling, logistic.regression.univariate.exploration( make, spam, 'make', 100, 20, plot.pdf = F ) ) ##### exploration code block 2 ends on this line ######################## # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # after studying the second set of outputs, run exploration code block 3 ##### exploration code block 3 begins on this line ######################## # the next command examines the predictor 'your' with 10 requested # quantile cuts, and saves the plot in your current working directory with( raw.data.modeling, logistic.regression.univariate.exploration( your, spam, 'your', 100, 10, plot.pdf = T ) ) # include the resulting plot 'your.pdf' in your solutions ##### exploration code block 3 ends on this line ######################## ################## the fifth block of code ends here ####################### # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # at this point, if we had enough time, it would be good data-science # practice to transform all of the predictors to extract maximum # predictive signal before fitting a version of the full model # with all predictors included # but instead we're now just going to fit the full model without taking # advantage of the extra signal from transformations, because something # happens that's worth knowing about when fitting GLMs to binary outcomes # a reminder: this is not cheating -- in the 2-way cross-validation # method we're using here, we're allowed to mess around any way # we want in the modeling data, as long as the result validates # in the validation data (if it doesn't ...) 
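# before moving on, an optional aside on the fifth block: if you're curious
# how the quantile-cut idea works under the hood, here is a minimal sketch;
# the function 'quantile.cut.exploration' below is a made-up name used purely
# for illustration -- it is *not* the
# 'logistic.regression.univariate.exploration' function you sourced from
# 'spam-analysis-functions-R.txt', and it omits that function's bootstrapped
# lowess curves and plotting

quantile.cut.exploration <- function( x, y, n.cutpoints = 10 ) {

  # cut x at its empirical quantiles; duplicated quantiles (e.g., when x
  # is very often exactly 0) are collapsed, so you may get fewer groups
  # than you asked for

  breaks <- unique( quantile( x,
    probs = seq( 0, 1, length.out = n.cutpoints + 1 ) ) )

  group <- cut( x, breaks = breaks, include.lowest = TRUE )

  # the proportion of ( y = 1 ) in each group is a nonparametric look
  # at P( y = 1 | x )

  print( tapply( y, group, mean ) )

  # a logistic regression on the group indicators explores the x-y
  # relationship without assuming linearity on the logit scale

  summary( glm( y ~ group, family = binomial( link = 'logit' ) ) )

}

# example call (commented out; run it only if you want to compare with
# the output of the course function):

# with( raw.data.modeling, quantile.cut.exploration( make, spam, 10 ) )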
################## the sixth block of code begins here ###################### # the next command (a) fits a maximum likelihood logistic regression # model to the modeling subset, with 'spam' as the outcome and all # of the predictors standardized, and (b) prints a summary # of the results: # the output of this command is sufficiently lengthy that i've put it # in a file called 'spam-data-analysis-glm-results.txt' on the # course web page system.time( print( summary( full.model.all.x.scaled.modeling <- glm( spam ~ ., data = scaled.data.modeling, family = binomial( link = 'logit' ) ) ) ) ) # Call: # glm(formula = spam ~ ., family = binomial(link = "logit"), # data = scaled.data.modeling) # Deviance Residuals: # Min 1Q Median 3Q Max # -4.4162 -0.1754 0.0000 0.0959 3.6230 # Coefficients: # Estimate Std. Error z value Pr(>|z|) # (Intercept) -18.98938 2.84602 -6.672 2.52e-11 *** # make -0.05356 0.07873 -0.680 0.496312 # address -0.18425 0.10554 -1.746 0.080860 . # all 0.11758 0.07840 1.500 0.133705 # X3d 3.38348 2.63833 1.282 0.199690 # our 0.26995 0.07383 3.656 0.000256 *** # . # . # . # ch..4 1.65143 0.24410 6.765 1.33e-11 *** # ch..5 0.93880 0.64286 1.460 0.144193 # crl.ave 1.17019 0.93301 1.254 0.209766 # crl.long 2.40336 0.70537 3.407 0.000656 *** # crl.tot 0.35077 0.17446 2.011 0.044367 * # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # (Dispersion parameter for binomial family taken to be 1) # Null deviance: 4119 on 3064 degrees of freedom # Residual deviance: 1125 on 3007 degrees of freedom # AIC: 1241 # Number of Fisher Scoring iterations: 13 # user system elapsed # 0.25 1.05 1.29 # on my desktop at home, even with 3065 observations and 57 predictors, # the iterative maximum-likelihood fitting only takes about 1 second # (a good resource for understanding the iterative fitting algorithm # used by R in logistic regression is here) # http://hua-zhou.github.io/teaching/biostatm280-2017spring/slides/18-newton/newton.html # Warning message: # glm.fit: fitted probabilities numerically 0 or 1 occurred # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # the warning message above may be telling us that the maximum-likelihood # fitting process has bumped up against the boundary for one or more # of the beta.hats; let's find out print( paste( 'converged?', full.model.all.x.scaled.modeling$converged, 'boundary?', full.model.all.x.scaled.modeling$boundary ) ) # [1] "converged? TRUE boundary? FALSE" # so we don't have a boundary problem # however, after standardization of the predictors, any regression # coefficient bigger than about 3 in absolute value is really big # let's see why this is true # here's a simple function that computes the marginal effect of adding # a new binary predictor variable x with logistic regression coefficient # beta.x to the prediction process: logistic.regression.effect.of.adding.a.new.variable <- function( p.hat.old, beta.x ) { l.old <- log( p.hat.old / ( 1 - p.hat.old ) ) l.new <- l.old + beta.x p.hat.new <- 1 / ( 1 + exp( - l.new ) ) return( cbind( p.hat.old, p.hat.new ) ) } p.hat.old <- c( 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99 ) # if beta.x = 0, of course the old and new p.hat values are the same: logistic.regression.effect.of.adding.a.new.variable( p.hat.old, -3 ) # try calling this function (a) with beta.x = -3 # and (b) with beta.x = +3 # would you say that the p.hat values change dramatically with # beta.x = +/- 3? 
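# optional aside: as a worked example of the point above, try the call
# below (the numbers quoted in the comment are approximate values computed
# by hand from the inverse-logit formula, just to show the size of the effect):

logistic.regression.effect.of.adding.a.new.variable( p.hat.old, 3 )

# with beta.x = +3, an old p.hat of 0.5 (logit 0) moves to
# 1 / ( 1 + exp( - 3 ) ), which is about 0.95, and an old p.hat of 0.25
# moves to about 0.87; beta.x = -3 does the same thing in the opposite
# direction, so a standardized coefficient of about 3 in absolute value
# shifts the predicted probabilities a great deal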
# looking at the output above, you can see that we *do* have several # predictors with massive beta.hats after standardization, but some of # these predictors (which ones?) also have massive beta.hat # standard errors; bayesian logistic regression with a # *regularizing prior* will help to stabilize maximum-likelihood # estimation here # as we've discussed in class, maximum likelihood estimates can be # biased; with n as the number of observations (rows) and k as the # number of predictors in the data set, if k is large relative to n, # maximum-likelihood estimates of logistic regression coefficients # can be seriously biased away from 0 (i.e., if the population beta is # (e.g.) 5 the MLE might come out 7.5, or if beta = -5 the MLE # could easily be -6.9) # switching from a maximum likelihood analysis to a bayesian analysis # with a prior distribution on the beta coefficients that's centered # at 0 for each coefficient can de-bias the resulting estimates # of the betas; such priors are called *regularizing* because # the posterior modes of the beta coefficients -- these are called the # maximum a posteriori (MAP) estimates -- # are arrived at by essentially grabbing the MLEs and shrinking them # toward 0 in a way that makes their behavior 'more regular' # (i.e., closer to unbiased) ################## the sixth block of code ends here ###################### # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ################## the seventh block of code begins here ################## # first let's see how good the maximum likelihood logistic regression # fitting thinks it's doing *internal* to the modeling data set, # by looking at its predicted P ( spam | X ... B ) values on # the modeling subset p.hat.full.model.all.x.scaled.modeling <- predict( full.model.all.x.scaled.modeling, type = 'response' ) par( mfrow = c( 2, 1 ) ) hist( p.hat.full.model.all.x.scaled.modeling[ scaled.data.modeling$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.full.model.all.x.scaled.modeling[ scaled.data.modeling$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) # the plot shows a decent degree of separation between the # ( p.hat values when spam = 1 ) and the ( p.hat values when spam = 0 ) c( mean( p.hat.full.model.all.x.scaled.modeling[ scaled.data.modeling$spam == 0 ] ), mean( p.hat.full.model.all.x.scaled.modeling[ scaled.data.modeling$spam == 1 ] ), mean( p.hat.full.model.all.x.scaled.modeling[ scaled.data.modeling$spam == 1 ] ) - mean( p.hat.full.model.all.x.scaled.modeling[ scaled.data.modeling$spam == 0 ] ) ) # PSI # [1] 0.09029248 0.86307864 0.77278616 # now let's see how the maximum likelihood logistic regression model # *fit to the modeling subset* does in making predictions # in the *validation subset* scaled.data.validation.X <- subset( scaled.data.validation, select = - spam ) p.hat.full.model.all.x.scaled.validation <- predict( full.model.all.x.scaled.modeling, type = 'response', newdata = scaled.data.validation.X ) c( mean( p.hat.full.model.all.x.scaled.validation[ scaled.data.validation$spam == 0 ] ), mean( p.hat.full.model.all.x.scaled.validation[ scaled.data.validation$spam == 1 ] ), mean( p.hat.full.model.all.x.scaled.validation[ scaled.data.validation$spam == 1 ] ) - mean( p.hat.full.model.all.x.scaled.validation[ scaled.data.validation$spam == 0 ] ) ) # PSI # [1] 0.09941087 0.85350183 0.75409096 ( 0.77278616 - 
0.75409096 ) / 0.75409096 # [1] 0.0247917 ################## the seventh block of code ends here ################## # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ################## the eighth block of code begins here ################## # now let's fit a bayesian logistic regression model to the # modeling data, using an excellent regularization option called the # *horseshoe* prior # this is a carefully-constructed prior for the beta coefficients # that puts most of its weight near 0 but that has extremely long tails # there's a good MCMC CRAN package for doing this fitting # called 'bayesreg' dynamic.require( 'bayesreg' ) scaled.data.modeling.spam.factor <- scaled.data.modeling scaled.data.modeling.spam.factor$spam <- as.factor( scaled.data.modeling$spam ) # in their documentation, the 'bayesreg' package developers unhelpfully # forgot to mention what they use for starting values, and moreover # you can't change their choice; let's assume that a burn-in of # 1000 iterations (their default choice) is (more than) enough # to reach equilibrium # additionally, they have alpha hardwired at 0.05 in reporting # 100 ( 1 - alpha )% credible intervals for the regression coefficients # and other interesting quantities; the output of their function call # is a list of 32 objects, from which we could extract the MCMC data set # and build our own 99.9% intervals, but i haven't supplied the code # for that here # in the next function call, you may need to adjust n.samples (the number # of monitoring iterations) down from 100000, depending on how fast # your computer is and how long you want to wait for the results; # please don't go below n.samples = 10000 # the 'bayesreg' function unfortunately only uses a single thread # on whatever machine it's run on # on my desktop at home, in which the threads run at 3.40GHz, # 101000 (burn-in + monitoring) iterations took 20-30 minutes, # depending on system load outside R n.samples <- 100000 system.time( bayesian.logistic.regression.modeling.subset <- bayesreg( spam ~ ., data = scaled.data.modeling.spam.factor, model = 'logistic', prior = 'horseshoe', n.samples = n.samples, burnin = 1000, thin = 1 ) ) summary( bayesian.logistic.regression.modeling.subset ) # the full output from this command is lengthy; i've put it into a file # called 'spam-data-analysis-bayesreg-results.txt' on the course # web page; here's a partial summary # ========================================================================================== # | Bayesreg: Bayesian Penalised Regression Estimation ver. 1.1 | # | (c) Daniel F Schmidt, Enes Makalic. 2016-9 | # ========================================================================================== # Bayesian logistic horseshoe regression Number of obs = 3065 # Number of vars = 57 # MCMC Samples = 100000 Log. Likelihood = -566.35 # MCMC Burnin = 1000 Pseudo R2 = 0.7250 # MCMC Thinning = 1 WAIC = 626.02 # -------------+---------------------------------------------------------------------------- # Parameter | med(Coef) std(OR) [95% Cred. Interval] tStat Rank ESS % # -------------+---------------------------------------------------------------------------- # make | -0.03533 0.06490 -0.19245 0.06379 -0.700 51 * 21.6 # . # . # . # crl.tot | 0.23470 0.14389 0.00770 0.56465 1.722 29 ** 2.1 # _cons | -16.61868 2.09895 -20.08681 -11.64032 . . . 
# -------------+---------------------------------------------------------------------------- # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # let's compare the maximum likelihood and bayes coefficients, # and make a computation that's relevant to how well the markov chain # is mixing: full.model.all.x.scaled.modeling.ess <- floor( bayesian.logistic.regression.modeling.subset$ess.frac * n.samples ) # the ESS (effective sample size) for a beta coefficient is an estimate of # the number of IID samples that would have yielded # the same MCMC accuracy in estimating that beta coefficient # as the MCMC sample from a monitoring run of length 'n.samples' options( scipen = 100 ) cbind( full.model.all.x.scaled.modeling$coefficients[ 2:( k + 1 ) ], bayesian.logistic.regression.modeling.subset$mu.beta, abs( bayesian.logistic.regression.modeling.subset$mu.beta ) < abs( full.model.all.x.scaled.modeling$coefficients[ 2:( k + 1 ) ] ), full.model.all.x.scaled.modeling.ess ) # note that the ESS values below are from an MCMC monitoring run # of length 100,000; if you used a shorter monitoring run, e.g., # of length 10,000, your ESS values will be 10 times smaller # than those in the table below # is bayes closer # MLE bayes to 0 than MLE? ESS # make -0.05355924 -0.0454105038 1 21561 # address -0.18424821 -0.1766225067 1 8988 # all 0.11757740 0.0860558929 1 17865 # . # . # . # crl.ave 1.17018695 0.3113693163 1 437 # crl.long 2.40335702 2.6548719278 0 2145 # crl.tot 0.35076674 0.2478465043 1 2117 options( scipen = 0 ) sum( abs( bayesian.logistic.regression.modeling.subset$mu.beta ) < abs( full.model.all.x.scaled.modeling$coefficients[ 2:( k + 1 ) ] ) ) # [1] 45 sum( abs( bayesian.logistic.regression.modeling.subset$mu.beta ) < abs( full.model.all.x.scaled.modeling$coefficients[ 2:( k + 1 ) ] ) ) / k # [1] 0.7894737 # let's see how accurate the bayesian p.hat values are # *internal* to the modeling data set (i.e., using the modeling subset # to predict the y values in the modeling subset) scaled.data.modeling.X <- subset( scaled.data.modeling, select = - spam ) p.hat.bayesian.logistic.regression.modeling.subset <- predict( bayesian.logistic.regression.modeling.subset, newdata = scaled.data.modeling.X, type = 'prob' ) par( mfrow = c( 2, 1 ) ) hist( p.hat.bayesian.logistic.regression.modeling.subset[ scaled.data.modeling$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.bayesian.logistic.regression.modeling.subset[ scaled.data.modeling$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.bayesian.logistic.regression.modeling.subset[ scaled.data.modeling$spam == 0 ] ), mean( p.hat.bayesian.logistic.regression.modeling.subset[ scaled.data.modeling$spam == 1 ] ), mean( p.hat.bayesian.logistic.regression.modeling.subset[ scaled.data.modeling$spam == 1 ] ) - mean( p.hat.bayesian.logistic.regression.modeling.subset[ scaled.data.modeling$spam == 0 ] ) ) # PSI # [1] 0.09061353 0.85969332 0.76907979 # now let's see how well the bayesian model, fit to the modeling # subset, does in predicting the spam status of the email messages # in the validation subset: p.hat.bayesian.logistic.regression.validation.subset <- predict( bayesian.logistic.regression.modeling.subset, newdata = scaled.data.validation.X, type = 'prob' ) par( mfrow = c( 2, 1 ) ) hist( p.hat.bayesian.logistic.regression.validation.subset[ scaled.data.validation$spam == 
0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.bayesian.logistic.regression.validation.subset[ scaled.data.validation$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.bayesian.logistic.regression.validation.subset[ scaled.data.validation$spam == 0 ] ), mean( p.hat.bayesian.logistic.regression.validation.subset[ scaled.data.validation$spam == 1 ] ), mean( p.hat.bayesian.logistic.regression.validation.subset[ scaled.data.validation$spam == 1 ] ) - mean( p.hat.bayesian.logistic.regression.validation.subset[ scaled.data.validation$spam == 0 ] ) ) # PSI # [1] 0.09834665 0.85003499 0.75168834 # one of the advantages of bayesian prediction of dichotomous outcomes # is that we get posterior distributions for each of the p.hat values, # so that we can look at the amount of uncertainty in the prediction # process system.time( p.hat.bayesian.modeling.intervals <- predict( bayesian.logistic.regression.modeling.subset, newdata = scaled.data.modeling.X, type = 'prob', bayes.avg = T ) ) # user system elapsed # 64.35 4.47 68.86 # here are the results for the first 10 email messages # in the modeling subset cbind( p.hat.bayesian.modeling.intervals[ 1:10, ], scaled.data.modeling$spam[ 1:10 ] ) # pred se(pred) CI 2.5% CI 97.5% # [1,] 0.9886944 0.004456867 0.9780111 0.9952436 1 # [2,] 0.8229983 0.039932794 0.7367634 0.8918002 1 # [3,] 0.8230338 0.039918916 0.7368092 0.8918023 1 # [4,] 0.6431008 0.099470669 0.4448321 0.8288990 1 # [5,] 0.7456441 0.077796560 0.5668872 0.8703213 1 # [6,] 0.6380242 0.101359240 0.4367421 0.8279391 1 # [7,] 0.8155320 0.089943487 0.5973087 0.9430495 1 # [8,] 0.5014264 0.042634727 0.4192702 0.5862896 1 # [9,] 0.9806469 0.008753362 0.9593688 0.9928564 1 # [10,] 0.5369823 0.051884474 0.4349436 0.6384813 1 # and here are the results for the last 10 email messages # in the modeling subset cbind( p.hat.bayesian.modeling.intervals[ ( n.modeling - 9 ):n.modeling, ], scaled.data.modeling$spam[ ( n.modeling - 9 ):n.modeling ] ) # pred se(pred) CI 2.5% CI 97.5% # [1,] 1.853997e-04 0.0003784888 2.414882e-06 1.088104e-03 0 # [2,] 1.980770e-02 0.0144276265 2.295455e-03 5.673050e-02 0 # [3,] 1.658516e-03 0.0030508096 1.769964e-05 9.659285e-03 0 # [4,] 4.889281e-06 0.0001053029 2.525120e-11 3.136042e-05 0 # [5,] 1.459612e-01 0.0411230216 7.610701e-02 2.357302e-01 0 # [6,] 5.591704e-02 0.0178632481 2.638937e-02 9.565168e-02 0 # [7,] 2.119996e-02 0.0191095306 1.932728e-03 7.280270e-02 0 # [8,] 1.687736e-02 0.0103388278 3.750901e-03 4.273397e-02 0 # [9,] 8.308764e-02 0.0253543768 3.910164e-02 1.380537e-01 0 # [10,] 2.892517e-02 0.0131386881 1.016472e-02 6.062356e-02 0 ################## the eighth block of code ends here #################### # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ################## the ninth block of code begins here ################### # let's see how much better the predictions are (or at least # how much better they *seem* to be) when we include not only # the intercept and all k = 57 predictors but also all ( k choose 2 ) # *two-way interactions* among the predictors # this is part I of a highly successful current modeling strategy: # (I) flood the model with predictors 1 + k + choose( k, 2 ) # [1] 1654 # this model has a total of 1654 predictors; let's fit it to the # modeling data (3065 email messages) system.time( full.model.all.x.scaled.all.interactions.modeling <- glm( spam ~ 
.^2, data = scaled.data.modeling, family = binomial( link = 'logit' ) ) ) # user system elapsed # 123.89 0.18 124.33 # Warning message: # glm.fit: fitted probabilities numerically 0 or 1 occurred print( paste( 'converged?', full.model.all.x.scaled.all.interactions.modeling$converged, 'boundary?', full.model.all.x.scaled.all.interactions.modeling$boundary ) ) # [1] "converged? TRUE boundary? FALSE" summary( full.model.all.x.scaled.all.interactions.modeling$coefficients ) # Min. 1st Qu. Median Mean 3rd Qu. Max. NA's # -3.748e+28 -7.171e+14 -2.486e+13 9.013e+24 6.399e+14 3.801e+28 98 # has logistic regression gone crazy? # here's a summary of the maximum likelihood beta.hat values # from the main-effects-only fit: summary( full.model.all.x.scaled.modeling$coefficients ) # Min. 1st Qu. Median Mean 3rd Qu. Max. # -68.84359 -0.34494 0.00067 -1.79510 0.30611 3.38348 # one way to see if logistic regression has gone crazy is to fit # an ols multiple linear regression model to the same data set and see # if its coefficients are also crazy-looking system.time( ols.full.model.all.x.scaled.all.interactions.modeling <- lm( spam ~ .^2, data = scaled.data.modeling ) ) # user system elapsed # 6.94 0.03 6.97 summary( ols.full.model.all.x.scaled.all.interactions.modeling$coefficients ) # Min. 1st Qu. Median Mean 3rd Qu. Max. NA's # -1.104e+12 0.000e+00 0.000e+00 1.651e+09 0.000e+00 3.655e+12 99 # these also look crazy # but let's look at the signal-to-noise ratios for the coefficients se.hat.beta.hat.ols.full.model.all.x.scaled.all.interactions.modeling <- sqrt( diag( vcov( ols.full.model.all.x.scaled.all.interactions.modeling ) ) ) z.ols.full.model.all.x.scaled.all.interactions.modeling <- ols.full.model.all.x.scaled.all.interactions.modeling$coefficients / se.hat.beta.hat.ols.full.model.all.x.scaled.all.interactions.modeling summary( z.ols.full.model.all.x.scaled.all.interactions.modeling ) # Min. 1st Qu. Median Mean 3rd Qu. Max. 
NA's # -5.80125 -0.69504 -0.05328 -0.06482 0.57694 4.85481 99 # these don't seem crazy at all # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # now let's look at the quality of the p.hat values, # from the logistic regression model fit to the *modeling* subset, applied to # predict the *modeling* subset p.hat.full.model.all.interactions.scaled.modeling <- predict( full.model.all.x.scaled.all.interactions.modeling, type = 'response' ) par( mfrow = c( 2, 1 ) ) hist( p.hat.full.model.all.interactions.scaled.modeling[ scaled.data.modeling$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.full.model.all.interactions.scaled.modeling[ scaled.data.modeling$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.full.model.all.interactions.scaled.modeling[ scaled.data.modeling$spam == 0 ] ), mean( p.hat.full.model.all.interactions.scaled.modeling[ scaled.data.modeling$spam == 1 ] ), mean( p.hat.full.model.all.interactions.scaled.modeling[ scaled.data.modeling$spam == 1 ] ) - mean( p.hat.full.model.all.interactions.scaled.modeling[ scaled.data.modeling$spam == 0 ] ) ) # PSI # [1] 0.01407688 0.98357964 0.96950276 # and now let's look at the quality of the p.hat values, # from the model fit to the *modeling* subset, applied to # predict the *validation* subset p.hat.full.model.all.interactions.scaled.validation <- predict( full.model.all.x.scaled.all.interactions.modeling, newdata = scaled.data.validation.X, type = 'response' ) # Warning message: # In predict.lm(object, newdata, se.fit, scale = 1, type = if (type == : # prediction from a rank-deficient fit may be misleading par( mfrow = c( 2, 1 ) ) hist( p.hat.full.model.all.interactions.scaled.validation[ scaled.data.validation$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.full.model.all.interactions.scaled.validation[ scaled.data.validation$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.full.model.all.interactions.scaled.validation[ scaled.data.validation$spam == 0 ] ), mean( p.hat.full.model.all.interactions.scaled.validation[ scaled.data.validation$spam == 1 ] ), mean( p.hat.full.model.all.interactions.scaled.validation[ scaled.data.validation$spam == 1 ] ) - mean( p.hat.full.model.all.interactions.scaled.validation[ scaled.data.validation$spam == 0 ] ) ) # PSI # [1] 0.2337938 0.8184874 0.5846936 # what do you conclude about the extent of the over-fitting # with the all-main-effects-and-all-2-way-interactions model? # how do you feel about part I of the modeling strategy described above # (flood the model with predictors) if we don't follow it with # some sort of part II? 
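# optional aside: the three-number summary we keep computing -- the mean
# p.hat among the ham ( y = 0 ) messages, the mean p.hat among the spam
# ( y = 1 ) messages, and their difference (labeled PSI in the output
# comments) -- can be wrapped in a small helper; the name 'psi.summary'
# below is invented here for convenience and is not part of the course code

psi.summary <- function( p.hat, y ) {

  mean.p.hat.ham  <- mean( p.hat[ y == 0 ] )
  mean.p.hat.spam <- mean( p.hat[ y == 1 ] )

  c( mean.p.hat.ham, mean.p.hat.spam, mean.p.hat.spam - mean.p.hat.ham )

}

# for example, the modeling-subset and validation-subset summaries above
# could be reproduced with

# psi.summary( p.hat.full.model.all.interactions.scaled.modeling,
#   scaled.data.modeling$spam )
# psi.summary( p.hat.full.model.all.interactions.scaled.validation,
#   scaled.data.validation$spam )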
##################### the ninth block of code ends here ####################### # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ##################### the tenth block of code begins here ##################### # a frequently highly successful modeling strategy that's widely used # today is as follows: # (I) flood the model with predictors, and then # (II) regularize it by shrinking most of its coefficients # most or all of the way to 0, in a clever data-determined way # the CRAN package 'glmnet' does this extremely well # and with unbelievable computational efficiency # glmnet makes operational an idea called the *lasso* applied # to generalized linear models (GLMs, with logistic regression as a # special case) # in machine learning data science this would be called # *regularization via penalized maximum likelihood* # in bayesian data science it would be an instance of # bayesian fitting of GLMs with a prior that shrinks the # beta coefficients most or all of the way to 0 when the data evidence # for them belonging in the model is weak # the regularization penalty acts like a log prior term # in the following optimization: # { find the beta vector that minimizes the *penalized deviance* # - 2 * log( posterior ) = - 2 * log( likelihood ) - 2 * log( prior ) } # in other words, from a bayesian viewpoint the resulting beta.hat # values are the MAP (maximum a posteriori) estimates # a really nice tutorial on glmnet is available at # https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html # the main advantage of the lasso is that it does an excellent job of # preventing over-fitting of a model that's been flooded with predictors, # by serving as step (II) in the modeling strategy above # the main disadvantage is that it has many tuning options, which can lead # to markedly better and worse results; it requires patience to explore # all (or even many) of them # first let's see how well glmnet performs on the logistic regression # model that includes only the 57 main effects dynamic.require( 'glmnet' ) scaled.data.modeling.X <- model.matrix( spam ~ ., data = scaled.data.modeling ) spam.modeling <- raw.data$spam[ ! raw.data$testid ] spam.validation <- raw.data$spam[ raw.data$testid ] n.lambda <- 100 system.time( spam.glmnet.modeling <- glmnet( x = scaled.data.modeling.X, y = spam.modeling, family = 'binomial', alpha = 1, nlambda = n.lambda ) ) # user system elapsed # 2.48 0.00 2.56 # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # ok, let's begin examining the results print( spam.glmnet.modeling ) # Call: glmnet( x = scaled.data.modeling.X, y = spam.modeling, # family = 'binomial', alpha = 1, nlambda = n.lambda ) # Df %Dev Lambda # 1 0 0.00 0.196300 # 2 1 2.01 0.178900 # 3 1 3.66 0.163000 # 4 5 7.51 0.148500 # 5 5 11.36 0.135300 # . # . # . 
# 90 57 72.55 0.000050 # 91 57 72.57 0.000045 # 92 57 72.57 0.000041 # the tuning constant 'lambda' in the lasso controls the strength # of the regularization penalty # lambda is optimized when the percentage of the # penalized deviance from a null model with no predictors # 'explained' by the model stops increasing # here i told it to examine n.lambda = 100 values of lambda, # but it stopped after 92 values because the percentage of deviance # 'explained' stopped increasing # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # the next block of code compares the logistic regression coefficients # from three fitting methods used with the model containing all, # and only, main effects final.lambda.index.modeling <- 92 options( scipen = 100 ) cbind( full.model.all.x.scaled.modeling$coefficients[ 2:( k + 1 ) ], bayesian.logistic.regression.modeling.subset$mu.beta, coef( spam.glmnet.modeling, s = spam.glmnet.modeling$lambda[ final.lambda.index.modeling ] )[ 3:59 ] ) # logistic regression coefficients # MLE bayes lasso # make -0.05355924 -0.0453229075 -0.05163405 # address -0.18424821 -0.1778458973 -0.18751606 # all 0.11757740 0.0885063904 0.11427255 # X3d 3.38348273 1.8386863082 2.71297346 # our 0.26995411 0.2648551860 0.27173197 # . # . # . # hp -3.52061215 -3.6082669712 -3.45132980 # hpl -0.29979501 -0.2219481305 -0.32325034 # george -68.84358891 -49.4865995543 -50.77450023 <-- # X650 -0.06749844 -0.0703094172 -0.06806936 # lab -1.45852164 -0.9112353687 -1.29589733 # . # . # . # direct -0.06047450 -0.0125036076 -0.04062539 # cs -15.69455882 -2.8051143512 -4.67886981 <-- # meeting -1.96951438 -1.6506354557 -1.83238320 # . # . # . # crl.long 2.40335702 2.3422198151 2.32867257 # crl.tot 0.35076674 0.2563091373 0.31292510 options( scipen = 0 ) # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # the next function call creates a plot that shows # the coefficient shrinkage as a function of the strength # of the penalty (values to the left arise from the # strongest penalties) plot( spam.glmnet.modeling, lwd = 3 ) # the path of the coefficient for 'george' is shown in green # and starts at about 'coefficients = -50' at the bottom right # of the plot; you can see it get hammered all the way back to 0 # as your eye moves from right to left (i.e., as the penalty # increases) # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # let's see how well the lasso fit to the modeling subset # predicts the validation subset: scaled.data.validation.X <- model.matrix( spam ~ ., data = scaled.data.validation ) p.hat.spam.glmnet.validation <- predict( spam.glmnet.modeling, newx = scaled.data.validation.X, type = 'response', s = spam.glmnet.modeling$lambda[ final.lambda.index.modeling ] ) par( mfrow = c( 2, 1 ) ) hist( p.hat.spam.glmnet.validation[ scaled.data.validation$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.spam.glmnet.validation[ scaled.data.validation$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.spam.glmnet.validation[ scaled.data.validation$spam == 0 ] ), mean( p.hat.spam.glmnet.validation[ scaled.data.validation$spam == 1 ] ), mean( p.hat.spam.glmnet.validation[ scaled.data.validation$spam == 1 ] ) - mean( p.hat.spam.glmnet.validation[ scaled.data.validation$spam == 0 ] 
) )

# summary (all models use all main effects only)

#                                     PSI
# [1] 0.1006029  0.8513315  0.75072860   lasso logistic regression
# [1] 0.09061353 0.85969332 0.76907979   bayesian logistic regression
#                                          with the horseshoe prior
# [1] 0.09941087 0.85350183 0.75409096   ordinary maximum likelihood
#                                          logistic regression

# please stop here, ensure that everything so far has been
# completed successfully, and study the most recent results

# as our final look at the lasso, let's turn it loose
# on the model with all main effects and all 2-way interactions:

scaled.data.modeling.with.interactions.X <-
  model.matrix( spam ~ .^2, data = scaled.data.modeling )

n.lambda <- 100

system.time(
  spam.glmnet.modeling.with.interactions <- glmnet(
    x = scaled.data.modeling.with.interactions.X, y = spam.modeling,
    family = 'binomial', alpha = 1, nlambda = n.lambda )
)

#    user  system elapsed
#   13.36    0.05   13.40

# glmnet just fit a lasso logistic regression with 1654 predictors,
# 3065 subjects, and 100 different lambda values, in 13 seconds
# of clock time (!)

# please stop here, ensure that everything so far has been
# completed successfully, and study the most recent results

print( spam.glmnet.modeling.with.interactions )

# Call:  glmnet( x = scaled.data.modeling.with.interactions.X,
#   y = spam.modeling, family = 'binomial', alpha = 1, nlambda = n.lambda )

#      Df   %Dev   Lambda
# 1     0   0.00 0.196300
# 2     1   2.01 0.178900
# 3     1   3.66 0.163000
# .
# .
# .
# 98  467  96.77 0.000024
# 99  470  96.83 0.000022
# 100 483  96.92 0.000020

# i tried n.lambda = 200 but it found the same solution as with 100

# notice that, at the optimal lambda, the lasso retains only 483 of the
# 1654 predictors and 'explains' 96.9% of the null penalized deviance

# please stop here, ensure that everything so far has been
# completed successfully, and study the most recent results

plot( spam.glmnet.modeling.with.interactions, lwd = 3 )

# this plot is hard to interpret with so many predictors

# and finally for the lasso work, let's see how well the lasso
# fit to the modeling subset predicts the modeling and validation subsets

# this is the ultimate test of whether the over-fitting
# exhibited by vanilla maximum likelihood has been overcome
# by the lasso

# first, how well does the lasso fit to the modeling subset
# predict the modeling subset?
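# (before answering that, a quick optional sanity check on the
#  '1654 predictors' count quoted earlier: 'spam ~ .^2' expands the
#  57 predictors into 57 main-effect columns plus choose( 57, 2 ) = 1596
#  two-way interaction columns, and model.matrix also adds an intercept
#  column, for 57 + 1596 + 1 = 1654 columns in all)

57 + choose( 57, 2 ) + 1

# [1] 1654

dim( scaled.data.modeling.with.interactions.X )

# this should come back as 3065 rows (the modeling subset) by 1654 columns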
scaled.data.modeling.with.interactions.X <- model.matrix( spam ~ .^2, data = scaled.data.modeling ) p.hat.spam.glmnet.modeling.with.interactions <- predict( spam.glmnet.modeling.with.interactions, newx = scaled.data.modeling.with.interactions.X, type = 'response', s = spam.glmnet.modeling.with.interactions$lambda[ 100 ] ) par( mfrow = c( 2, 1 ) ) hist( p.hat.spam.glmnet.modeling.with.interactions[ scaled.data.modeling$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.spam.glmnet.modeling.with.interactions[ scaled.data.modeling$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.spam.glmnet.modeling.with.interactions[ scaled.data.modeling$spam == 0 ] ), mean( p.hat.spam.glmnet.modeling.with.interactions[ scaled.data.modeling$spam == 1 ] ), mean( p.hat.spam.glmnet.modeling.with.interactions[ scaled.data.modeling$spam == 1 ] ) - mean( p.hat.spam.glmnet.modeling.with.interactions[ scaled.data.modeling$spam == 0 ] ) ) # [1] 0.01170338 0.98225276 0.97054938 lasso, modeling fit, modeling predicted # [1] 0.01407688 0.98357964 0.96950276 ordinary maximum likelihood, # modeling fit, modeling predicted # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # and now, how well does the lasso fit to the modeling subset # predict the validation subset? scaled.data.validation.with.interactions.X <- model.matrix( spam ~ .^2, data = scaled.data.validation ) p.hat.spam.glmnet.validation.with.interactions <- predict( spam.glmnet.modeling.with.interactions, newx = scaled.data.validation.with.interactions.X, type = 'response', s = spam.glmnet.modeling.with.interactions$lambda[ 100 ] ) par( mfrow = c( 2, 1 ) ) hist( p.hat.spam.glmnet.validation.with.interactions[ scaled.data.validation$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.spam.glmnet.validation.with.interactions[ scaled.data.validation$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.spam.glmnet.validation.with.interactions[ scaled.data.validation$spam == 0 ] ), mean( p.hat.spam.glmnet.validation.with.interactions[ scaled.data.validation$spam == 1 ] ), mean( p.hat.spam.glmnet.validation.with.interactions[ scaled.data.validation$spam == 1 ] ) - mean( p.hat.spam.glmnet.validation.with.interactions[ scaled.data.validation$spam == 0 ] ) ) # [1] 0.09390095 0.90410495 0.81020400 lasso, modeling fit, validation predicted # summary: all methods use all main effects and all 2-way interactions # PSI # [1] 0.01407688 0.98357964 0.96950276 ordinary maximum likelihood, modeling fit, # modeling predicted # [1] 0.2337938 0.8184874 0.5846936 ordinary maximum likelihood, # modeling fit, validation predicted # [1] 0.01170338 0.98225276 0.97054938 lasso, modeling fit, modeling predicted # [1] 0.09390095 0.90410495 0.81020400 lasso, modeling fit, validation predicted # does the lasso pass the overfitting test at an A+ level? 
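# one optional extra on the lasso work (a minimal sketch, not run as part
# of the analysis above, and the object names, the 10-fold choice and the
# set.seed value below are just illustrative): instead of taking the last
# lambda on the glmnet path, lambda is more commonly chosen by K-fold
# cross-validation *within the modeling subset*, using cv.glmnet from the
# same package; something like the following would do it (expect it to take
# roughly 10 times as long as the 13-second glmnet fit above)

set.seed( 1 )

spam.cv.glmnet.modeling.with.interactions <- cv.glmnet(
  x = scaled.data.modeling.with.interactions.X, y = spam.modeling,
  family = 'binomial', alpha = 1, nfolds = 10, type.measure = 'deviance' )

# plot of the cross-validated deviance as a function of log( lambda )

plot( spam.cv.glmnet.modeling.with.interactions )

# lambda.min minimizes the cross-validated deviance; lambda.1se is the
# largest (most strongly penalized) lambda within 1 standard error of that
# minimum, which typically gives a sparser model

c( spam.cv.glmnet.modeling.with.interactions$lambda.min,
   spam.cv.glmnet.modeling.with.interactions$lambda.1se )

# validation-subset predictions at the cross-validated lambda would then
# be obtained with, e.g.,

p.hat.spam.cv.glmnet.validation.with.interactions <- predict(
  spam.cv.glmnet.modeling.with.interactions,
  newx = scaled.data.validation.with.interactions.X,
  s = 'lambda.1se', type = 'response' )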
###################### the tenth block of code ends here ######################

# please stop here, ensure that everything so far has been
# completed successfully, and study the most recent results

################### the eleventh block of code begins here ####################

# and now, for dessert, here's a machine learning method that
# typically achieves *really good* out-of-sample predictions

# it's called *extreme gradient boosting*, a fast, heavily optimized
# implementation of gradient boosting (GB), available in the CRAN package
# 'xgboost' (the 'x' in 'xgboost' stands for 'extreme', a reference to the
# fact that 'xgboost' automatically uses all of the threads your CPU has
# to do really fast parallel computation)

# i'll explain the GB idea in office hours; it's a bit complicated

# the main strength of GB is that it creates (highly) accurate
# predictions for data not used in the fitting process, and it does so
# in a nonparametric way

# its main weakness is that it *really* is a black box

# here, since this is dessert, let's just see 'xgboost' in action:

dynamic.require( 'xgboost' )

spam.modeling <- raw.data$spam[ ! raw.data$testid ]
spam.validation <- raw.data$spam[ raw.data$testid ]

spam.modeling.xgb <- xgb.DMatrix(
  data = as.matrix( scaled.data.modeling )[ , 2:58 ], label = spam.modeling )

spam.validation.xgb <- xgb.DMatrix(
  data = as.matrix( scaled.data.validation )[ , 2:58 ], label = spam.validation )

spam.watchlist <- list( train = spam.modeling.xgb, test = spam.validation.xgb )

set.seed( 5 )

spam.cross.validated.xgboost.model <- xgb.train( data = spam.modeling.xgb,
  nrounds = 1000, watchlist = spam.watchlist, max_depth = 4, eta = 0.5,
  nthread = 8, subsample = 0.5, objective = 'binary:logistic', verbose = 1,
  early_stopping_rounds = 100, eval_metric = 'error' )

# [1]   train-error:0.092007  test-error:0.107422
# Multiple eval metrics are present. Will use test_error for early stopping.
# Will train until test_error hasn't improved in 100 rounds.
# [2]   train-error:0.085155  test-error:0.103516
# [3]   train-error:0.080914  test-error:0.093750
# [4]   train-error:0.071126  test-error:0.084635
# [5]   train-error:0.062643  test-error:0.068359
# .
# .
# .
# [133] train-error:0.000653  test-error:0.044922
# [134] train-error:0.000653  test-error:0.045573
# [135] train-error:0.000979  test-error:0.043620   <--
# [136] train-error:0.000653  test-error:0.046875
# [137] train-error:0.000653  test-error:0.047526
# [138] train-error:0.000653  test-error:0.046875
# [139] train-error:0.000326  test-error:0.046224
# [140] train-error:0.000653  test-error:0.047526
# .
# .
# .
# [234] train-error:0.000326  test-error:0.054036
# [235] train-error:0.000653  test-error:0.055339
# Stopping.
# Best iteration:
# [135] train-error:0.000979  test-error:0.043620

# here 'error' =
#   ( number of incorrectly classified email messages ) /
#     ( total number of email messages ),
# using the simple rule 'predict spam if p.hat > 0.5'

p.hat.spam.xgb.validation <- predict( spam.cross.validated.xgboost.model,
  newdata = spam.validation.xgb )

par( mfrow = c( 2, 1 ) )

hist( p.hat.spam.xgb.validation[ scaled.data.validation$spam == 0 ],
  nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) )

hist( p.hat.spam.xgb.validation[ scaled.data.validation$spam == 1 ],
  nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) )

par( mfrow = c( 1, 1 ) )

c( mean( p.hat.spam.xgb.validation[ scaled.data.validation$spam == 0 ] ),
  mean( p.hat.spam.xgb.validation[ scaled.data.validation$spam == 1 ] ),
  mean( p.hat.spam.xgb.validation[ scaled.data.validation$spam == 1 ] ) -
    mean( p.hat.spam.xgb.validation[ scaled.data.validation$spam == 0 ] ) )

#                                     PSI
# [1] 0.04916793 0.91758378 0.86841585   xgboost (no need to specify whether
#                                          only main effects are used or whether
#                                          interactions are included),
#                                          validation predicted without being used
# [1] 0.09390095 0.90410495 0.81020400   lasso, modeling fit (main effects
#                                          plus all 2-way interactions),
#                                          validation predicted without being used

#################### the eleventh block of code ends here #####################

# there's lots more to do with this case study,
# but let's end here for now

###############################################################################
###############################################################################
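# p.s. one optional extra check (a minimal sketch, not part of the analysis
# above; the object name 'y.hat.spam.xgb.validation' is just illustrative):
# the 'error' metric printed by xgb.train can be reproduced by hand on the
# validation subset, by thresholding the estimated probabilities at 0.5 and
# counting mistakes

y.hat.spam.xgb.validation <- as.numeric( p.hat.spam.xgb.validation > 0.5 )

mean( y.hat.spam.xgb.validation != scaled.data.validation$spam )

# depending on the xgboost version, 'predict' may use either all 235 boosting
# rounds or only the best 135 of them, so this should come out close to either
# the final test-error (about 0.055) or the best test-error (about 0.0436)
# reported above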