############################################################################### ############################################################################### # stat 206, winter 2021 # dd 16 mar 2021 # R code for a partial analysis of the spam data set # in THT 3 problem 2(B) # fitting maximum-likelihood and bayesian logistic regression models # to predict a dichotomous outcome from some quantitative predictors # (0) each new task for you below begins with a number, e.g., (1); # please ensure that you don't miss any of these numbers # as you work sequentially through the analysis from the top # to the bottom of this file # also: there are a number of stop signs below; most of them # look like this: # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # i *highly* recommend that you do what the stop signs suggest, # each time that they appear; this will break the analysis down # into bite-sized pieces ################## the first block of code starts here ######################## ##### data set context and description ######################################## # (1) please begin this analysis by studying the file # 'spam-data-context.txt' # on the course web page, for important context and data description # we have 4601 email messages, each marked 'spam' or non-spam ('ham') # in a gold-standard manner, and 58 other variables: # 57 of them are potential predictors of the dichotomous (1/0) outcome # 'spam', and 1 is a dichotomy that records whether each email message # belongs either to the randomly-partitioned modeling or validation # subsets for cross-validation purposes # note: in machine learning the usual terminology is 'training' for # what i call 'modeling' and 'test' for what i call 'validation', # and where statisticians would say 'predictor variable', # machine learners would usually say 'feature' or 'attribute' # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # (2) make a new (empty) directory in which to store the input and output # files for this analysis, and download into this directory, from the # course website, all of the .txt files with the word 'spam' in their # names and/or descriptions # on my desktop at home this directory is called # 'C:/DD/Teaching/AMS-STAT-206/Winter-2021/Spam-Data-Analysis' # please stop here and ensure that everything so far has been # completed successfully ##### preamble ################################################################ # (3) start up a fresh copy of R, with no R objects in your current session; # verify this with the function call ls( ) # R should reply # character(0) # if it doesn't, issue the function calls rm( list = ls( ) ) ; ls( ) # now R should reply # character(0) # please stop here and ensure that everything so far has been # completed successfully # (4) if relevant on your machine (e.g., Windows people using R instead # of RStudio), unbuffer the output # please stop here and ensure that everything so far has been # completed successfully # (5) set your working directory to the place where you downloaded # the 'spam' files # as noted above, on my desktop at home this is # setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2021/Spam-Data-Analysis' ) # please stop here and ensure that everything so far has been # completed successfully # (6) the next command reads in code for some functions we'll use below source( 'spam-analysis-functions-R.txt' ) # please stop here and ensure that everything so far 
has been # completed successfully # (7) the next command calls a function defined in # 'spam-analysis-functions-R.txt' that automatically # installs a package (if it has not previously been installed) # and brings it into the current R session with 'library'; if it # *has* already been installed the function just invokes 'library' dynamic.require( 'gtools' ) # the output of this function should end with # "done" # please stop here and ensure that everything so far has been # completed successfully ##### read in the data set and examine its structure ########################## # (8) let's get the data set into R and see what we've got: raw.data <- read.table( 'spam.txt', header = T ) str( raw.data ) # 'data.frame': 4601 obs. of 59 variables: # $ spam : logi TRUE TRUE TRUE TRUE TRUE TRUE ... # $ testid : logi TRUE FALSE TRUE FALSE FALSE FALSE ... # $ make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ... # $ address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ... # $ all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ... # $ X3d : num 0 0 0 0 0 0 0 0 0 0 ... # $ our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ... # $ over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ... # $ remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ... # $ internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ... # $ order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ... # $ mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ... # $ receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ... # $ will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ... # $ people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ... # $ report : num 0 0.21 0 0 0 0 0 0 0 0 ... # $ addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ... # $ free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ... # $ business : num 0 0.07 0.06 0 0 0 0 0 0 0 ... # $ email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ... # $ you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ... # $ credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ... # $ your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ... # $ font : num 0 0 0 0 0 0 0 0 0 0 ... # $ X000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ... # $ money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ... # $ hp : num 0 0 0 0 0 0 0 0 0 0 ... # $ hpl : num 0 0 0 0 0 0 0 0 0 0 ... # $ george : num 0 0 0 0 0 0 0 0 0 0 ... # $ X650 : num 0 0 0 0 0 0 0 0 0 0 ... # $ lab : num 0 0 0 0 0 0 0 0 0 0 ... # $ labs : num 0 0 0 0 0 0 0 0 0 0 ... # $ telnet : num 0 0 0 0 0 0 0 0 0 0 ... # $ X857 : num 0 0 0 0 0 0 0 0 0 0 ... # $ data : num 0 0 0 0 0 0 0 0 0.15 0 ... # $ X415 : num 0 0 0 0 0 0 0 0 0 0 ... # $ X85 : num 0 0 0 0 0 0 0 0 0 0 ... # $ technology: num 0 0 0 0 0 0 0 0 0 0 ... # $ X1999 : num 0 0.07 0 0 0 0 0 0 0 0 ... # $ parts : num 0 0 0 0 0 0 0 0 0 0 ... # $ pm : num 0 0 0 0 0 0 0 0 0 0 ... # $ direct : num 0 0 0.06 0 0 0 0 0 0 0 ... # $ cs : num 0 0 0 0 0 0 0 0 0 0 ... # $ meeting : num 0 0 0 0 0 0 0 0 0 0 ... # $ original : num 0 0 0.12 0 0 0 0 0 0.3 0 ... # $ project : num 0 0 0 0 0 0 0 0 0 0.06 ... # $ re : num 0 0 0.06 0 0 0 0 0 0 0 ... # $ edu : num 0 0 0.06 0 0 0 0 0 0 0 ... # $ table : num 0 0 0 0 0 0 0 0 0 0 ... # $ conference: num 0 0 0 0 0 0 0 0 0 0 ... # $ ch. : num 0 0 0.01 0 0 0 0 0 0 0.04 ... # $ ch..1 : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ... # $ ch..2 : num 0 0 0 0 0 0 0 0 0 0 ... # $ ch..3 : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ... # $ ch..4 : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ... # $ ch..5 : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ... # $ crl.ave : num 3.76 5.11 9.82 3.54 3.54 ... 
# $ crl.long : int 61 101 485 40 40 15 4 11 445 43 ... # $ crl.tot : int 278 1028 2259 191 191 54 112 49 1257 749 ... # the data frame 'raw.data' behaves in this case study like a matrix # with 4601 rows (one for each email message) and 59 columns (one for # each variable measured on the email messages), except that it's not # the case that every entry in the matrix is a real number # the 54 predictors from 'make' to 'ch..5' are expressed as percentages # from 0 to 100 # to avoid variable names beginning with a number, which are not allowed # in R, note that the variables 'X000', 'X650', 'X857', 'X415','X85', # and 'X1999' all have names starting with 'X' # in 'crl.ave', 'crl.long', and 'crl.tot', the character string # 'crl' is short for 'capital run length'; in other words, 'crl.ave' # is the average length in each email message of consecutive runs # of capital letters, 'crl.long' is the longest run of consecutive # capital letters in the email message, and 'crl.tot' is the total # number of consecutive capital letters in each message # note: ch. = ; # ch..1 = ( # ch..2 = [ # ch..3 = ! # ch..4 = $ # ch..5 = # # note that the variables 'spam' and 'testid' are of type 'logi' # (short for 'logical'), with possible values 'TRUE' and 'FALSE'; # the variables 'crl.long' and 'crl.tot' are of type 'int' (short # for 'integer'), with possible values 0, 1, ...; and all of the other # variables are of type 'num' (short for 'numerical'), with possible # values of the form # any real number truncated to about 16 significant figures # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ##### data curation preceding the modeling, part (I) ########################## # (9) now let's begin getting the data ready for modeling # the next function call examines all 4601 * 59 = 271459 values in # the data frame 'raw.data' and counts the total number of missing values # in the data set (recall that 'NA' (not available) is the missing data # code in R, so the function that looks for missing values is called # 'is.na'; it returns a logical matrix of 271459 TRUE and FALSE values # (TRUE if the data frame entry is 'NA', FALSE otherwise), which are # converted to 1s (TRUE) and 0s (FALSE) by the 'sum' function: sum( is.na( raw.data ) ) # [1] 0 # the total number of missing values in 'raw.data' is 0; excellent # in a follow-on class to STAT 206, we would need several lectures # to discuss how to handle missingness in simple ways, and quite # a few more lectures to discuss advanced methods for coping with # missing data print( n <- nrow( raw.data ) ) # [1] 4601 # this is of course the number of documents (email messages) we have print( k <- ncol( raw.data ) - 2 ) # [1] 57 # and this is of course the number of predictors we have # the next command recodes the outcome variable to numeric from logical raw.data$spam <- ifelse( raw.data$spam == T, 1, 0 ) # the next command computes the relative frequency distribution of 'spam' with( raw.data, table( spam ) / n ) # spam # 0 1 # 0.6059552 0.3940448 # so we have about 39% 1s and 61% 0s; that should be enough of each # category to have a decent chance of predicting the 1s # in some of the modeling we'll do below, we'll need *standardized* # versions of all of the predictor variables: to standardize a predictor x # we compute # x.standardized <- ( x - mean( x ) ) / sd( x ) , # which makes x.standardized have mean 0 and SD 1 # since this is a linear transformation, it can't increase or decrease # the amount of 
signal for predicting an outcome y from x; what it *does* do # is put the regression coefficients on a common scale in the # *logistic regression* models we're going to fit scaled.X.data.frame <- as.data.frame( scale( raw.data[ , 3:( k + 2 ) ] ) ) # scaled.X.data.frame is now a data frame (not a matrix) consisting of # all of the predictor variables, scaled as above spam <- raw.data$spam scaled.data.frame <- cbind( spam, scaled.X.data.frame ) # and 'scaled.data.frame' now has all 57 standardized predictors # and the outcome # the next command computes the relative frequency distribution of 'testid' with( raw.data, table( testid ) / n ) # testid # FALSE TRUE # 0.6661595 0.3338405 # i use 'T' and 'F' as abbreviations for 'TRUE' and 'FALSE'; # R allows us to do this in our code as well as when describing our code # the people who created this data set put almost exactly 2/3 # of the data in the modeling subset ('testid' = F) and 1/3 # in the validation subset ('testid' = T); this is a reasonable choice # another reasonable choice would be 50/50; the research literature # is still unsettled over the question of the optimal modeling/validation # split # the next two commands extract the modeling and validation subsets; # our plan in this analysis is to use 2-way cross-validation, i.e., # to build predictive models on the 'modeling' data and see how well # they perform on the 'validation' data raw.data.modeling <- raw.data[ ! raw.data$testid, ] print( n.modeling <- dim( raw.data.modeling )[ 1 ] ) # [1] 3065 raw.data.validation <- raw.data[ raw.data$testid, ] print( n.validation <- dim( raw.data.validation )[ 1 ] ) # [1] 1536 scaled.data.modeling <- scaled.data.frame[ ! raw.data$testid, ] scaled.data.validation <- scaled.data.frame[ raw.data$testid, ] # from now on, for quite awhile, we're only going to work with # the modeling subset, leaving the validation subset untouched # to find out how good our modeling is on data not used in # the fitting process # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ##### data curation preceding the modeling, part (II) ######################### ##### initial model-building on the 'modeling' subset ######################### # (10) now we can begin looking at the distributions of # our predictor variables # at this point in careful statistical data science work, we would # examine each predictor variable one by one, to see if # *non-linearly transformed* versions of any of the variables might have # more predictive signal in them than the raw versions do # since this is quite time-consuming here with 57 predictors, # i'm only going to ask you to examine several of them, to give you # a sense of how this process goes # we want to make numerical and graphical descriptive summaries # (a) of all of the predictors univariately and (b) of the # bivariate relationships between each of the predictors and the outcome # the built-in function 'summary' is a good place to start # please examine the file # 'spam-data-analysis-variable-summary.txt' # on the course web page, which includes a summary of all 59 variables # obtained with the command summary( raw.data.modeling ) # study this file; then run this code: for ( i in 1:57 ) { if ( mean( raw.data.modeling[ , i + 2 ] ) > median( raw.data.modeling[ , i + 2 ] ) ) { print( paste( 'yes, mean > median for predictor ', i, sep = '' ) ) } else { print( paste( 'no, mean <= median for predictor ', i, sep = '' ) ) } } for ( i in 1:57 ) { if ( min( 
raw.data.modeling[ , i + 2 ] ) >= 0 ) { print( paste( 'yes, min >= 0 for predictor ', i, sep = '' ) ) } else { print( paste( 'no, min < 0 for predictor ', i, sep = '' ) ) } } # note that *every one* of the 57 predictors has its mean bigger # than its median while at the same time being non-negative; # what does that tell you about the histograms shapes? # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ################## the first block of code ends here ######################## ################## the second block of code begins here ##################### # (11) to see all 57 histograms of the predictor variables below, # you may need to adjust the choice of 'n.histogram.matrix' # and fiddle with the rest of the histogram code, # if your plotting window is too small, or you could try # resizing your plotting window to make it bigger # now run only plotting code block 1 and examine the plot ##### plotting code block 1 begins on this line ############################# n.histogram.matrix <- 5 par( mfrow = c( n.histogram.matrix, n.histogram.matrix ) ) for ( i in 1:n.histogram.matrix^2 ) { hist( raw.data.modeling[ , i + 2 ], main = '', prob = T, xlab = names( raw.data )[ i + 2 ] ) } par( mfrow = c( 1, 1 ) ) ##### plotting code block 1 ends on this line ############################### # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # after examining the first set of plots, run plotting code block 2 ##### plotting code block 2 begins on this line ############################# par( mfrow = c( n.histogram.matrix, n.histogram.matrix ) ) for ( i in ( n.histogram.matrix^2 + 1 ):( 2 * n.histogram.matrix^2 ) ) { hist( raw.data.modeling[ , i + 2 ], main = '', prob = T, xlab = names( raw.data )[ i + 2 ] ) } par( mfrow = c( 1, 1 ) ) ##### plotting code block 2 ends on this line ############################## # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # and after examining the second set of plots, run plotting code block 3 ##### plotting code block 3 begins on this line ############################ # run this plotting code block twice: the first time as is, # the second time removing the # comment character at the beginning # of the pdf( ... 
) and dev.off( ) lines, to save this plot to include
# in your solutions

# pdf( 'stat-206-take-home-test-3-solutions-figure-for-2(B)(d).pdf' )

par( mfrow = c( 3, 3 ) )

for ( i in ( 2 * n.histogram.matrix^2 + 1 ):( ncol( raw.data ) - 2 ) ) {

  hist( raw.data.modeling[ , i + 2 ], main = '', prob = T,
    xlab = names( raw.data )[ i + 2 ] )

}

par( mfrow = c( 1, 1 ) )

# dev.off( )

##### plotting code block 3 ends on this line #############################

# please stop here, ensure that everything so far has been
# completed successfully, and study the most recent results

################## the second block of code ends here #####################

################## the third block of code begins here ####################

# this code block computes the correlations between the outcome
# variable 'spam' and all of the predictors and sorts them from largest
# to smallest in absolute value before printing them, to show us
# which predictors are most strongly correlated (univariately) with 'spam'

correlation.vector <- rep( NA, k )

for ( i in 1:k ) {

  correlation.vector[ i ] <- cor( raw.data.modeling$spam,
    raw.data.modeling[ , i + 2 ] )

}

correlation.table <- cbind( names( raw.data.modeling[ , 3:( k + 2 ) ] ),
  signif( correlation.vector, 4 ) )

correlation.table[ order( abs( correlation.vector ), decreasing = T ), ]

#       [,1]         [,2]
#  [1,] "your"       "0.4012"
#  [2,] "remove"     "0.3421"
#  [3,] "free"       "0.3381"
#  [4,] "X000"       "0.3369"
#  [5,] "ch..4"      "0.3271"
#  [6,] "you"        "0.2778"
#  [7,] "business"   "0.2724"
#  [8,] "hp"         "-0.2646"
#  [9,] "receive"    "0.2467"
# [10,] "over"       "0.2427"
#
#   .
#   .
#   .
#
# [48,] "ch..2"      "-0.06692"
# [49,] "ch..5"      "0.06583"
# [50,] "report"     "0.0648"
# [51,] "direct"     "-0.06262"
# [52,] "X3d"        "0.05758"
# [53,] "ch."        "-0.05623"
# [54,] "table"      "-0.04498"
# [55,] "address"    "-0.04102"
# [56,] "parts"      "-0.03357"
# [57,] "will"       "0.007804"

################## the third block of code ends here ######################

# please stop here, ensure that everything so far has been
# completed successfully, and study the most recent results

with( raw.data.modeling, plot( your, spam ) )

with( raw.data.modeling, lines( lowess( your, spam ), lty = 2, lwd = 3,
  col = 'maroon' ) )

with( raw.data.modeling, plot( will, spam ) )

with( raw.data.modeling, lines( lowess( will, spam ), lty = 2, lwd = 3,
  col = 'maroon' ) )

################## the fourth block of code begins here ###################

##### run this first sub-block, stop and examine the output ###############

# long-right-hand-tailed variables that take on only non-negative values
# can sometimes be transformed to Normality by taking logs: if the mode
# of such a variable is positive, the log transform may well work; if the
# mode is 0, the transformed variable will have a spike in the left tail,
# with the rest of the transformed values perhaps looking Normal

# this is an instance of method (I) in the problem statement
# for finding extra predictive signal

# let's look at the variable 'make'

with( raw.data.modeling, hist( make, breaks = 100, prob = T, main = '' ) )

with( raw.data.modeling, sum( make == 0 ) / n.modeling )

# [1] 0.7641109

# this shows that the mode of 'make' is 0

# if we were to try to compute log( make ), 76.4% of the values
# would be ( - infinity )

with( raw.data.modeling, min( make[ make > 0 ] ) )

# [1] 0.01

# a standard thing to try with a variable such as 'make'
# is the transformation
#   x -> log( x + C ),
# where C is the smallest positive value x takes on

log.make.modeling <- log( raw.data.modeling$make + 0.01 )

hist(
log.make.modeling, breaks = 100, prob = T, main = '', xlab = 'log( make + 0.01 )' ) par( mfrow = c( 2, 1 ) ) with( raw.data.modeling, hist( make[ make > 0 ], breaks = 100, prob = T, main = '' ) ) with( raw.data.modeling, hist( log( make[ make > 0 ] ), breaks = 20, prob = T, main = '' ) ) par( mfrow = c( 1, 1 ) ) # we see the anticipated shape # a good bit of the predictive signal from such a variable x # can be extracted by (a) making an indicator variable that's 1 # when x is 0 and (b) predicting y from both log( x + C ) and # the indicator make.is.0.modeling <- ifelse( raw.data.modeling$make == 0, 1, 0 ) tab.sum( make.is.0.modeling, raw.data.modeling$spam ) # make.is.0.modeling n mean sd # [1,] 0 723 0.593361 0.4915464 # [2,] 1 2342 0.3368915 0.4727484 # [3,] "Total" 3065 0.3973899 0.4894378 # you can see that the indicator variable for ( make = 0 ) # by itself has a lot of predictive signal # here are the results of a logistic regression of 'spam' # just on 'make' (note that it's always good form to store # the results of the regression fitting in an object): logistic.regression.of.spam.on.make <- glm( spam ~ make, family = binomial( link = 'logit' ), data = raw.data.modeling ) summary( logistic.regression.of.spam.on.make ) # Call: # glm(formula = raw.data.modeling$spam ~ raw.data.modeling$make, # family = binomial(link = 'logit')) # Deviance Residuals: # Min 1Q Median 3Q Max # -2.4152 -0.9729 -0.9729 1.3967 1.3967 # Coefficients: # Estimate Std. Error z value Pr(>|z|) # (Intercept) -0.50213 0.03967 -12.66 < 2e-16 *** # raw.data.modeling$make 0.77487 0.13201 5.87 4.36e-09 *** # --- # Signif. codes: 0 ?***? 0.001 ?**? 0.01 ?*? 0.05 ?.? 0.1 ? ? 1 # (Dispersion parameter for binomial family taken to be 1) # Null deviance: 4119 on 3064 degrees of freedom # Residual deviance: 4078 on 3063 degrees of freedom # AIC: 4082 # Number of Fisher Scoring iterations: 4 # there are logistic regression situations in which maximum-likelihood (ML) # fitting fails, because the log-likelihood function goes to + infinity # when one or more of the estimated beta coefficients tends to # infinity in absolute value; this may occur when some of the p.hat # values from the ML fit are identically 1 or 0 (or close enough to 1 or 0 # to be essentially 1 or 0 in practice), because # logit( 1 ) = + infinity and logit( 0 ) = - infinity # glm tries to cope with this by enforcing a condition of the form # - boundary < beta.hat < + boundary # where 'boundary' is a value that represents 'essentially infinite' # this means that we always need to follow a call to glm by # asking it two questions: do you think that the Fisher scoring algorithm # converged? and did it converge to a vector of beta.hat values # in which one or more of the beta.hats touched the boundary? # having previously stored the results of the glm fitting in an object, # (here 'logistic.regression.of.spam.on.make'), this can be done # with the following command: print( paste( 'converged?', logistic.regression.of.spam.on.make$converged, 'boundary?', logistic.regression.of.spam.on.make$boundary ) ) # [1] "converged? TRUE boundary? 
FALSE" # this means that everything is ok: convergence occurred, # and not to the boundary ############# the first sub-block ends here ################################ # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # when you've absorbed the output of the first sub-block, ##### run this second sub-block, stop and examine the output ############### # and here are the results of a logistic regression of 'spam' # on log( 'make' + 0.01 ) and 'make.is.0': logistic.regression.of.spam.on.log.make.and.make.is.0 <- glm( spam ~ log.make.modeling + make.is.0.modeling, family = binomial( link = 'logit' ), data = raw.data.modeling ) summary( logistic.regression.of.spam.on.log.make.and.make.is.0 ) # Call: # glm(formula = raw.data.modeling$spam ~ log.make.modeling + # make.is.0.modeling, family = binomial(link = 'logit')) # Deviance Residuals: # Min 1Q Median 3Q Max # -1.5085 -0.9064 -0.9064 1.4751 1.4751 # Coefficients: # Estimate Std. Error z value Pr(>|z|) # (Intercept) 0.54374 0.12340 4.406 1.05e-05 *** # log.make.modeling 0.14127 0.08209 1.721 0.0853 . # make.is.0.modeling -0.57034 0.29408 -1.939 0.0524 . # --- # Signif. codes: 0 ?***? 0.001 ?**? 0.01 ?*? 0.05 ?.? 0.1 ? ? 1 # (Dispersion parameter for binomial family taken to be 1) # Null deviance: 4119.0 on 3064 degrees of freedom # Residual deviance: 3966.8 on 3062 degrees of freedom # AIC: 3972.8 # Number of Fisher Scoring iterations: 4 print( paste( 'converged?', logistic.regression.of.spam.on.log.make.and.make.is.0$converged, 'boundary?', logistic.regression.of.spam.on.log.make.and.make.is.0$boundary ) ) # [1] "converged? TRUE boundary? FALSE" ################## the second sub-block ends here ########################### ################## the fourth block of code ends here ####################### # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ################## the fifth block of code begins here ###################### # a different method for extracting more predictive signal # that often works is to transform a continuous variable # such as 'make' into a set of categorical indicators, # by cutting it at (for example) its deciles (10th percentile, # 20th percentile, ..., 100th percentile) # this permits non-linear relationships between the predictor # and outcome to be explored without having to specify # the nature of the non-linearity: that makes it a # *nonparametric* way to look for extra signal # this is an instance of method (II) in the problem statement # for finding extra predictive signal # i've written a function called # 'logistic.regression.univariate.exploration' # that does this; i included it in the file 'spam-analysis-functions-R.txt' # that you 'source'd at the beginning of this analysis # the syntax of this function is as follows: # logistic.regression.univariate.exploration( x, y, name.x, # n.bootstrap.replicates, n.cutpoints, plot.pdf ) # 'x' is the predictor; 'y' is the binary outcome; name.x # is a character string giving the name of the variable x, # 'n.bootstrap.replicates' is the desired number of lowess bootstrap curves # in the plot; 'n.cutpoints' is the desired number of categories # defined by the quantiles of 'x' (if a single value of 'x' occurs with # high frequency, you will get fewer categories; for example, 'make' # is exactly 0 76% of the time, so when i ask for 10 groups cut at the # deciles of 'make' only 3 groups are defined: (0th to 80th percentile), # (80th to 
90th), (90th to 100th); and you say 'plot.pdf = T' if # you want to make a PDF copy of the plot generated by this command # (this will be stored in your current working directory) and # 'plot.pdf = F if you don't want the PDF copy made # a reminder: *lowess* (locally weighted scatterplot smoother) # is a nonparametric way to explore the relationship between a single # real-valued predictor x and a single real-valued outcome y; # it does this by estimating the conditional expectation of y given x # in a number of overlapping vertical strips in the scatterplot # and then smoothing the resulting estimates # now run only exploration code block 1 and examine the output ##### exploration code block 1 begins on this line ######################## # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # the next command examines the predictor 'make' with 10 requested # quantile cuts with( raw.data.modeling, logistic.regression.univariate.exploration( make, spam, 'make', 100, 10, plot.pdf = F ) ) ##### exploration code block 1 ends on this line ######################## # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # after studying the first set of outputs, run exploration code block 2 ##### exploration code block 2 begins on this line ######################## # the next command examines 'make' with 20 requested quantile cuts with( raw.data.modeling, logistic.regression.univariate.exploration( make, spam, 'make', 100, 20, plot.pdf = F ) ) ##### exploration code block 2 ends on this line ######################## # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # after studying the second set of outputs, run exploration code block 3 ##### exploration code block 3 begins on this line ######################## # the next command examines the predictor 'your' with 10 requested # quantile cuts, and saves the plot in your current working directory with( raw.data.modeling, logistic.regression.univariate.exploration( your, spam, 'your', 100, 10, plot.pdf = T ) ) # include the resulting plot 'your.pdf' in your solutions ##### exploration code block 3 ends on this line ######################## ################## the fifth block of code ends here ####################### # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # at this point, if we had enough time, it would be good data-science # practice to transform all of the predictors to extract maximum # predictive signal before fitting a version of the full model # with all predictors included # but instead we're now just going to fit the full model without taking # advantage of the extra signal from transformations, because something # happens that's worth knowing about when fitting GLMs to binary outcomes # a reminder: this is not cheating -- in the 2-way cross-validation # method we're using here, we're allowed to mess around any way # we want in the modeling data, as long as the result validates # in the validation data (if it doesn't ...) 
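# before moving on, an optional aside on the fifth block: if you're curious
# how the quantile-cut idea works under the hood, here is a minimal sketch;
# the function 'quantile.cut.exploration' below is a made-up name used purely
# for illustration -- it is *not* the
# 'logistic.regression.univariate.exploration' function you sourced from
# 'spam-analysis-functions-R.txt', and it omits that function's bootstrapped
# lowess curves and plotting

quantile.cut.exploration <- function( x, y, n.cutpoints = 10 ) {

  # cut x at its empirical quantiles; duplicated quantiles (e.g., when x
  # is very often exactly 0) are collapsed, so you may get fewer groups
  # than you asked for

  breaks <- unique( quantile( x,
    probs = seq( 0, 1, length.out = n.cutpoints + 1 ) ) )

  group <- cut( x, breaks = breaks, include.lowest = TRUE )

  # the proportion of ( y = 1 ) in each group is a nonparametric look
  # at P( y = 1 | x )

  print( tapply( y, group, mean ) )

  # a logistic regression on the group indicators explores the x-y
  # relationship without assuming linearity on the logit scale

  summary( glm( y ~ group, family = binomial( link = 'logit' ) ) )

}

# example call (commented out; run it only if you want to compare with
# the output of the course function):

# with( raw.data.modeling, quantile.cut.exploration( make, spam, 10 ) )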
################## the sixth block of code begins here ###################### # the next command (a) fits a maximum likelihood logistic regression # model to the modeling subset, with 'spam' as the outcome and all # of the predictors standardized, and (b) prints a summary # of the results: # the output of this command is sufficiently lengthy that i've put it # in a file called 'spam-data-analysis-glm-results.txt' on the # course web page system.time( print( summary( full.model.all.x.scaled.modeling <- glm( spam ~ ., data = scaled.data.modeling, family = binomial( link = 'logit' ) ) ) ) ) # Call: # glm(formula = spam ~ ., family = binomial(link = "logit"), # data = scaled.data.modeling) # Deviance Residuals: # Min 1Q Median 3Q Max # -4.4162 -0.1754 0.0000 0.0959 3.6230 # Coefficients: # Estimate Std. Error z value Pr(>|z|) # (Intercept) -18.98938 2.84602 -6.672 2.52e-11 *** # make -0.05356 0.07873 -0.680 0.496312 # address -0.18425 0.10554 -1.746 0.080860 . # all 0.11758 0.07840 1.500 0.133705 # X3d 3.38348 2.63833 1.282 0.199690 # our 0.26995 0.07383 3.656 0.000256 *** # . # . # . # ch..4 1.65143 0.24410 6.765 1.33e-11 *** # ch..5 0.93880 0.64286 1.460 0.144193 # crl.ave 1.17019 0.93301 1.254 0.209766 # crl.long 2.40336 0.70537 3.407 0.000656 *** # crl.tot 0.35077 0.17446 2.011 0.044367 * # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # (Dispersion parameter for binomial family taken to be 1) # Null deviance: 4119 on 3064 degrees of freedom # Residual deviance: 1125 on 3007 degrees of freedom # AIC: 1241 # Number of Fisher Scoring iterations: 13 # user system elapsed # 0.25 1.05 1.29 # on my desktop at home, even with 3065 observations and 57 predictors, # the iterative maximum-likelihood fitting only takes about 1 second # (a good resource for understanding the iterative fitting algorithm # used by R in logistic regression is here) # http://hua-zhou.github.io/teaching/biostatm280-2017spring/slides/18-newton/newton.html # Warning message: # glm.fit: fitted probabilities numerically 0 or 1 occurred # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # the warning message above may be telling us that the maximum-likelihood # fitting process has bumped up against the boundary for one or more # of the beta.hats; let's find out print( paste( 'converged?', full.model.all.x.scaled.modeling$converged, 'boundary?', full.model.all.x.scaled.modeling$boundary ) ) # [1] "converged? TRUE boundary? FALSE" # so we don't have a boundary problem # however, after standardization of the predictors, any regression # coefficient bigger than about 3 in absolute value is really big # let's see why this is true # here's a simple function that computes the marginal effect of adding # a new binary predictor variable x with logistic regression coefficient # beta.x to the prediction process: logistic.regression.effect.of.adding.a.new.variable <- function( p.hat.old, beta.x ) { l.old <- log( p.hat.old / ( 1 - p.hat.old ) ) l.new <- l.old + beta.x p.hat.new <- 1 / ( 1 + exp( - l.new ) ) return( cbind( p.hat.old, p.hat.new ) ) } p.hat.old <- c( 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99 ) # if beta.x = 0, of course the old and new p.hat values are the same: logistic.regression.effect.of.adding.a.new.variable( p.hat.old, -3 ) # try calling this function (a) with beta.x = -3 # and (b) with beta.x = +3 # would you say that the p.hat values change dramatically with # beta.x = +/- 3? 
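# optional aside: as a worked example of the point above, try the call
# below (the numbers quoted in the comment are approximate values computed
# by hand from the inverse-logit formula, just to show the size of the effect):

logistic.regression.effect.of.adding.a.new.variable( p.hat.old, 3 )

# with beta.x = +3, an old p.hat of 0.5 (logit 0) moves to
# 1 / ( 1 + exp( - 3 ) ), which is about 0.95, and an old p.hat of 0.25
# moves to about 0.87; beta.x = -3 does the same thing in the opposite
# direction, so a standardized coefficient of about 3 in absolute value
# shifts the predicted probabilities a great deal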
# looking at the output above, you can see that we *do* have several # predictors with massive beta.hats after standardization, but some of # these predictors (which ones?) also have massive beta.hat # standard errors; bayesian logistic regression with a # *regularizing prior* will help to stabilize maximum-likelihood # estimation here # as we've discussed in class, maximum likelihood estimates can be # biased; with n as the number of observations (rows) and k as the # number of predictors in the data set, if k is large relative to n, # maximum-likelihood estimates of logistic regression coefficients # can be seriously biased away from 0 (i.e., if the population beta is # (e.g.) 5 the MLE might come out 7.5, or if beta = -5 the MLE # could easily be -6.9) # switching from a maximum likelihood analysis to a bayesian analysis # with a prior distribution on the beta coefficients that's centered # at 0 for each coefficient can de-bias the resulting estimates # of the betas; such priors are called *regularizing* because # the posterior modes of the beta coefficients -- these are called the # maximum a posteriori (MAP) estimates -- # are arrived at by essentially grabbing the MLEs and shrinking them # toward 0 in a way that makes their behavior 'more regular' # (i.e., closer to unbiased) ################## the sixth block of code ends here ###################### # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ################## the seventh block of code begins here ################## # first let's see how good the maximum likelihood logistic regression # fitting thinks it's doing *internal* to the modeling data set, # by looking at its predicted P ( spam | X ... B ) values on # the modeling subset p.hat.full.model.all.x.scaled.modeling <- predict( full.model.all.x.scaled.modeling, type = 'response' ) par( mfrow = c( 2, 1 ) ) hist( p.hat.full.model.all.x.scaled.modeling[ scaled.data.modeling$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.full.model.all.x.scaled.modeling[ scaled.data.modeling$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) # the plot shows a decent degree of separation between the # ( p.hat values when spam = 1 ) and the ( p.hat values when spam = 0 ) c( mean( p.hat.full.model.all.x.scaled.modeling[ scaled.data.modeling$spam == 0 ] ), mean( p.hat.full.model.all.x.scaled.modeling[ scaled.data.modeling$spam == 1 ] ), mean( p.hat.full.model.all.x.scaled.modeling[ scaled.data.modeling$spam == 1 ] ) - mean( p.hat.full.model.all.x.scaled.modeling[ scaled.data.modeling$spam == 0 ] ) ) # PSI # [1] 0.09029248 0.86307864 0.77278616 # now let's see how the maximum likelihood logistic regression model # *fit to the modeling subset* does in making predictions # in the *validation subset* scaled.data.validation.X <- subset( scaled.data.validation, select = - spam ) p.hat.full.model.all.x.scaled.validation <- predict( full.model.all.x.scaled.modeling, type = 'response', newdata = scaled.data.validation.X ) c( mean( p.hat.full.model.all.x.scaled.validation[ scaled.data.validation$spam == 0 ] ), mean( p.hat.full.model.all.x.scaled.validation[ scaled.data.validation$spam == 1 ] ), mean( p.hat.full.model.all.x.scaled.validation[ scaled.data.validation$spam == 1 ] ) - mean( p.hat.full.model.all.x.scaled.validation[ scaled.data.validation$spam == 0 ] ) ) # PSI # [1] 0.09941087 0.85350183 0.75409096 ( 0.77278616 - 
0.75409096 ) / 0.75409096 # [1] 0.0247917 ################## the seventh block of code ends here ################## # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ################## the eighth block of code begins here ################## # now let's fit a bayesian logistic regression model to the # modeling data, using an excellent regularization option called the # *horseshoe* prior # this is a carefully-constructed prior for the beta coefficients # that puts most of its weight near 0 but that has extremely long tails # there's a good MCMC CRAN package for doing this fitting # called 'bayesreg' dynamic.require( 'bayesreg' ) scaled.data.modeling.spam.factor <- scaled.data.modeling scaled.data.modeling.spam.factor$spam <- as.factor( scaled.data.modeling$spam ) # in their documentation, the 'bayesreg' package developers unhelpfully # forgot to mention what they use for starting values, and moreover # you can't change their choice; let's assume that a burn-in of # 1000 iterations (their default choice) is (more than) enough # to reach equilibrium # additionally, they have alpha hardwired at 0.05 in reporting # 100 ( 1 - alpha )% credible intervals for the regression coefficients # and other interesting quantities; the output of their function call # is a list of 32 objects, from which we could extract the MCMC data set # and build our own 99.9% intervals, but i haven't supplied the code # for that here # in the next function call, you may need to adjust n.samples (the number # of monitoring iterations) down from 100000, depending on how fast # your computer is and how long you want to wait for the results; # please don't go below n.samples = 10000 # the 'bayesreg' function unfortunately only uses a single thread # on whatever machine it's run on # on my desktop at home, in which the threads run at 3.40GHz, # 101000 (burn-in + monitoring) iterations took 20-30 minutes, # depending on system load outside R n.samples <- 100000 system.time( bayesian.logistic.regression.modeling.subset <- bayesreg( spam ~ ., data = scaled.data.modeling.spam.factor, model = 'logistic', prior = 'horseshoe', n.samples = n.samples, burnin = 1000, thin = 1 ) ) summary( bayesian.logistic.regression.modeling.subset ) # the full output from this command is lengthy; i've put it into a file # called 'spam-data-analysis-bayesreg-results.txt' on the course # web page; here's a partial summary # ========================================================================================== # | Bayesreg: Bayesian Penalised Regression Estimation ver. 1.1 | # | (c) Daniel F Schmidt, Enes Makalic. 2016-9 | # ========================================================================================== # Bayesian logistic horseshoe regression Number of obs = 3065 # Number of vars = 57 # MCMC Samples = 100000 Log. Likelihood = -566.35 # MCMC Burnin = 1000 Pseudo R2 = 0.7250 # MCMC Thinning = 1 WAIC = 626.02 # -------------+---------------------------------------------------------------------------- # Parameter | med(Coef) std(OR) [95% Cred. Interval] tStat Rank ESS % # -------------+---------------------------------------------------------------------------- # make | -0.03533 0.06490 -0.19245 0.06379 -0.700 51 * 21.6 # . # . # . # crl.tot | 0.23470 0.14389 0.00770 0.56465 1.722 29 ** 2.1 # _cons | -16.61868 2.09895 -20.08681 -11.64032 . . . 
# -------------+---------------------------------------------------------------------------- # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # let's compare the maximum likelihood and bayes coefficients, # and make a computation that's relevant to how well the markov chain # is mixing: full.model.all.x.scaled.modeling.ess <- floor( bayesian.logistic.regression.modeling.subset$ess.frac * n.samples ) # the ESS (effective sample size) for a beta coefficient is an estimate of # the number of IID samples that would have yielded # the same MCMC accuracy in estimating that beta coefficient # as the MCMC sample from a monitoring run of length 'n.samples' options( scipen = 100 ) cbind( full.model.all.x.scaled.modeling$coefficients[ 2:( k + 1 ) ], bayesian.logistic.regression.modeling.subset$mu.beta, abs( bayesian.logistic.regression.modeling.subset$mu.beta ) < abs( full.model.all.x.scaled.modeling$coefficients[ 2:( k + 1 ) ] ), full.model.all.x.scaled.modeling.ess ) # note that the ESS values below are from an MCMC monitoring run # of length 100,000; if you used a shorter monitoring run, e.g., # of length 10,000, your ESS values will be 10 times smaller # than those in the table below # is bayes closer # MLE bayes to 0 than MLE? ESS # make -0.05355924 -0.0454105038 1 21561 # address -0.18424821 -0.1766225067 1 8988 # all 0.11757740 0.0860558929 1 17865 # . # . # . # crl.ave 1.17018695 0.3113693163 1 437 # crl.long 2.40335702 2.6548719278 0 2145 # crl.tot 0.35076674 0.2478465043 1 2117 options( scipen = 0 ) sum( abs( bayesian.logistic.regression.modeling.subset$mu.beta ) < abs( full.model.all.x.scaled.modeling$coefficients[ 2:( k + 1 ) ] ) ) # [1] 45 sum( abs( bayesian.logistic.regression.modeling.subset$mu.beta ) < abs( full.model.all.x.scaled.modeling$coefficients[ 2:( k + 1 ) ] ) ) / k # [1] 0.7894737 # let's see how accurate the bayesian p.hat values are # *internal* to the modeling data set (i.e., using the modeling subset # to predict the y values in the modeling subset) scaled.data.modeling.X <- subset( scaled.data.modeling, select = - spam ) p.hat.bayesian.logistic.regression.modeling.subset <- predict( bayesian.logistic.regression.modeling.subset, newdata = scaled.data.modeling.X, type = 'prob' ) par( mfrow = c( 2, 1 ) ) hist( p.hat.bayesian.logistic.regression.modeling.subset[ scaled.data.modeling$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.bayesian.logistic.regression.modeling.subset[ scaled.data.modeling$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.bayesian.logistic.regression.modeling.subset[ scaled.data.modeling$spam == 0 ] ), mean( p.hat.bayesian.logistic.regression.modeling.subset[ scaled.data.modeling$spam == 1 ] ), mean( p.hat.bayesian.logistic.regression.modeling.subset[ scaled.data.modeling$spam == 1 ] ) - mean( p.hat.bayesian.logistic.regression.modeling.subset[ scaled.data.modeling$spam == 0 ] ) ) # PSI # [1] 0.09061353 0.85969332 0.76907979 # now let's see how well the bayesian model, fit to the modeling # subset, does in predicting the spam status of the email messages # in the validation subset: p.hat.bayesian.logistic.regression.validation.subset <- predict( bayesian.logistic.regression.modeling.subset, newdata = scaled.data.validation.X, type = 'prob' ) par( mfrow = c( 2, 1 ) ) hist( p.hat.bayesian.logistic.regression.validation.subset[ scaled.data.validation$spam == 
0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.bayesian.logistic.regression.validation.subset[ scaled.data.validation$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.bayesian.logistic.regression.validation.subset[ scaled.data.validation$spam == 0 ] ), mean( p.hat.bayesian.logistic.regression.validation.subset[ scaled.data.validation$spam == 1 ] ), mean( p.hat.bayesian.logistic.regression.validation.subset[ scaled.data.validation$spam == 1 ] ) - mean( p.hat.bayesian.logistic.regression.validation.subset[ scaled.data.validation$spam == 0 ] ) ) # PSI # [1] 0.09834665 0.85003499 0.75168834 # one of the advantages of bayesian prediction of dichotomous outcomes # is that we get posterior distributions for each of the p.hat values, # so that we can look at the amount of uncertainty in the prediction # process system.time( p.hat.bayesian.modeling.intervals <- predict( bayesian.logistic.regression.modeling.subset, newdata = scaled.data.modeling.X, type = 'prob', bayes.avg = T ) ) # user system elapsed # 64.35 4.47 68.86 # here are the results for the first 10 email messages # in the modeling subset cbind( p.hat.bayesian.modeling.intervals[ 1:10, ], scaled.data.modeling$spam[ 1:10 ] ) # pred se(pred) CI 2.5% CI 97.5% # [1,] 0.9886944 0.004456867 0.9780111 0.9952436 1 # [2,] 0.8229983 0.039932794 0.7367634 0.8918002 1 # [3,] 0.8230338 0.039918916 0.7368092 0.8918023 1 # [4,] 0.6431008 0.099470669 0.4448321 0.8288990 1 # [5,] 0.7456441 0.077796560 0.5668872 0.8703213 1 # [6,] 0.6380242 0.101359240 0.4367421 0.8279391 1 # [7,] 0.8155320 0.089943487 0.5973087 0.9430495 1 # [8,] 0.5014264 0.042634727 0.4192702 0.5862896 1 # [9,] 0.9806469 0.008753362 0.9593688 0.9928564 1 # [10,] 0.5369823 0.051884474 0.4349436 0.6384813 1 # and here are the results for the last 10 email messages # in the modeling subset cbind( p.hat.bayesian.modeling.intervals[ ( n.modeling - 9 ):n.modeling, ], scaled.data.modeling$spam[ ( n.modeling - 9 ):n.modeling ] ) # pred se(pred) CI 2.5% CI 97.5% # [1,] 1.853997e-04 0.0003784888 2.414882e-06 1.088104e-03 0 # [2,] 1.980770e-02 0.0144276265 2.295455e-03 5.673050e-02 0 # [3,] 1.658516e-03 0.0030508096 1.769964e-05 9.659285e-03 0 # [4,] 4.889281e-06 0.0001053029 2.525120e-11 3.136042e-05 0 # [5,] 1.459612e-01 0.0411230216 7.610701e-02 2.357302e-01 0 # [6,] 5.591704e-02 0.0178632481 2.638937e-02 9.565168e-02 0 # [7,] 2.119996e-02 0.0191095306 1.932728e-03 7.280270e-02 0 # [8,] 1.687736e-02 0.0103388278 3.750901e-03 4.273397e-02 0 # [9,] 8.308764e-02 0.0253543768 3.910164e-02 1.380537e-01 0 # [10,] 2.892517e-02 0.0131386881 1.016472e-02 6.062356e-02 0 ################## the eighth block of code ends here #################### # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ################## the ninth block of code begins here ################### # let's see how much better the predictions are (or at least # how much better they *seem* to be) when we include not only # the intercept and all k = 57 predictors but also all ( k choose 2 ) # *two-way interactions* among the predictors # this is part I of a highly successful current modeling strategy: # (I) flood the model with predictors 1 + k + choose( k, 2 ) # [1] 1654 # this model has a total of 1654 predictors; let's fit it to the # modeling data (3065 email messages) system.time( full.model.all.x.scaled.all.interactions.modeling <- glm( spam ~ 
.^2, data = scaled.data.modeling, family = binomial( link = 'logit' ) ) ) # user system elapsed # 123.89 0.18 124.33 # Warning message: # glm.fit: fitted probabilities numerically 0 or 1 occurred print( paste( 'converged?', full.model.all.x.scaled.all.interactions.modeling$converged, 'boundary?', full.model.all.x.scaled.all.interactions.modeling$boundary ) ) # [1] "converged? TRUE boundary? FALSE" summary( full.model.all.x.scaled.all.interactions.modeling$coefficients ) # Min. 1st Qu. Median Mean 3rd Qu. Max. NA's # -3.748e+28 -7.171e+14 -2.486e+13 9.013e+24 6.399e+14 3.801e+28 98 # has logistic regression gone crazy? # here's a summary of the maximum likelihood beta.hat values # from the main-effects-only fit: summary( full.model.all.x.scaled.modeling$coefficients ) # Min. 1st Qu. Median Mean 3rd Qu. Max. # -68.84359 -0.34494 0.00067 -1.79510 0.30611 3.38348 # one way to see if logistic regression has gone crazy is to fit # an ols multiple linear regression model to the same data set and see # if its coefficients are also crazy-looking system.time( ols.full.model.all.x.scaled.all.interactions.modeling <- lm( spam ~ .^2, data = scaled.data.modeling ) ) # user system elapsed # 6.94 0.03 6.97 summary( ols.full.model.all.x.scaled.all.interactions.modeling$coefficients ) # Min. 1st Qu. Median Mean 3rd Qu. Max. NA's # -1.104e+12 0.000e+00 0.000e+00 1.651e+09 0.000e+00 3.655e+12 99 # these also look crazy # but let's look at the signal-to-noise ratios for the coefficients se.hat.beta.hat.ols.full.model.all.x.scaled.all.interactions.modeling <- sqrt( diag( vcov( ols.full.model.all.x.scaled.all.interactions.modeling ) ) ) z.ols.full.model.all.x.scaled.all.interactions.modeling <- ols.full.model.all.x.scaled.all.interactions.modeling$coefficients / se.hat.beta.hat.ols.full.model.all.x.scaled.all.interactions.modeling summary( z.ols.full.model.all.x.scaled.all.interactions.modeling ) # Min. 1st Qu. Median Mean 3rd Qu. Max. 
NA's # -5.80125 -0.69504 -0.05328 -0.06482 0.57694 4.85481 99 # these don't seem crazy at all # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # now let's look at the quality of the p.hat values, # from the logistic regression model fit to the *modeling* subset, applied to # predict the *modeling* subset p.hat.full.model.all.interactions.scaled.modeling <- predict( full.model.all.x.scaled.all.interactions.modeling, type = 'response' ) par( mfrow = c( 2, 1 ) ) hist( p.hat.full.model.all.interactions.scaled.modeling[ scaled.data.modeling$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.full.model.all.interactions.scaled.modeling[ scaled.data.modeling$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.full.model.all.interactions.scaled.modeling[ scaled.data.modeling$spam == 0 ] ), mean( p.hat.full.model.all.interactions.scaled.modeling[ scaled.data.modeling$spam == 1 ] ), mean( p.hat.full.model.all.interactions.scaled.modeling[ scaled.data.modeling$spam == 1 ] ) - mean( p.hat.full.model.all.interactions.scaled.modeling[ scaled.data.modeling$spam == 0 ] ) ) # PSI # [1] 0.01407688 0.98357964 0.96950276 # and now let's look at the quality of the p.hat values, # from the model fit to the *modeling* subset, applied to # predict the *validation* subset p.hat.full.model.all.interactions.scaled.validation <- predict( full.model.all.x.scaled.all.interactions.modeling, newdata = scaled.data.validation.X, type = 'response' ) # Warning message: # In predict.lm(object, newdata, se.fit, scale = 1, type = if (type == : # prediction from a rank-deficient fit may be misleading par( mfrow = c( 2, 1 ) ) hist( p.hat.full.model.all.interactions.scaled.validation[ scaled.data.validation$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.full.model.all.interactions.scaled.validation[ scaled.data.validation$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.full.model.all.interactions.scaled.validation[ scaled.data.validation$spam == 0 ] ), mean( p.hat.full.model.all.interactions.scaled.validation[ scaled.data.validation$spam == 1 ] ), mean( p.hat.full.model.all.interactions.scaled.validation[ scaled.data.validation$spam == 1 ] ) - mean( p.hat.full.model.all.interactions.scaled.validation[ scaled.data.validation$spam == 0 ] ) ) # PSI # [1] 0.2337938 0.8184874 0.5846936 # what do you conclude about the extent of the over-fitting # with the all-main-effects-and-all-2-way-interactions model? # how do you feel about part I of the modeling strategy described above # (flood the model with predictors) if we don't follow it with # some sort of part II? 
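# optional aside: the three-number summary we keep computing -- the mean
# p.hat among the ham ( y = 0 ) messages, the mean p.hat among the spam
# ( y = 1 ) messages, and their difference (labeled PSI in the output
# comments) -- can be wrapped in a small helper; the name 'psi.summary'
# below is invented here for convenience and is not part of the course code

psi.summary <- function( p.hat, y ) {

  mean.p.hat.ham  <- mean( p.hat[ y == 0 ] )
  mean.p.hat.spam <- mean( p.hat[ y == 1 ] )

  c( mean.p.hat.ham, mean.p.hat.spam, mean.p.hat.spam - mean.p.hat.ham )

}

# for example, the modeling-subset and validation-subset summaries above
# could be reproduced with

# psi.summary( p.hat.full.model.all.interactions.scaled.modeling,
#   scaled.data.modeling$spam )
# psi.summary( p.hat.full.model.all.interactions.scaled.validation,
#   scaled.data.validation$spam )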
##################### the ninth block of code ends here ####################### # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results ##################### the tenth block of code begins here ##################### # a frequently highly successful modeling strategy that's widely used # today is as follows: # (I) flood the model with predictors, and then # (II) regularize it by shrinking most of its coefficients # most or all of the way to 0, in a clever data-determined way # the CRAN package 'glmnet' does this extremely well # and with unbelievable computational efficiency # glmnet makes operational an idea called the *lasso* applied # to generalized linear models (GLMs, with logistic regression as a # special case) # in machine learning data science this would be called # *regularization via penalized maximum likelihood* # in bayesian data science it would be an instance of # bayesian fitting of GLMs with a prior that shrinks the # beta coefficients most or all of the way to 0 when the data evidence # for them belonging in the model is weak # the regularization penalty acts like a log prior term # in the following optimization: # { find the beta vector that minimizes the *penalized deviance* # - 2 * log( posterior ) = - 2 * log( likelihood ) - 2 * log( prior ) } # in other words, from a bayesian viewpoint the resulting beta.hat # values are the MAP (maximum a posteriori) estimates # a really nice tutorial on glmnet is available at # https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html # the main advantage of the lasso is that it does an excellent job of # preventing over-fitting of a model that's been flooded with predictors, # by serving as step (II) in the modeling strategy above # the main disadvantage is that it has many tuning options, which can lead # to markedly better and worse results; it requires patience to explore # all (or even many) of them # first let's see how well glmnet performs on the logistic regression # model that includes only the 57 main effects dynamic.require( 'glmnet' ) scaled.data.modeling.X <- model.matrix( spam ~ ., data = scaled.data.modeling ) spam.modeling <- raw.data$spam[ ! raw.data$testid ] spam.validation <- raw.data$spam[ raw.data$testid ] n.lambda <- 100 system.time( spam.glmnet.modeling <- glmnet( x = scaled.data.modeling.X, y = spam.modeling, family = 'binomial', alpha = 1, nlambda = n.lambda ) ) # user system elapsed # 2.48 0.00 2.56 # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # ok, let's begin examining the results print( spam.glmnet.modeling ) # Call: glmnet( x = scaled.data.modeling.X, y = spam.modeling, # family = 'binomial', alpha = 1, nlambda = n.lambda ) # Df %Dev Lambda # 1 0 0.00 0.196300 # 2 1 2.01 0.178900 # 3 1 3.66 0.163000 # 4 5 7.51 0.148500 # 5 5 11.36 0.135300 # . # . # . 
# 90 57 72.55 0.000050 # 91 57 72.57 0.000045 # 92 57 72.57 0.000041 # the tuning constant 'lambda' in the lasso controls the strength # of the regularization penalty # lambda is optimized when the percentage of the # penalized deviance from a null model with no predictors # 'explained' by the model stops increasing # here i told it to examine n.lambda = 100 values of lambda, # but it stopped after 92 values because the percentage of deviance # 'explained' stopped increasing # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # the next block of code compares the logistic regression coefficients # from three fitting methods used with the model containing all, # and only, main effects final.lambda.index.modeling <- 92 options( scipen = 100 ) cbind( full.model.all.x.scaled.modeling$coefficients[ 2:( k + 1 ) ], bayesian.logistic.regression.modeling.subset$mu.beta, coef( spam.glmnet.modeling, s = spam.glmnet.modeling$lambda[ final.lambda.index.modeling ] )[ 3:59 ] ) # logistic regression coefficients # MLE bayes lasso # make -0.05355924 -0.0453229075 -0.05163405 # address -0.18424821 -0.1778458973 -0.18751606 # all 0.11757740 0.0885063904 0.11427255 # X3d 3.38348273 1.8386863082 2.71297346 # our 0.26995411 0.2648551860 0.27173197 # . # . # . # hp -3.52061215 -3.6082669712 -3.45132980 # hpl -0.29979501 -0.2219481305 -0.32325034 # george -68.84358891 -49.4865995543 -50.77450023 <-- # X650 -0.06749844 -0.0703094172 -0.06806936 # lab -1.45852164 -0.9112353687 -1.29589733 # . # . # . # direct -0.06047450 -0.0125036076 -0.04062539 # cs -15.69455882 -2.8051143512 -4.67886981 <-- # meeting -1.96951438 -1.6506354557 -1.83238320 # . # . # . # crl.long 2.40335702 2.3422198151 2.32867257 # crl.tot 0.35076674 0.2563091373 0.31292510 options( scipen = 0 ) # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # the next function call creates a plot that shows # the coefficient shrinkage as a function of the strength # of the penalty (values to the left arise from the # strongest penalties) plot( spam.glmnet.modeling, lwd = 3 ) # the path of the coefficient for 'george' is shown in green # and starts at about 'coefficients = -50' at the bottom right # of the plot; you can see it get hammered all the way back to 0 # as your eye moves from right to left (i.e., as the penalty # increases) # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # let's see how well the lasso fit to the modeling subset # predicts the validation subset: scaled.data.validation.X <- model.matrix( spam ~ ., data = scaled.data.validation ) p.hat.spam.glmnet.validation <- predict( spam.glmnet.modeling, newx = scaled.data.validation.X, type = 'response', s = spam.glmnet.modeling$lambda[ final.lambda.index.modeling ] ) par( mfrow = c( 2, 1 ) ) hist( p.hat.spam.glmnet.validation[ scaled.data.validation$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.spam.glmnet.validation[ scaled.data.validation$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.spam.glmnet.validation[ scaled.data.validation$spam == 0 ] ), mean( p.hat.spam.glmnet.validation[ scaled.data.validation$spam == 1 ] ), mean( p.hat.spam.glmnet.validation[ scaled.data.validation$spam == 1 ] ) - mean( p.hat.spam.glmnet.validation[ scaled.data.validation$spam == 0 ] 
) )

# summary (all models use all main effects only)

#                                     PSI
# [1] 0.1006029  0.8513315  0.75072860   lasso logistic regression
# [1] 0.09061353 0.85969332 0.76907979   bayesian logistic regression
#                                          with the horseshoe prior
# [1] 0.09941087 0.85350183 0.75409096   ordinary maximum likelihood
#                                          logistic regression

# please stop here, ensure that everything so far has been
# completed successfully, and study the most recent results

# as our final look at the lasso, let's turn it loose
# on the model with all main effects and all 2-way interactions:

scaled.data.modeling.with.interactions.X <-
  model.matrix( spam ~ .^2, data = scaled.data.modeling )

n.lambda <- 100

system.time(
  spam.glmnet.modeling.with.interactions <- glmnet(
    x = scaled.data.modeling.with.interactions.X, y = spam.modeling,
    family = 'binomial', alpha = 1, nlambda = n.lambda )
)

#    user  system elapsed
#   13.36    0.05   13.40

# glmnet just fit a lasso logistic regression with 1654 predictors,
# 3065 subjects, and 100 different lambda values, in 13 seconds
# of clock time (!)

# please stop here, ensure that everything so far has been
# completed successfully, and study the most recent results

print( spam.glmnet.modeling.with.interactions )

# Call:  glmnet( x = scaled.data.modeling.with.interactions.X,
#   y = spam.modeling, family = 'binomial', alpha = 1, nlambda = n.lambda )

#      Df   %Dev   Lambda
# 1     0   0.00 0.196300
# 2     1   2.01 0.178900
# 3     1   3.66 0.163000
# .
# .
# .
# 98  467  96.77 0.000024
# 99  470  96.83 0.000022
# 100 483  96.92 0.000020

# i tried n.lambda = 200 but it found the same solution as with 100

# notice that, at the optimal lambda, the lasso retains only 483 of the
# 1654 predictors and 'explains' 96.9% of the null penalized deviance

# please stop here, ensure that everything so far has been
# completed successfully, and study the most recent results

plot( spam.glmnet.modeling.with.interactions, lwd = 3 )

# this plot is hard to interpret with so many predictors

# and finally for the lasso work, let's see how well the lasso
# fit to the modeling subset predicts the modeling and validation subsets

# this is the ultimate test of whether the over-fitting
# exhibited by vanilla maximum likelihood has been overcome
# by the lasso

# first, how well does the lasso fit to the modeling subset
# predict the modeling subset?
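# (before answering that, a quick optional sanity check on the
#  '1654 predictors' count quoted earlier: 'spam ~ .^2' expands the
#  57 predictors into 57 main-effect columns plus choose( 57, 2 ) = 1596
#  two-way interaction columns, and model.matrix also adds an intercept
#  column, for 57 + 1596 + 1 = 1654 columns in all)

57 + choose( 57, 2 ) + 1

# [1] 1654

dim( scaled.data.modeling.with.interactions.X )

# this should come back as 3065 rows (the modeling subset) by 1654 columns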
scaled.data.modeling.with.interactions.X <- model.matrix( spam ~ .^2, data = scaled.data.modeling ) p.hat.spam.glmnet.modeling.with.interactions <- predict( spam.glmnet.modeling.with.interactions, newx = scaled.data.modeling.with.interactions.X, type = 'response', s = spam.glmnet.modeling.with.interactions$lambda[ 100 ] ) par( mfrow = c( 2, 1 ) ) hist( p.hat.spam.glmnet.modeling.with.interactions[ scaled.data.modeling$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.spam.glmnet.modeling.with.interactions[ scaled.data.modeling$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.spam.glmnet.modeling.with.interactions[ scaled.data.modeling$spam == 0 ] ), mean( p.hat.spam.glmnet.modeling.with.interactions[ scaled.data.modeling$spam == 1 ] ), mean( p.hat.spam.glmnet.modeling.with.interactions[ scaled.data.modeling$spam == 1 ] ) - mean( p.hat.spam.glmnet.modeling.with.interactions[ scaled.data.modeling$spam == 0 ] ) ) # [1] 0.01170338 0.98225276 0.97054938 lasso, modeling fit, modeling predicted # [1] 0.01407688 0.98357964 0.96950276 ordinary maximum likelihood, # modeling fit, modeling predicted # please stop here, ensure that everything so far has been # completed successfully, and study the most recent results # and now, how well does the lasso fit to the modeling subset # predict the validation subset? scaled.data.validation.with.interactions.X <- model.matrix( spam ~ .^2, data = scaled.data.validation ) p.hat.spam.glmnet.validation.with.interactions <- predict( spam.glmnet.modeling.with.interactions, newx = scaled.data.validation.with.interactions.X, type = 'response', s = spam.glmnet.modeling.with.interactions$lambda[ 100 ] ) par( mfrow = c( 2, 1 ) ) hist( p.hat.spam.glmnet.validation.with.interactions[ scaled.data.validation$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.spam.glmnet.validation.with.interactions[ scaled.data.validation$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.spam.glmnet.validation.with.interactions[ scaled.data.validation$spam == 0 ] ), mean( p.hat.spam.glmnet.validation.with.interactions[ scaled.data.validation$spam == 1 ] ), mean( p.hat.spam.glmnet.validation.with.interactions[ scaled.data.validation$spam == 1 ] ) - mean( p.hat.spam.glmnet.validation.with.interactions[ scaled.data.validation$spam == 0 ] ) ) # [1] 0.09390095 0.90410495 0.81020400 lasso, modeling fit, validation predicted # summary: all methods use all main effects and all 2-way interactions # PSI # [1] 0.01407688 0.98357964 0.96950276 ordinary maximum likelihood, modeling fit, # modeling predicted # [1] 0.2337938 0.8184874 0.5846936 ordinary maximum likelihood, # modeling fit, validation predicted # [1] 0.01170338 0.98225276 0.97054938 lasso, modeling fit, modeling predicted # [1] 0.09390095 0.90410495 0.81020400 lasso, modeling fit, validation predicted # does the lasso pass the overfitting test at an A+ level? 
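# one optional extra on the lasso work (a minimal sketch, not run as part
# of the analysis above, and the object names, the 10-fold choice and the
# set.seed value below are just illustrative): instead of taking the last
# lambda on the glmnet path, lambda is more commonly chosen by K-fold
# cross-validation *within the modeling subset*, using cv.glmnet from the
# same package; something like the following would do it (expect it to take
# roughly 10 times as long as the 13-second glmnet fit above)

set.seed( 1 )

spam.cv.glmnet.modeling.with.interactions <- cv.glmnet(
  x = scaled.data.modeling.with.interactions.X, y = spam.modeling,
  family = 'binomial', alpha = 1, nfolds = 10, type.measure = 'deviance' )

# plot of the cross-validated deviance as a function of log( lambda )

plot( spam.cv.glmnet.modeling.with.interactions )

# lambda.min minimizes the cross-validated deviance; lambda.1se is the
# largest (most strongly penalized) lambda within 1 standard error of that
# minimum, which typically gives a sparser model

c( spam.cv.glmnet.modeling.with.interactions$lambda.min,
   spam.cv.glmnet.modeling.with.interactions$lambda.1se )

# validation-subset predictions at the cross-validated lambda would then
# be obtained with, e.g.,

p.hat.spam.cv.glmnet.validation.with.interactions <- predict(
  spam.cv.glmnet.modeling.with.interactions,
  newx = scaled.data.validation.with.interactions.X,
  s = 'lambda.1se', type = 'response' )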
###################### the tenth block of code ends here ######################

# please stop here, ensure that everything so far has been
# completed successfully, and study the most recent results

################### the eleventh block of code begins here ####################

# and now, for dessert, here's a machine learning method that
# typically achieves *really good* out-of-sample predictions

# it's called *extreme gradient boosting*, a fast, heavily optimized
# implementation of gradient boosting (GB), available in the CRAN package
# 'xgboost' (the 'x' in 'xgboost' stands for 'extreme', a reference to the
# fact that 'xgboost' automatically uses all of the threads your CPU has
# to do really fast parallel computation)

# i'll explain the GB idea in office hours; it's a bit complicated

# the main strength of GB is that it creates (highly) accurate
# predictions for data not used in the fitting process, and it does so
# in a nonparametric way

# its main weakness is that it *really* is a black box

# here, since this is dessert, let's just see 'xgboost' in action:

dynamic.require( 'xgboost' )

spam.modeling <- raw.data$spam[ ! raw.data$testid ]
spam.validation <- raw.data$spam[ raw.data$testid ]

spam.modeling.xgb <- xgb.DMatrix(
  data = as.matrix( scaled.data.modeling )[ , 2:58 ], label = spam.modeling )

spam.validation.xgb <- xgb.DMatrix(
  data = as.matrix( scaled.data.validation )[ , 2:58 ], label = spam.validation )

spam.watchlist <- list( train = spam.modeling.xgb, test = spam.validation.xgb )

set.seed( 5 )

spam.cross.validated.xgboost.model <- xgb.train( data = spam.modeling.xgb,
  nrounds = 1000, watchlist = spam.watchlist, max_depth = 4, eta = 0.5,
  nthread = 8, subsample = 0.5, objective = 'binary:logistic', verbose = 1,
  early_stopping_rounds = 100, eval_metric = 'error' )

# [1]   train-error:0.092007  test-error:0.107422
# Multiple eval metrics are present. Will use test_error for early stopping.
# Will train until test_error hasn't improved in 100 rounds.
# [2]   train-error:0.085155  test-error:0.103516
# [3]   train-error:0.080914  test-error:0.093750
# [4]   train-error:0.071126  test-error:0.084635
# [5]   train-error:0.062643  test-error:0.068359
# .
# .
# .
# [133] train-error:0.000653  test-error:0.044922
# [134] train-error:0.000653  test-error:0.045573
# [135] train-error:0.000979  test-error:0.043620   <--
# [136] train-error:0.000653  test-error:0.046875
# [137] train-error:0.000653  test-error:0.047526
# [138] train-error:0.000653  test-error:0.046875
# [139] train-error:0.000326  test-error:0.046224
# [140] train-error:0.000653  test-error:0.047526
# .
# .
# .
# [234] train-error:0.000326  test-error:0.054036
# [235] train-error:0.000653  test-error:0.055339
# Stopping.
# Best iteration:
# [135] train-error:0.000979  test-error:0.043620

# here 'error' =
#   ( number of incorrectly classified email messages ) /
#     ( total number of email messages ),
# using the simple rule 'predict spam if p.hat > 0.5'

p.hat.spam.xgb.validation <- predict( spam.cross.validated.xgboost.model,
  newdata = spam.validation.xgb )

par( mfrow = c( 2, 1 ) )

hist( p.hat.spam.xgb.validation[ scaled.data.validation$spam == 0 ],
  nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) )

hist( p.hat.spam.xgb.validation[ scaled.data.validation$spam == 1 ],
  nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) )

par( mfrow = c( 1, 1 ) )

c( mean( p.hat.spam.xgb.validation[ scaled.data.validation$spam == 0 ] ),
  mean( p.hat.spam.xgb.validation[ scaled.data.validation$spam == 1 ] ),
  mean( p.hat.spam.xgb.validation[ scaled.data.validation$spam == 1 ] ) -
    mean( p.hat.spam.xgb.validation[ scaled.data.validation$spam == 0 ] ) )

#                                     PSI
# [1] 0.04916793 0.91758378 0.86841585   xgboost (no need to specify whether
#                                          only main effects are used or whether
#                                          interactions are included),
#                                          validation predicted without being used
# [1] 0.09390095 0.90410495 0.81020400   lasso, modeling fit (main effects
#                                          plus all 2-way interactions),
#                                          validation predicted without being used

#################### the eleventh block of code ends here #####################

# there's lots more to do with this case study,
# but let's end here for now

###############################################################################
###############################################################################
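# p.s. one optional extra check (a minimal sketch, not part of the analysis
# above; the object name 'y.hat.spam.xgb.validation' is just illustrative):
# the 'error' metric printed by xgb.train can be reproduced by hand on the
# validation subset, by thresholding the estimated probabilities at 0.5 and
# counting mistakes

y.hat.spam.xgb.validation <- as.numeric( p.hat.spam.xgb.validation > 0.5 )

mean( y.hat.spam.xgb.validation != scaled.data.validation$spam )

# depending on the xgboost version, 'predict' may use either all 235 boosting
# rounds or only the best 135 of them, so this should come out close to either
# the final test-error (about 0.055) or the best test-error (about 0.0436)
# reported above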