##################################################################################
# stat 206: data science case study 1: the kidney function data
# from Efron and Hastie (2016)
##################################################################################

# recall from the R tutorial that the comment character in R
# is the hash symbol: #

# notational convention in what follows: when i mention
# an object name, such as 'age', in the comments below,
# i'll surround it with single quotes; this does not mean
# that the object 'age' is in character-string format,
# it's just a way of mentioning object names in this data science session

##################################################################################

# thinking of the 7 pillars of statistical data science,
# this data set has already been obtained, so

#   task 1: probability calculations
#   task 2: design of data-gathering activities

# don't seem to be (immediately) relevant here,
# but some design ideas will arise in what follows,
# and if we draw any inferential conclusions
# they'll be based on probability calculations

##################################################################################

# (1) preliminaries

# (A) make a directory for this data analysis; if you already have
#     a directory called, e.g., STAT-206-Winter-2021, you could
#     make the new directory inside that one and call it (say) Kidney-Function

# (B) download, into the Kidney-Function directory, the file

#       kidney.txt

#     from the Drupal course web site

# (C) examine the structure of the file 'kidney.txt';
#     you'll see that it has a header line with the two character strings

#       "age" "tot"

#     these are the variable names in this data set;
#     the remaining rows contain the values of the variables on n subjects,
#     where n is at the moment not yet known

# (D) get contextual information about these variables
#     from the Efron-Hastie website: go to

#       https://web.stanford.edu/~hastie/CASI/

#     click on the 'Data' option at the top of the page, then
#     click on 'Kidney_function_Figure_1.1'; the result is

#       Kidney fitness data of Figure 1.1
#       Measurements on 157 healthy volunteers (potential donors)
#       These data originated in the nephrology
#       laboratory of Dr. Brian Myers, Stanford University
#       age: of volunteer
#       tot: composite measure of overall kidney function

# a good data scientist NEVER begins analyzing a data set
# before making a full assessment of the context:
# who are the subjects? have they been drawn in an
# (approximately) at-random manner from some population (if so, which one?)?
# what are the precise definitions of all of the variables
# measured on the subjects?

##################################################################################

# (2) start an R session

# --> change directory to what above was called 'Kidney-Function';
#     read in the raw data set;
#     and get to know its contents

# at the beginning of this session, there's nothing in
# the current R workspace:

ls( )

# character(0)

# there are many built-in functions in R to import data from outside R;
# here, because the data set in 'kidney.txt' is already nicely formatted
# into ( n + 1 ) rows and 2 columns, with the first row specifying
# the variable names, i can use the function 'read.csv', telling it
# that spaces are used to separate the variable values on each line
# with the option " sep = ' ' " and telling it that the first line
# gives the variable names as character strings with the option
# 'header = T':

raw.kidney.data <- read.csv( 'kidney.txt', sep = ' ', header = T )

ls( )

# [1] "raw.kidney.data"

# the FIRST thing you should always do when looking at somebody else's
# data set is to use the built-in function 'str' (for 'structure')
# to see what you've imported into R and in what format R has received
# the external data set:

str( raw.kidney.data )

# 'data.frame': 157 obs. of 2 variables:
#  $ age: int 18 19 19 20 21 21 21 22 22 22 ...
#  $ tot: num 2.44 3.86 -1.22 2.3 0.98 -0.5 2.74 -0.12 -1.21 0.99 ...
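# (an optional extra pair of checks, beyond what 'str' reports:
# 'dim' returns the ( rows, columns ) shape of the data frame,
# and 'head' prints its first 6 rows, so that you can eyeball
# the imported values directly)

dim( raw.kidney.data )

# [1] 157   2

head( raw.kidney.data )

#   age   tot
# 1  18  2.44
# 2  19  3.86
# 3  19 -1.22
# 4  20  2.30
# 5  21  0.98
# 6  21 -0.50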
# a 'data frame' in R is an extremely useful generalization of a matrix
# in math: it has rows and columns, as a matrix does, but the entries
# don't have to be numeric -- the columns can be a mixture of any and all
# of R's variable types

# here R has detected 157 rows (obs.) and 2 columns (variables);
# Efron and Hastie told us that there should be 157 rows and 2 columns
# of data, and indeed there are

# that's an important check to make when receiving somebody else's data:
# always ask them how many rows and how many columns you should expect
# to get, and ensure that you got the right shape of the data set

# the variable 'age' has been stored in R as 'int' (integer) format,
# because R has detected that all of its values are integers;
# we can do things to 'age' just as if it were a floating-point
# real number ('num'), and nothing should go wrong

# the variable 'tot' has been stored in R as 'num'
# (numeric: double-precision floating-point) format;
# it appears that all of the numbers have been measured with
# significant figures in the format +/- x.xx

# we can access the individual variables in 'raw.kidney.data'
# with the '$' operator in R: you name the data frame,
# follow that with a $ sign, and then name the variable:

age <- raw.kidney.data$age

ls( )

# [1] "age"             "raw.kidney.data"

# now we have two separate objects in the current workspace:
# 'age' has been extracted from the data frame and stands on its own:

str( age )

# int [1:157] 18 19 19 20 21 21 21 22 22 22 ...
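# a quick aside (this toy object is mine, not part of the kidney data set):
# to see the earlier point that data-frame columns can be of mixed types,
# build a small data frame with integer, character, and logical columns
# and look at its structure:

toy <- data.frame( id = 1:3, name = c( 'a', 'b', 'c' ), smoker = c( T, F, T ) )

str( toy )

# 'data.frame': 3 obs. of 3 variables:
#  $ id    : int 1 2 3
#  $ name  : chr "a" "b" "c"
#  $ smoker: logi TRUE FALSE TRUE

# (in R versions before 4.0, 'name' would have been stored as a Factor
# by default, unless you added the option 'stringsAsFactors = F')

rm( toy )   # remove the toy object, to keep the workspace tidy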
# this is also an opportunity to rename any variables with imported names
# that are not fully descriptive:

age

#   [1] 18 19 19 20 21 21 21 22 22 22 22 22 22 23 23 23 23 23 23 23 23 23 23 23 24 24 24 24 24 24 24
#  [32] 24 24 24 24 24 25 25 25 25 25 25 25 25 25 25 26 26 26 26 26 26 26 27 27 27 27 27 27 27 27 27
#  [63] 27 27 27 28 28 28 28 29 29 29 29 29 30 30 30 30 31 31 31 31 31 31 31 32 32 32 32 32 32 32 33
#  [94] 33 33 33 33 33 34 34 34 34 35 35 36 36 36 36 36 37 38 38 39 41 41 41 42 43 44 45 45 46 47 48
# [125] 51 53 54 54 54 54 55 56 57 57 60 60 60 61 61 62 62 62 63 63 65 65 68 69 70 71 72 73 73 74 80
# [156] 82 88

# compare with the file 'kidney.txt': the first and last age values match

kidney.function <- raw.kidney.data$tot

ls( )

# [1] "age"             "kidney.function" "raw.kidney.data"

str( kidney.function )

# num [1:157] 2.44 3.86 -1.22 2.3 0.98 -0.5 2.74 -0.12 -1.21 0.99 ...

kidney.function

#   [1]  2.44  3.86 -1.22  2.30  0.98 -0.50  2.74 -0.12 -1.21  0.99  2.07  1.55  5.19 -0.38  1.62
#  [16]  0.20  0.15 -1.28  0.78  2.94 -2.12  2.46  4.63  1.64 -0.28  0.67 -0.43 -2.10 -0.82  3.92
#  [31]  1.63  1.24 -0.66  0.15  3.17 -0.58  2.69 -0.24  1.74  5.07 -3.30  1.51  3.93 -0.87  0.38
#  [46] -0.47  2.33 -0.05 -1.10  2.07 -1.35 -2.45 -1.39  0.59  1.79  1.27  0.41 -1.57  0.83  1.76
#  [61] -1.44  2.55  1.81  2.82  1.37 -1.37  1.27  1.22 -0.05 -1.09  3.83  0.89  1.19 -0.07  1.52
#  [76]  0.56  0.05 -0.12  1.69 -0.84  0.60 -0.60  0.05  1.31 -1.80  1.80  2.38  2.86  4.53  1.32
#  [91]  0.27 -2.29  0.30  1.99 -1.04  1.44  1.58  1.38  0.44  1.15 -0.09  0.55 -0.91  1.68 -1.41
# [106] -1.47  0.33 -1.94  3.65 -0.87 -0.97  1.04 -1.55  1.87 -4.00 -0.16 -2.13 -0.86 -0.93 -0.53
# [121] -0.60 -1.62 -3.81  0.11 -0.56  3.22 -1.87  2.56 -1.25 -2.95 -0.01 -2.90 -1.31 -4.89 -0.10
# [136] -2.81 -1.72 -3.60 -0.51 -0.50 -3.12 -4.46 -3.56  0.88 -1.56 -6.45 -3.10 -1.62  1.01 -3.77
# [151] -2.53 -2.45 -0.33 -5.73 -5.14 -2.08 -4.86

##################################################################################

# (3) now let's perform the *data curation* step

# data curation (also called 'data cleaning') is intended
# to fight against the truism

#   garbage in, garbage out

# data curation is the process of carefully and accurately preparing the data
# for the interesting bit (analysis); with big, messy data sets,
# this step can take (say) 75-90% of the entire time spent
# working on a data analysis (!)

# in this step we do things including the following:

# * if possible, identify the *units of measurement* of each variable
# * investigate the missing-data situation: do any of our variables
#   have missing values, and if so what should we do about it?
# * perform sanity checks on the values of all of the variables
# * ... (i'll think of more of these items as we go along)

# from the discussion above, 'age' is evidently measured in years

# first let's create a new object, 'n', which keeps track of
# the number of observations:

print( n <- length( age ) )

# [1] 157

# this command illustrates the process in R in which
# you can nest function calls and assignments inside other function calls
# and assignments; here R first computes the length of the age vector,
# notices that no object named 'n' exists in the current workspace,
# creates 'n' (based on the data type of 'length( age )'),
# assigns 'length( age )' to 'n', and prints the result
# to the R Console window

# R allows a large number of nestings in a single command,
# but i don't recommend going any deeper than about 3;
# the deeper you go, the easier it is to make a mistake

str( n )

# int 157

# R knows that the length of a vector always has to be an integer,
# so the data type of 'length( age )' is 'int',
# and 'n' inherits this data type during the assignment operation

# you can find out about the abundance of missing data in a variable
# with the built-in function 'is.na' (recall that NA [ not available ]
# is R's missing-data code):

sum( is.na( age ) )

# [1] 0

# this sequence of function calls (a) creates a logical variable of length
# n = 157, with each person recorded as T (missing age value) or F
# (not missing age value), (b) converts T to 1 and F to 0,
# and (c) adds up the results

# in this case we get the much-to-be-desired answer that 'age'
# has no missing values

sum( is.na( kidney.function ) )

# [1] 0

# the built-in function 'summary' gives useful data-curation information:

summary( age )

#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 18.00   25.00   31.00   36.39   43.00   88.00

# ok, this passes at least one sanity check:
# the ages of the people in this study range from 18 to 88

# note that, as usual, age is measured by recording
# the largest integer less than or equal to a continuous version
# of (time in years since birth): for example, the age of a
# 7-month-old baby would be recorded as 0, which in R would be

floor( 7 / 12 )

# [1] 0

# i often create a small helper function to add a bit of information
# to 'summary', by including the standard deviation of the variable:

slightly.better.summary <- function( y ) {

  return( c( summary( y ), sd( y ) ) )

}

slightly.better.summary( age )

#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 18.0000 25.0000 31.0000 36.3949 43.0000 88.0000 15.9232

# with a bit more work i could get the output to look like

#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.      SD
# 18.0000 25.0000 31.0000 36.3949 43.0000 88.0000 15.9232

# but let's move on

slightly.better.summary( kidney.function )

#          Min.       1st Qu.        Median          Mean       3rd Qu.          Max.
# -6.4500000000 -1.3500000000 -0.0500000000 -0.0001910828  1.5200000000  5.1900000000  2.1883422104

# this is way too many significant figures, so i can improve
# my function as follows:

slightly.better.summary <- function( y ) {

  return( round( c( summary( y ), sd( y ) ), 4 ) )

}

slightly.better.summary( kidney.function )

#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# -6.4500 -1.3500 -0.0500 -0.0002  1.5200  5.1900  2.1883

# question: what has somebody evidently done with the
# original kidney-function scale before it came to us?
# answer: transformed it by subtracting the data mean
# from all the data values before giving the variable to us

# two basic principles in data science:

#   WE ALWAYS WANT TO BE GIVEN THE RAW DATA VALUES
#   EXACTLY AS THEY WERE MEASURED, AND

#   WE ALWAYS WANT TO KNOW WHAT THE UNITS OF MEASUREMENT ARE;

#   OTHERWISE WE CAN'T PERFORM COMPLETELY EFFECTIVE SANITY CHECKS
#   ON THE VARIABLE'S OBSERVED VALUES

# of course, we can't always get what we want

# question: in this case, has the data transformation
# prior to our receiving the kidney.function variable destroyed
# anything we care about?

# answer: well, it has kept us from performing full sanity checks,
# but since it's just a horizontal shift of the distribution
# it hasn't changed the shape, and it hasn't affected the
# strength of (linear) relationship between age and kidney function

##################################################################################

# (4) now let's perform the *description of existing data set* step

# there are two types of descriptive measures: graphical and numerical

# we already got useful numerical summaries of both variables
# from the summary function calls above:

# age:

#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.      SD
# 18.0000 25.0000 31.0000 36.3949 43.0000 88.0000 15.9232

# kidney function:

#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.      SD
# -6.4500 -1.3500 -0.0500 -0.0002  1.5200  5.1900  2.1883

# question: what's a good graphical summary of a single numerical variable?
# answer:

hist( age )

# R has several built-in algorithms, based on theoretical calculations,
# which implement slightly different versions of an optimal number
# of histogram bars; the optimal number naturally depends on n
# (as n increases, it goes up, (interestingly) at an n^( 1 / 3 ) rate)

# this plot could be improved in at least two ways:
# the main title is redundant, and it's a good idea to always plot
# your histograms on the density scale:

hist( age, prob = T, main = '' )

# it can also be nice to add color:

hist( age, prob = T, main = '', col = 'darkcyan' )

# but we don't want to go crazy with that option;
# a graph with too many colors is a bad graph

# we could instead look at distributional shape with the built-in functions
# 'plot' and 'density':

plot( density( age ) )

# the R built-in function 'density' implements various versions
# of a method called *kernel density estimation*

str( density( age ) )

# List of 7
#  $ x        : num [1:512] 4.81 5 5.18 5.37 5.56 ...
#  $ y        : num [1:512] 2.02e-05 2.32e-05 2.68e-05 3.09e-05 3.55e-05 ...
#  $ bw       : num 4.4
#  $ n        : int 157
#  $ call     : language density.default(x = age)
#  $ data.name: chr "age"
#  $ has.na   : logi FALSE
#  - attr(*, "class")= chr "density"

# this plot could be improved in at least three ways:
# the main title and horizontal axis label are awful, and
# i like thicker curves (change the 'lwd' option to 'plot')
# than those provided by the default thickness in R:

plot( density( age ), xlab = 'age', main = '', lwd = 3, col = 'red' )

# we can easily compare the two visualization methods:
# the first command in the following code block
# treats the plotting window as a matrix of plots with rows
# and columns, in the order ( rows, columns ), with default
# ( 1, 1 ), and changes the default to 2 rows and 1 column;
# the last command in this code block returns the matrix
# to ( 1, 1 ) (*always* remember to do this; otherwise,
# the next time you invoke 'plot' in your continuing R session,
# the ( rows, columns ) setting will be whatever was useful
# to you previously, which will usually not be what you want later):

par( mfrow = c( 2, 1 ) )
hist( age, prob = T, main = '', col = 'darkcyan' )
plot( density( age ), xlab = 'age', main = '', lwd = 3, col = 'red' )
par( mfrow = c( 1, 1 ) )

# now you see the advantage of plotting the histogram on the density scale

# however, this plot is not yet really good:
# the horizontal and vertical axes have different limits;
# when comparing two or more graphs, you *always* want
# to make them completely comparable by forcing them
# to have the same horizontal and vertical scales

# this can be done with the 'xlim' and 'ylim' options to 'plot':

par( mfrow = c( 2, 1 ) )
hist( age, prob = T, main = '', col = 'darkcyan',
  xlim = c( 0, 100 ), ylim = c( 0, 0.05 ) )
plot( density( age ), xlab = 'age', main = '', lwd = 3, col = 'red',
  xlim = c( 0, 100 ), ylim = c( 0, 0.05 ) )
par( mfrow = c( 1, 1 ) )

# the two distributional-shape graphical summaries
# give comparable information, although the output of 'density'
# makes it look like there are data values below 10 and above 90,
# and the leftmost and rightmost bars in the histogram are
# ambiguous about the smallest and largest data values

# we can "fix" this in the density plot
# with the 'from' and 'to' options to 'density':

par( mfrow = c( 2, 1 ) )
hist( age, prob = T, main = '', col = 'darkcyan',
  xlim = c( 0, 100 ), ylim = c( 0, 0.05 ) )
plot( density( age, from = min( age ), to = max( age ) ), xlab = 'age',
  main = '', lwd = 3, col = 'red', xlim = c( 0, 100 ), ylim = c( 0, 0.05 ) )
par( mfrow = c( 1, 1 ) )

# the result is ugly; let's try again

# here's a new idea: make the plot without the 'from' and 'to' options,
# but add vertical lines at min( age ) and max( age ) to both plots
# with 'abline' function calls to show the actual data range
# (the 'lty' option changes the line type (solid, dashed, dotted, ...);
# there are 6 standard line types in R, plus custom dash patterns):

par( mfrow = c( 2, 1 ) )
hist( age, prob = T, main = '', col = 'darkcyan',
  xlim = c( 0, 100 ), ylim = c( 0, 0.05 ) )
abline( v = min( age ), lty = 2, lwd = 2, col = 'blue' )
abline( v = max( age ), lty = 2, lwd = 2, col = 'blue' )
plot( density( age ), xlab = 'age', main = '', lwd = 3, col = 'red',
  xlim = c( 0, 100 ), ylim = c( 0, 0.05 ) )
abline( v = min( age ), lty = 2, lwd = 2, col = 'blue' )
abline( v = max( age ), lty = 2, lwd = 2, col = 'blue' )
par( mfrow = c( 1, 1 ) )

# for me this is a successful plot

# i can now publish this picture to myself with the
# built-in function calls 'pdf' and 'dev.off':

pdf( 'decent-plot-of-age.pdf' )
par( mfrow = c( 2, 1 ) )
hist( age, prob = T, main = '', col = 'darkcyan',
  xlim = c( 0, 100 ), ylim = c( 0, 0.05 ) )
abline( v = min( age ), lty = 2, lwd = 2, col = 'blue' )
abline( v = max( age ), lty = 2, lwd = 2, col = 'blue' )
plot( density( age ), xlab = 'age', main = '', lwd = 3, col = 'red',
  xlim = c( 0, 100 ), ylim = c( 0, 0.05 ) )
abline( v = min( age ), lty = 2, lwd = 2, col = 'blue' )
abline( v = max( age ), lty = 2, lwd = 2, col = 'blue' )
par( mfrow = c( 1, 1 ) )
dev.off( )

# you can get a ton of information about the options to 'plot'
# with the following function call ('par' stands for *graphical parameters*):

help( par )

# another way to visualize this information is to go with
# ( rows, columns ) = ( 1, 2 ) instead of ( 2, 1 ):

par( mfrow = c( 1, 2 ) )
hist( age, prob = T, main = '', col = 'darkcyan',
  xlim = c( 0, 100 ), ylim = c( 0, 0.05 ) )
abline( v = min( age ), lty = 2, lwd = 2, col = 'blue' )
abline( v = max( age ), lty = 2, lwd = 2, col = 'blue' )
plot( density( age ), xlab = 'age', main = '', lwd = 3, col = 'red',
  xlim = c( 0, 100 ), ylim = c( 0, 0.05 ) )
abline( v = min( age ), lty = 2, lwd = 2, col = 'blue' )
abline( v = max( age ), lty = 2, lwd = 2, col = 'blue' )
par( mfrow = c( 1, 1 ) )

# but to my eye this makes it harder to compare the two plots
# than with ( 2, 1 )

# we can make an even more succinct version of these excellent plots
# with the built-in function 'lines':

hist( age, prob = T, main = '', col = 'darkcyan',
  xlim = c( 0, 100 ), ylim = c( 0, 0.05 ) )
abline( v = min( age ), lty = 2, lwd = 2, col = 'blue' )
abline( v = max( age ), lty = 2, lwd = 2, col = 'blue' )
lines( density( age ), lwd = 3, col = 'red' )

# this is the best plot of all

# you can see that making successful data science graphs is non-trivial

# another approach to making good graphs,
# which you might prefer, is to go into
# hadley wickham's 'tidyverse' and use 'ggplot'
# or one of its variants; isabelle will demonstrate that
# in the afternoon discussion sections

# what conclusions do you draw from this plot
# about the distributional shape of 'age' in this data set?
# positive skew (long right-hand tail);
# looks a lot like a mixture of two Gaussian distributions

# the morning discussion section on 20 jan 2021 ended above this line

# now that we have this code block, we can simply run it
# with 'age' replaced by 'kidney.function' and with
# the x and y axis limits removed:

hist( kidney.function, prob = T, main = '', col = 'darkcyan' )
abline( v = min( kidney.function ), lty = 2, lwd = 2, col = 'blue' )
abline( v = max( kidney.function ), lty = 2, lwd = 2, col = 'blue' )
lines( density( kidney.function ), lwd = 3, col = 'red' )

# what conclusions do you draw from this plot
# about the distributional shape of 'kidney.function' in this data set?

#

#

##################################################################################
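# postscript: earlier i said that, with a bit more work, the output of
# 'slightly.better.summary' could carry the label 'SD' on its last entry;
# here's one way to do that (a sketch: the appended standard deviation
# arrives from 'c' with an empty name, so i just overwrite that name):

slightly.better.summary <- function( y ) {

  result <- round( c( summary( y ), sd( y ) ), 4 )

  names( result )[ length( result ) ] <- 'SD'

  return( result )

}

slightly.better.summary( age )

#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.      SD
# 18.0000 25.0000 31.0000 36.3949 43.0000 88.0000 15.9232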