#################################################################################################
#################################################################################################

# using R for a time series interlude, to support mc and mcmc methods

# dd ( 26 feb 2021 )

##### preliminaries:

# (1) unbuffer the output (if relevant to your R environment)

# (2) set the working directory to the place where you'd like
#     PDF versions of plots to be stored, and where any data files
#     that you want to import into R are stored

#     on my home desktop this is

# setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2021/Time-Series' )

# (3) ensure that we start with a completely empty R workspace:

rm( list = ls( ) )

#################################################################################

##### continuing the monte carlo (mc) work in the quiz 2 case study

# recall that we made M = 1000000 IID draws from the posterior distribution
# for theta using the built-in function 'rbeta'

# here's the first 10 rows of the monte carlo data set again:

#            theta p
#  [1,] 0.9152385 1
#  [2,] 0.9101900 1
#  [3,] 0.8844570 0
#  [4,] 0.8930725 0
#  [5,] 0.8916875 0
#  [6,] 0.9044753 1
#  [7,] 0.9027067 1
#  [8,] 0.9032155 1
#  [9,] 0.9228386 1
# [10,] 0.8964174 0

# here the 'theta' column records the IID monte carlo draws 'theta.star'
# from the posterior of interest, and 'p' is the binary column
# that allows us to estimate P ( theta > 0.9 | y [PM] [script B] )

# the row index 1-10 keeps track of which draw is which;
# in other words, the row index summarizes the *iteration number*
# of the monte carlo algorithm
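# (aside) here's a minimal sketch of how a monte carlo data set like the one
# above could be built; this is *not* the original quiz 2 code (that appears
# near the end of this file) -- it just assumes the same Beta( 831, 92 )
# posterior that arises there from s = 830, n = 921 and a Uniform( 0, 1 ) prior:

M.sketch <- 10000                                   # a small M, purely for illustration

set.seed( 1 )

theta.star.sketch <- rbeta( M.sketch, 831, 92 )     # IID draws from the posterior

p.sketch <- as.numeric( theta.star.sketch > 0.9 )   # binary column: 1 if theta.star > 0.9

mc.data.set.sketch <- cbind( theta = theta.star.sketch, p = p.sketch )

print( mc.data.set.sketch[ 1:10, ] )                # first 10 rows, analogous to the table above

print( mean( p.sketch ) )                           # mc estimate of P ( theta > 0.9 | y [PM] [script B] )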
# it's useful to index the rows with the symbol 't', standing for 'time';
# having done that, it's immediately clear that
# the numbers in the 'theta' column are realizations of a *stochastic process*,
# namely a set of random variables indexed by an *index set* T:

#   { theta.star_t, t in T }, with T = { 0, 1, 2, ..., M }

# (theta.star_t stands for what on paper i would write as
# theta with a superscript * and a subscript t)

# stochastic processes indexed by time are called 'time series';
# in studying the behavior of monte carlo algorithms
# we need to look at some ideas from the (vast) time series literature

# the index set T can either be *discrete*, as it is here -- { 0, 1, 2, ..., M } --
# or *continuous* (example: *high-frequency* stock market trades
# currently occur every microsecond (one millionth of a second) on the world's
# major stock exchanges; since the time gap between trades is so small,
# people use time series models with continuous index sets
# (e.g., the interval on the real number line T = [ 0, T.max ])
# to model high-frequency trading)

# all we need in 206 to understand monte carlo and markov chain monte carlo
# algorithms is the discrete index set T = { 0, 1, 2, ..., M },
# in which ( t in T ) = 0, 1, ... is the *iteration number*
# of the iterative algorithm

# this is lucky because the math with continuous T is a lot harder

# simple discrete-time time series models look like this:

#   theta.star_{ t + 1 } = beta.0 +
#     g( theta.star_t, theta.star_{ t - 1 }, ..., theta.star_0 ) + e_t ,      (*)

# in which the { e_t } are modeled as IID draws from some distribution
# with mean 0 AND INDEPENDENT OF THE { theta.star_t }

# theta.star_0 is called the 'initial (or starting) value' of the algorithm

# model (*) above makes operational the idea that processes unfolding in time
# often exhibit *time dependence*, in which it's useful to use the earlier values

#   theta.star_t, theta.star_{ t - 1 }, ..., theta.star_0

# to predict theta.star_{ t + 1 }

# (example: the daily maximum temperature
# in santa cruz on day ( t + 1 ) 'depends' on the daily maximum temperature
# at the same monitoring station on day t (the day before), because
# weather patterns are often slowly-moving, and what's called
# a *persistence forecast* (today will be a lot like yesterday) can do
# surprisingly well)

# this corresponds to the simple model

#   theta.star_{ t + 1 } = beta.0 + g( theta.star_t ) + e_t
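# (aside) a minimal sketch, with a simulated slowly-moving series standing in
# for a real temperature record, of why a persistence forecast can do
# surprisingly well; the series and the settings below are invented purely
# for illustration:

set.seed( 2 )

n.sim <- 365

y.sim <- 60 + cumsum( rnorm( n.sim, 0, 1 ) )   # a slowly-drifting series wandering around 60

persistence.predictions <- y.sim[ -n.sim ]     # predict each day by the previous day's value

overall.mean.prediction <- mean( y.sim )       # naive competitor: predict every day by the overall mean

actual.values <- y.sim[ -1 ]

print( mean( abs( actual.values - persistence.predictions ) ) )   # mean absolute error, persistence

print( mean( abs( actual.values - overall.mean.prediction ) ) )   # mean absolute error, overall mean

# for a slowly-moving series like this one, the persistence errors
# are typically much smaller than the overall-mean errors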
#################################################################################

# let's look at some recent temperature data from the santa cruz area
# through the lens of time series

##### the 7 pillars of statistical data science

##### (1) probability calculations

# here this involves remembering how covariances and correlations
# between pairs of random variables work

##### (2) design of data-gathering activities

# this has already been done with the data set below,
# but we need to completely understand the design to properly interpret the data set

##### (3) data curation

# get the data into R and perform sanity checks on it

# i used the search string 'data on daily maximum temperatures' in chrome
# and immediately got directed to the website

#   https://www.ncdc.noaa.gov/cdo-web/datatools/records

# maintained by the *national oceanic and atmospheric administration* (NOAA)

# i chose 'view selected records' and the following inputs:

#   timescale: daily records
#   parameter: highest max temperature
#   date range: 2020
#   record type: all
#   location category: county
#   state: california
#   county: santa cruz county, CA

# i then clicked on 'SHOW RECORDS'

# after a few more clicks, i got the website
# to send me an email message with the data set i wanted
# as a .csv file attachment (the cost of this 'data product' was $0)

# i downloaded this file to the current directory in my R session

# i opened the .csv file to see what it contained

# it's a rectangular data set (as usual) with 796 rows and 6 columns

# the first row is a *header line*, which gives *data dictionary* information
# about the variables (columns) in the .csv file

# the remaining 795 rows contain data on days
# from 1 jan 2020 to 24 feb 2021
# (as we'll see below, these rows actually come from two different NOAA stations)

# the header line says

#   STATION NAME DATE TAVG TMAX TMIN

# scrolling down reveals that there isn't any data for santa cruz (!)
# but that we have what looks like it may be good data from
# the ben lomond NOAA station

# the NOAA website said that hourly temperature measurements
# were taken:

#   TAVG = the mean of the 24 hourly temperatures for the given day
#   TMAX = the maximum of the 24 hourly temperatures for the given day
#   TMIN = the minimum of the 24 hourly temperatures for the given day

# from examination of the .csv file external to R in the dreaded Windows program 'Excel',
# the ben lomond data starts in row 375 (not counting the header line)
# and runs through to the end in line 795 (not counting the header line)

# i read the data into R with the built-in function 'read.csv':

daily.temperature.data.in.zip.code.95060 <- read.csv(
  'C:/DD/Teaching/AMS-STAT-206/Winter-2021/Time-Series/temperature-data-zip-code-95060-2020-2021.csv',
  header = T )

# always do a call to the built-in function 'str' (structure)
# when you import something into R, to see what R did in trying to import the
# data from the external file

str( daily.temperature.data.in.zip.code.95060 )

# 'data.frame':  795 obs. of  6 variables:
#  $ STATION: chr  "US1CASZ0047" "US1CASZ0047" "US1CASZ0047" "US1CASZ0047" ...
#  $ NAME   : chr  "SANTA CRUZ 8.9 NW, CA US" "SANTA CRUZ 8.9 NW, CA US" "SANTA CRUZ 8.9 NW, CA US" "SANTA CRUZ 8.9 NW, CA US" ...
#  $ DATE   : chr  "1/4/2020" "1/9/2020" "1/11/2020" "1/12/2020" ...
#  $ TAVG   : int  NA NA NA NA NA NA NA NA NA NA ...
#  $ TMAX   : int  NA NA NA NA NA NA NA NA NA NA ...
#  $ TMIN   : int  NA NA NA NA NA NA NA NA NA NA ...

# the first ben lomond record:

daily.temperature.data.in.zip.code.95060[ 375, ]

#         STATION                         NAME     DATE TAVG TMAX TMIN
# 375 USR0000CBNL BEN LOMOND CALIFORNIA, CA US 1/1/2020   51   57   44

# i extract the entire ben lomond time series
# and put it in a stand-alone R object:

daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021 <-
  daily.temperature.data.in.zip.code.95060[ 375:795, 5 ]

print( n.ben.lomond.data <-
  length( daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021 ) )

# [1] 421

# this passes a sanity check: 366 days in 2020 (it was a leap year),
# 31 and 24 days in jan and feb 2021, respectively: 366 + 31 + 24 = 421

# check for missing values: here's a little helper function that does this task:

count.of.missing.values <- function( y ) {

  number.of.missing.values <- sum( is.na( y ) )

  print( paste( 'Number of missing values =', number.of.missing.values ) )

  return( number.of.missing.values )

}

number.of.missing.values.in.the.ben.lomond.time.series <-
  count.of.missing.values( daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021 )

# [1] "Number of missing values = 0"

##### end of the formal data curation step, although if we find
##### anomalies below we'll need to act in a data-curation manner to fix things
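# (aside) one more sanity check that could be added to the curation step above
# (this chunk is an addition, not part of the original analysis): verify that
# the 421 ben lomond rows really are consecutive calendar days, using the DATE
# column, which 'str' showed is stored as character strings such as "1/1/2020":

ben.lomond.dates <- as.Date( daily.temperature.data.in.zip.code.95060$DATE[ 375:795 ],
  format = '%m/%d/%Y' )

print( range( ben.lomond.dates ) )             # should be 2020-01-01 and 2021-02-24

print( all( diff( ben.lomond.dates ) == 1 ) )  # TRUE if every gap between successive rows is exactly 1 day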
##### (4) numerical and graphical summaries of an existing data set

c( summary( daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021 ),
  sd( daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021 ) )

#      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.        SD
#  37.00000  54.00000  65.00000  66.62470  80.00000 105.00000  14.73696

# the data dictionary forgot to tell us the units of measurement (!);
# we can infer from this summary that the temperature measurements were
# in degrees fahrenheit

# our usual graphical summary of a single column of quantitative values:

hist( daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021, prob = T,
  xlab = 'Daily Maximum Temperature', col = 'darkcyan',
  main = 'Ben Lomond (1 Jan 2020 - 24 Feb 2021)' )

# now comes a new plot in time series analysis: a scatterplot
# connected with line segments in which the horizontal scale is time
# (here, days) and the vertical scale is the value of the time series
# (here, daily maximum temperature in ben lomond); this is called a

# --> *time series plot* (or *time series trace*)

# question: what do you expect this plot to look like before making it?

# answer:

plot( 1:n.ben.lomond.data, daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021,
  type = 'l', lwd = 2, col = 'red', xlab = 'Day Index (1 = 1 Jan 2020)',
  ylab = 'Maximum Temperature', main = 'Ben Lomond (1 Jan 2020 - 24 Feb 2021)' )

# we can use an excellent built-in function in R called 'lowess'
# (locally-weighted scatterplot smoother) to capture the temperature trend over time:

lines( lowess( 1:n.ben.lomond.data,
  daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021, f = 2 / 3 ),
  lwd = 3, col = 'darkcyan', lty = 2 )

# lowess works by (a) creating vertical strips that isolate chunks of the data
# (in the horizontal dimension) and plotting something like the mean vertical value
# against the center of the horizontal values in the vertical strip,
# and (b) drawing a smooth curve through the resulting points

# this can be regarded as a frequentist non-parametric method
# for estimating a relationship of the form

#   y = h ( x )

# from a scatterplot of ( x, y ) points

# the lowess option 'f = 2 / 3' (the default value) tells lowess
# to use 2 / 3 of the data in each vertical strip

# this gets the basic *seasonality* of the temperature time series
# but doesn't capture the fine structure

# let's plot it again with a smaller f value (1 / 10):

par( mfrow = c( 2, 1 ) )

plot( 1:n.ben.lomond.data, daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021,
  type = 'l', lwd = 2, col = 'red', xlab = 'Day Index (1 = 1 Jan 2020)',
  ylab = 'Maximum Temperature', main = 'Ben Lomond (1 Jan 2020 - 24 Feb 2021)' )

lines( lowess( 1:n.ben.lomond.data,
  daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021, f = 2 / 3 ),
  lwd = 3, col = 'darkcyan', lty = 2 )

plot( 1:n.ben.lomond.data, daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021,
  type = 'l', lwd = 2, col = 'red', xlab = 'Day Index (1 = 1 Jan 2020)',
  ylab = 'Maximum Temperature', main = 'Ben Lomond (1 Jan 2020 - 24 Feb 2021)' )

lines( lowess( 1:n.ben.lomond.data,
  daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021, f = 1 / 10 ),
  lwd = 3, col = 'darkcyan', lty = 2 )

par( mfrow = c( 1, 1 ) )

# these plots are excellent, but the horizontal scale (days numbered
# from 1 to 421) makes it difficult to identify the month in which
# a given day falls

# R has an entire suite of built-in functions for time series analysis;
# the simplest is 'ts', which permits the time index to be defined
# in a somewhat more interpretable way:

ben.lomond.time.series <- ts(
  daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021,
  start = c( 2020, 1 ), end = c( 2021, 55 ), frequency = 366 )

plot( ben.lomond.time.series, lwd = 2, col = 'red', xlab = '',
  ylab = 'Maximum Temperature', main = 'Ben Lomond (1 Jan 2020 - 24 Feb 2021)' )

# here 'ts' has (oddly) divided the year into 10 blocks of about 37 days each,
# but you can see that it got pretty hot in ben lomond in the summer of 2020
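# (aside) an alternative way (not in the original analysis) to get an
# interpretable time axis: build an explicit vector of Date values and plot
# the series against it; R then labels the horizontal axis with calendar
# dates (months) rather than day indices:

ben.lomond.dates.for.plotting <- seq( as.Date( '2020-01-01' ), by = 'day',
  length.out = n.ben.lomond.data )

plot( ben.lomond.dates.for.plotting,
  daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021, type = 'l',
  lwd = 2, col = 'red', xlab = '', ylab = 'Maximum Temperature',
  main = 'Ben Lomond (1 Jan 2020 - 24 Feb 2021)' )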
####################################################################################################

##### the simplest time series models

# in the 'theta.star' monte carlo context of quiz 2
# the simplest models above (denoted by (*)) looked like this:

#   theta.star_{ t + 1 } = beta.0 +
#     g( theta.star_t, theta.star_{ t - 1 }, ..., theta.star_0 ) + e_t ,      (*)

# let's disengage from that context and redefine the general time series model
# in terms of a quantitative (outcome) time series y_t :

#   y_{ t + 1 } = beta.0 + g( y_t, y_{ t - 1 }, ..., y_0 ) + e_t              (**)

# the simplest model of this form pretends/assumes that

##### the time dependence only goes back one time period:

#   y_{ t + 1 } = beta.0 + g( y_t ) + e_t                                     (***)

# from stat 131 you may recognize this as a *markovian* assumption:
# someone offers you the entire history of the time series
# back to time period 0, and you use only the most recent value

# the simplest of the models of the form (***) is called an

# --> *autoregressive (AR) model of order 1* with
#     *first-order autocorrelation* rho, denoted AR_rho( 1 ):

#   y_{ t + 1 } = beta.0 + rho * y_t + e_t ,                                  (****)

# in which the standard off-the-shelf assumption about the 'errors' e_t
# is that they're IID from the N ( 0, sigma^2 ) PDF

# --> and independent of the y_t values

# the parameter space in these models is as follows:

#   beta.0 lives anywhere it wants on the entire real number line
#   rho is a correlation, so it lives between -1 and +1
#   sigma is a SD, so it lives on ( 0, infinity )

# econometricians call the { e_t } values *shocks* to the time series system;
# this is just a polite way of saying that the wizard of oz
# doesn't give you y_{ t + 1 }, he 'contaminates' ( beta.0 + rho * y_t )
# with a random amount of noise e_t and lets you see the sum

# an IID gaussian 'error' process is called a 'white noise' process,
# from the old days when people used oscilloscopes to decompose
# a signal into its constituent frequencies and passing IID gaussian data
# through the oscilloscope produced a sound that sounded 'white' to experts
# (don't ask me why; i don't know)

# (****) is called an autoregressive model because
# you'll remember from stat 131 that the simple linear regression model looks like

#   y_i = beta.0 + beta.1 * x_i + e_i

# in model (****) we're *regressing the time series on itself*,
# one time period behind

# jargon: time periods back in the past from the current time index
# are called *lags*: if the time series is

#   y_0, y_1, y_2, ...

# then lagging it 1 time period into the past pairs each value
# with the value that came one time period earlier:
# y_1 gets paired with y_0, y_2 with y_1, and so on
# (y_0 gets lost in the lagging because there is no y_{ -1 } to pair it with)
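# (aside) a minimal simulation sketch of model (****), with invented parameter
# values ( beta.0, rho, sigma ) chosen purely to make the AR_rho( 1 )
# definition concrete:

set.seed( 3 )

n.ar1.sim <- 500
beta.0.sim <- 10
rho.sim <- 0.8
sigma.sim <- 1

y.ar1.sim <- rep( NA, n.ar1.sim )

y.ar1.sim[ 1 ] <- 50   # an arbitrary starting (initial) value for the illustration

for ( t in 2:n.ar1.sim ) {

  # each new value is beta.0 + rho times the previous value, plus a gaussian shock

  y.ar1.sim[ t ] <- beta.0.sim + rho.sim * y.ar1.sim[ t - 1 ] + rnorm( 1, 0, sigma.sim )

}

plot( 1:n.ar1.sim, y.ar1.sim, type = 'l', lwd = 2, col = 'red',
  xlab = 't', ylab = 'y', main = 'Simulated AR( 1 ) with rho = 0.8' )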
# a mathematical object that's central to studying AR processes
# is the *autocorrelation function* (ACF), which (as the name implies) measures
# how strongly the time series is correlated with itself 1, 2, ... lags back into the past

# for example, to estimate the *first-order autocorrelation* of a time series,
# we just build a data set that looks like this:

#   y_0   ---   throw this row away because y_{ -1 } is undefined
#   y_1   y_0
#   y_2   y_1
#    .     .
#    .     .
#   y_n   y_{ n - 1 }

# now just give this 2-column data set (which has one row fewer than the
# original time series)
# to the R built-in function 'cor'; the result is the best possible
# estimate of the first-order autocorrelation when no information
# about the time series external to the data set is available

# an easy calculation using stat 131 ideas then gives the following result
# ( here C( X, Y ) and rho( X, Y ) are the covariance and correlation
# between the random variables X and Y, respectively): in model (****)

#   C( y_{ t + 1 }, y_t ) = C( beta.0 + rho * y_t + e_t, y_t )
#                         = C( rho * y_t + e_t, y_t )
#                         = rho * C( y_t, y_t ) + C( e_t, y_t )

# but the { e_t } and the { y_t } are assumed *independent* in this model,
# so C( e_t, y_t ) = 0, and C( y_t, y_t ) = V( y_t ) = sigma_y^2 (say);
# moreover, by stationarity, SD( y_{ t + 1 } ) = SD( y_t ) = sigma_y; then

#   rho( y_{ t + 1 }, y_t ) = C( y_{ t + 1 }, y_t ) / ( SD( y_{ t + 1 } ) * SD( y_t ) )
#                           = rho * sigma_y^2 / sigma_y^2 = rho

# thus an AR_rho( 1 ) time series has autocorrelation rho at lag 1,
# which explains why the model was defined in the way it was

# the *autocorrelation function* (ACF) simply collects all of the autocorrelations
# of the time series with itself 0, 1, ... lags back into the past

# it's helpful to plot this function, with the horizontal axis
# keeping track of lag 0, 1, ... and the vertical axis displaying
# the autocorrelation at lag i (the autocorrelation of { y_t }
# with itself 0 lags back into the past is simply the correlation
# of { y_t } with { y_t }, which is of course +1)
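# (aside) putting the two-column recipe above into R for the ben lomond series
# (this chunk is an addition to the original analysis):

y.current <- daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021[ -1 ]

y.lagged <- daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021[ -n.ben.lomond.data ]

lag.1.data.set <- cbind( y.current, y.lagged )   # one row fewer than the original series

print( cor( lag.1.data.set )[ 1, 2 ] )           # estimated first-order autocorrelation

# this should be close to (though not identical to) the lag-1 value that
# acf( ) reports, because acf( ) uses the overall sample mean and a slightly
# different divisor:

print( acf( daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021,
  lag.max = 1, plot = F )$acf[ 2 ] )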
##################################################################################################

# two more ideas, and we're ready to use time series methods in our
# monte carlo work (the first of these ideas was covered in stat 131):

# (1) a stochastic process (in this case, a time series) is said to be
# *stationary* if its probability behavior is invariant under a time shift;
# in other words, if the time series { y_t } and the time series { y_{ t - i } }
# have exactly the same probability behavior for all i = 1, 2, ...

# the ben lomond daily maximum temperature time series over a short time horizon
# is not stationary, because of the seasonal behavior, but we could readily
# obtain a stationary series by estimating the seasonal component and
# subtracting it out

# there are two main ways to analyze a non-stationary time series:

# (a) model the non-stationarity directly (this is the better option); or

# (b) do something to the non-stationary time series so that the result (&)
#     is stationary, and fit stationary time series models to (&)

# the AR_rho( 1 ) model is for stationary time series, so strictly speaking
# i shouldn't be fitting such a model to the ben lomond time series examined here,
# but i want to do so anyway to show you some extremely useful plots
# that we'll be using in our (markov chain) monte carlo work

# (2) as soon as somebody had the idea of an AR_rho( 1 ) time series,
# the usual mathematical desire to generalize led them to define
# the AR( k ) time series model, in which the time series
# is regressed on itself k >= 1 lags back into the past:

#   y_{ t + 1 } = beta.0 + beta.1 * y_t + beta.2 * y_{ t - 1 } + ... +
#     beta.k * y_{ t - ( k - 1 ) } + e_t

# within the AR( k ) class, with a given data set, we now have
# the problem of figuring out the optimal value of k

# for this purpose, the excellent statisticians george box and gwilym jenkins
# popularized a thing called the

#   *partial autocorrelation function* (PACF)

# this is a bit difficult to explain mathematically; here's the intuition

# the PACF at lag i of a time series { y_t } is the autocorrelation
# between { y_t } and { y_{ t - i } }, with the linear dependence of y_t and y_{ t - i }
# on the intermediate values
#   { y_{ t - 1 }, y_{ t - 2 }, ..., y_{ t - ( i - 1 ) } }
# removed

# as with the ACF, it's helpful to plot the PACF, with the horizontal axis
# keeping track of lag 0, 1, ... and the vertical axis displaying
# the partial autocorrelation at lag i

# this has the highly useful consequence that

# --> if the time series is AR( k ), the PACF will have non-zero spikes
#     at lags ( 1, ..., k ) AND THEN ALL OF THE PACF VALUES AT LAGS
#     ( k + 1, k + 2, ... ) will be 0

# so you can diagnose the optimal k in the AR( k ) modeling class
# by looking at the PACF and seeing where it drops to 0 (apart
# from the usual sampling noise)

##################################################################################################

# i like to wrap all of this up in a graph that i call the 'time series 4-plot':
# it divides the R plotting region into a ( 2 by 2 ) matrix of sub-plots

# the upper left graph is the time series plot of the data

# the upper right graph is a density trace of the data values

# the lower left graph plots the autocorrelation function,
# using the built-in function 'acf'

# and the lower right graph displays the partial autocorrelation function,
# using the built-in function 'pacf'

# here's what it looks like for the ben lomond time series:

par( mfrow = c( 2, 2 ) )

plot( 1:n.ben.lomond.data, daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021,
  type = 'l', xlab = 'Day', ylab = 'Maximum Temperature', col = 'red' )

plot( density( daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021 ),
  xlab = 'Maximum Temperature', lwd = 3, col = 'blue', main = 'Ben Lomond' )

acf( as.ts( daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021,
  start = 1, end = n.ben.lomond.data, frequency = 1 ),
  lag.max = 20, main = '', col = 'darkcyan', lwd = 3 )

pacf( as.ts( daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021,
  start = 1, end = n.ben.lomond.data, frequency = 1 ),
  lag.max = 20, main = '', col = 'chocolate1', lwd = 3 )

par( mfrow = c( 1, 1 ) )

# the default acf( ) and pacf( ) functions helpfully plot horizontal blue dotted lines
# 2 standard errors each way from 0, in which the standard errors are calculated
# from a 'null' model in which there is no time dependence in the 'time series' at all

# it turns out that the ACF of an AR_rho( 1 ) time series
# should look like a ski slope, with the autocorrelations decreasing
# geometrically (as rho, rho^2, ..., in which rho is the first-order autocorrelation)
# as the lag increases from 0

# you can see that the ben lomond time series, which is not yet stationary,
# would need a more complicated model than AR_rho( 1 ): (a) the ACF
# doesn't look like a ski slope, and (b) some of the PACF values at lags > 1
# are non-zero
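# (aside) since we'll be making this 4-plot repeatedly in our (markov chain)
# monte carlo work, here's one possible way (an optional addition, not part of
# the original code) to wrap it in a little helper function:

time.series.4.plot <- function( y, y.label = 'y', main.title = '', lag.max = 20 ) {

  # y: a numeric vector containing the time series
  # produces the ( 2 by 2 ) display described above: trace, density, ACF, PACF

  par( mfrow = c( 2, 2 ) )

  plot( 1:length( y ), y, type = 'l', xlab = 'Time Index', ylab = y.label, col = 'red' )

  plot( density( y ), xlab = y.label, lwd = 3, col = 'blue', main = main.title )

  acf( y, lag.max = lag.max, main = '', col = 'darkcyan', lwd = 3 )

  pacf( y, lag.max = lag.max, main = '', col = 'chocolate1', lwd = 3 )

  par( mfrow = c( 1, 1 ) )

  return( invisible( NULL ) )

}

# example call, reproducing (up to axis labels) the ben lomond 4-plot above:

time.series.4.plot( daily.maximum.temperature.in.ben.lomond.from.1.jan.2020.to.2.24.2021,
  y.label = 'Maximum Temperature', main.title = 'Ben Lomond' )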
##################################################################################################

# in the context of IID monte carlo simulated draws from a distribution,
# let's agree to call the above graph the 'monte carlo 4-plot'

# --> what should it look like with IID draws from a posterior distribution?

# here's the code that created the mc data set in the quiz 2 problem,
# with M = 1000000:

s <- 830
n <- 921

alpha.prior <- 1
beta.prior <- 1

alpha.posterior <- alpha.prior + s
beta.posterior <- beta.prior + ( n - s )

M <- 1000000

set.seed( 9119617 )

system.time( theta.star.M.1000000 <- rbeta( M, alpha.posterior, beta.posterior ) )

#   user  system elapsed
#   0.19    0.00    0.19

# and now the mc 4-plot of the theta.star column in the mc data set:

par( mfrow = c( 2, 2 ) )

plot( 1:M, theta.star.M.1000000, type = 'l', xlab = 'Iteration Number',
  ylab = 'theta', col = 'red' )

plot( density( theta.star.M.1000000 ), xlab = 'theta', lwd = 3, col = 'blue',
  main = 'Quiz 2 Case Study' )

acf( as.ts( theta.star.M.1000000, start = 1, end = M, frequency = 1 ),
  lag.max = 10, main = '', col = 'darkcyan', lwd = 3 )

pacf( as.ts( theta.star.M.1000000, start = 1, end = M, frequency = 1 ),
  lag.max = 10, main = '', col = 'chocolate1', lwd = 3 )

par( mfrow = c( 1, 1 ) )

# --> does it look the way you expected,
#     with IID monte carlo (*not* markov chain monte carlo) data?

#################################################################################################
#################################################################################################