Computation time and features

Benjamin Christoffersen

2018-07-01

This vignette shows comparisons in terms of computation time with other packages and alternative base R solutions. Specifically, we will make comparisons with the roll package and zoo package. It should be stressed though that the other solutions do additional things than this package does. E.g., there is not performed any rank test in the function in this package. We start by showing the comparisons of computation times and then we show different options. R versions of the functions are shown to make it clear what the output is. Some function definitions are shown at the end.

Comparisons

We start by simulating the data

set.seed(32981054)
n <- 10000
p <- 6
wdth = 50
X <- matrix(rnorm(p * n), n, p)
y <- drop(X %*% runif(p)) + rnorm(n)
df <- data.frame(y, X)
frm <- eval(parse(text = paste0(
  "formula(y ~ -1 + ", paste0("X", 1:p, collapse = " + "), ")")))

Then we check that the functions give the same (the function definitions are at the end of this document)

library(rollRegres)
all.equal(roll_regress_R(X, y, wdth), roll_regres.fit(X, y, wdth)$coefs, 
          check.attributes = FALSE)
#R [1] "Mean relative difference: 0.9998881"
all.equal(roll_regress_R(X, y, wdth), roll_regres(frm, df, wdth)$coefs, 
          check.attributes = FALSE)
#R [1] "Mean relative difference: 0.9998881"
all.equal(roll_regress_R(X, y, wdth), roll_regress_zoo(X, y, wdth), 
          check.attributes = FALSE)
#R [1] "Mean relative difference: 0.9998881"
all.equal(roll_regress_R(X, y, wdth), roll_regress_R_for_loop(X, y, wdth), 
          check.attributes = FALSE)
#R [1] "Mean relative difference: 0.9998881"
all.equal(roll_regress_R(X, y, wdth), roll_lm_func(X, y, wdth), 
          check.attributes = FALSE)
#R [1] "Mean relative difference: 0.9998881"

and here then we compare the computation time

microbenchmark::microbenchmark(
  roll_regress            = roll_regres.fit(X, y, wdth),
  # this will be slower due to call to `model.matrix` and `model.frame`
  roll_regress_df         = roll_regres(frm, df, wdth),
  roll_regress_R          = roll_regress_R(X, y, wdth),
  roll_regress_zoo        = roll_regress_zoo(X, y, wdth),
  roll_regress_R_for_loop = roll_regress_R_for_loop(X, y, wdth),
  roll_lm = roll_lm_func(X, y, wdth),
  times = 5)
#R Unit: milliseconds
#R                     expr        min         lq       mean     median         uq
#R             roll_regress   5.761187   6.063805   6.291281   6.498768   6.551311
#R          roll_regress_df   6.843657   7.356448   7.910719   7.433089   7.656694
#R           roll_regress_R 162.769448 162.883225 169.889885 171.152660 173.517501
#R         roll_regress_zoo 737.844834 739.207403 752.422065 743.101528 757.885534
#R  roll_regress_R_for_loop 350.679842 355.876486 375.920820 374.616642 388.647265
#R                  roll_lm 135.541782 139.240352 140.969537 139.568648 141.311661
#R         max neval
#R    6.581336     5
#R   10.263708     5
#R  179.126589     5
#R  784.071026     5
#R  409.783865     5
#R  149.185244     5

# here is the formula used above
frm
#R y ~ -1 + X1 + X2 + X3 + X4 + X5 + X6

Additional features

This section will cover some additional features.

Expanding window

Here are expanding window regressions with additional output

#####
# simulate data
set.seed(65731482)
n <- 100
p <- 2
X <- matrix(rnorm(p * n), n, p)
y <- drop(X %*% runif(p)) + rnorm(n)

#####
# use package function
pck_out <- roll_regres.fit(
  X, y, width = 50L, do_downdates = FALSE, 
  do_compute = c("sigmas", "r.squareds", "1_step_forecasts"))

#####
# assign R-version
R_func <- function(X, y, width){
  n <- nrow(X)
  p <- ncol(X)
  out <- matrix(NA_real_, n, p)
  sigmas             <- rep(NA_real_, n)
  r.squared          <- rep(NA_real_, n)
  one_step_forecasts <- rep(NA_real_, n)

  for(i in width:n){
    idx <- 1:i
    fit <- lm(y[idx] ~ -1 + X[idx, , drop = FALSE])
    out[i, ] <- fit$coefficients

    su <- summary(fit)
    sigmas[i] <- su$sigma

    ss1 <- sum((y[idx] - mean(y[idx]))^2)
    ss2 <- sum(fit$residuals^2)
    r.squared[i] <- 1 - ss2 / ss1

    if(i < n){
      next_i <- i + 1L
      one_step_forecasts[next_i] <- fit$coefficients %*% X[next_i, ]
    }
  }

  list(coef = out, sigmas = sigmas, r.squared = r.squared,
       one_step_forecasts = one_step_forecasts)
}

R_out <- R_func(X, y, 50L)

#####
# gives the same
stopifnot(isTRUE(all.equal(R_out$coef              , pck_out$coefs)))
stopifnot(isTRUE(all.equal(R_out$sigmas            , pck_out$sigmas)))
stopifnot(isTRUE(all.equal(R_out$r.squared         , pck_out$r.squared)))
stopifnot(isTRUE(all.equal(R_out$one_step_forecasts, pck_out$one_step_forecasts)))

Update in blocks

You can use the grp argument to make updates in blocks. E.g., here is an example with weekly data

#####
# simulate data
set.seed(68799379)
week <- as.integer(gl(25, 7))
head(week[1:10])
#R [1] 1 1 1 1 1 1
n <- length(week)
p <- 2
X <- matrix(rnorm(p * n), n, p)
y <- drop(X %*% runif(p)) + rnorm(n)

#####
# use package function
pck_out <- roll_regres.fit(
  X, y, width = 10L, grp = week, 
  do_compute = c("sigmas", "r.squareds", "1_step_forecasts"))

#####
# assign R-version
R_func <- function(X, y, width, grp){
  u_grp = unique(grp)
  n <- nrow(X)
  p <- ncol(X)
  out <- matrix(NA_real_, n, p)
  sigmas             <- rep(NA_real_, n)
  r.squared          <- rep(NA_real_, n)
  one_step_forecasts <- rep(NA_real_, n)

  start_val <- max(which(u_grp <= width))
  for(g in u_grp[start_val:length(u_grp)]){
    idx <- which(grp %in% (g - width + 1L):g)
    i <- which(grp == g)
    fit <- lm(y[idx] ~ -1 + X[idx, , drop = FALSE])
    out[i, ] <- sapply(fit$coefficients, rep, times = length(i))

    su <- summary(fit)
    sigmas[i] <- su$sigma

    ss1 <- sum((y[idx] - mean(y[idx]))^2)
    ss2 <- sum(fit$residuals^2)
    r.squared[i] <- 1 - ss2 / ss1

    if(g != max(grp)){
      next_g <- grp[min(which(grp > g))]
      next_g <- which(grp == next_g)
      one_step_forecasts[next_g] <- X[next_g, ] %*% fit$coefficients
    }
  }

  list(coef = out, sigmas = sigmas, r.squared = r.squared,
       one_step_forecasts = one_step_forecasts)
}

R_out <- R_func(X, y, 10L, grp = week)

#####
# gives the same
stopifnot(isTRUE(all.equal(R_out$coef              , pck_out$coefs)))
stopifnot(isTRUE(all.equal(R_out$sigmas            , pck_out$sigmas)))
stopifnot(isTRUE(all.equal(R_out$r.squared         , pck_out$r.squared)))
stopifnot(isTRUE(all.equal(R_out$one_step_forecasts, pck_out$one_step_forecasts)))

Session info

sessionInfo()
#R R version 3.5.0 (2018-04-23)
#R Platform: x86_64-w64-mingw32/x64 (64-bit)
#R Running under: Windows 10 x64 (build 17134)
#R 
#R Matrix products: default
#R 
#R locale:
#R [1] LC_COLLATE=C                          
#R [2] LC_CTYPE=English_United States.1252   
#R [3] LC_MONETARY=English_United States.1252
#R [4] LC_NUMERIC=C                          
#R [5] LC_TIME=English_United States.1252    
#R 
#R attached base packages:
#R [1] compiler  stats     graphics  grDevices utils     datasets  methods  
#R [8] base     
#R 
#R other attached packages:
#R [1] rollRegres_0.1.0 roll_1.0.7       zoo_1.8-2       
#R 
#R loaded via a namespace (and not attached):
#R  [1] Rcpp_0.12.17         lattice_0.20-35      digest_0.6.15       
#R  [4] rprojroot_1.3-2      grid_3.5.0           backports_1.1.2     
#R  [7] magrittr_1.5         evaluate_0.10.1      RcppParallel_4.4.0  
#R [10] stringi_1.1.7        checkmate_1.8.5      rmarkdown_1.9       
#R [13] tools_3.5.0          stringr_1.3.0        yaml_2.1.18         
#R [16] microbenchmark_1.4-4 htmltools_0.3.6      knitr_1.20

Function definitions

Here are the function definitions

#####
# Pure R version of package function
roll_regress_R <- function(X, y, width){
  n <- nrow(X)
  p <- ncol(X)
  out <- matrix(NA_real_, p, n)
  
  dummy_1 <- matrix(0.)
  dummy_2 <- numeric(p)

  is_first <- TRUE
  for(i in width:n){
    if(is_first){
      is_first <- FALSE
      qr. <- qr(X[1:width, ])
      R <- qr.R(qr.)

      # Use X^T
      X <- t(X)

      XtY <- drop(tcrossprod(y[1:width], X[, 1:width]))
    } else {
      x_new <- X[, i]
      x_old <- X[, i - width]

      # update R 
      rollRegres:::dchud_wrap(
        R, p, p, x_new, dummy_1, 0L, 0L, 0., 0., dummy_2, dummy_2)

      # downdate R
      rollRegres:::dchdd_wrap(
        R, p, p, x_old, dummy_1, 0L, 0L, 0., 0., dummy_2, dummy_2, 
        integer(1))

      # update XtY
      XtY <- XtY + y[i] * x_new - y[i - width] * x_old
    }

    coef. <- .Internal(backsolve(R, XtY, p, TRUE, TRUE))
    coef. <- .Internal(backsolve(R, coef., p, TRUE, FALSE))

    out[, i] <- coef.
  }

  t(out)
}

#####
# simple R version
roll_regress_R_for_loop <- function(X, y, width){
  n <- nrow(X)
  p <- ncol(X)
  out <- matrix(NA_real_, n, p)

  for(i in width:n){
    idx <- (i - width + 1L):i
    out[i, ] <- lm.fit(X[idx, , drop = FALSE], y[idx])$coefficients
  }

  out
}

#####
# zoo version
.zoo_inner <- function(Z) {
  fit <- lm.fit(Z[, -1, drop = FALSE], Z[, 1])
  fit$coefficients
}
library(zoo)
roll_regress_zoo <- function(x, y, width){
  Z <- cbind(y, X)
  rollapply(Z, width, FUN = .zoo_inner,  
            by.column = FALSE,  align = "right", fill = NA_real_)
}

#####
# roll_lm
library(roll)
roll_lm_func <- function(x, y ,width)
  roll_lm(X, matrix(y, ncol = 1), wdth, intercept = FALSE)$coefficients[, -1L]

# use one thread as other methods are easily to compute in parallel too
RcppParallel::setThreadOptions(numThreads = 1)

# compile functions
library(compiler)
roll_regress_R          <- cmpfun(roll_regress_R)
roll_regress_R_for_loop <- cmpfun(roll_regress_R_for_loop)
.zoo_inner              <- cmpfun(.zoo_inner)
roll_regress_zoo        <- cmpfun(roll_regress_zoo)
roll_lm_func            <- cmpfun(roll_lm_func)