The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

D-vine quantile regression with discrete variables: analysis of bike rental data

Dani Kraus and Thomas Nagler

November 8, 2017

Required packages

library(vinereg)
require(ggplot2)
require(dplyr)
require(purrr)
require(scales)
require(quantreg)

Plot function for marginal effects

plot_marginal_effects <- function(covs, preds) {
    cbind(covs, preds) %>%
        tidyr::gather(alpha, prediction, -seq_len(NCOL(covs))) %>%
        dplyr::mutate(prediction = as.numeric(prediction)) %>%
        tidyr::gather(variable, value, -(alpha:prediction)) %>%
        dplyr::mutate(value = as.numeric(value)) %>%
        ggplot(aes(value, prediction, color = alpha)) +
        geom_point(alpha = 0.15) + 
        geom_smooth(span = 0.5, se = FALSE) + 
        facet_wrap(~ variable, scale = "free_x") +
        theme(legend.position = "none") +
        theme(plot.margin = unit(c(0, 0, 0, 0), "mm")) +
        xlab("")
}

Data preparation

Load data

bikedata <- read.csv("day.csv")
bikedata[, 2] <- as.Date(bikedata[, 2])
head(bikedata)
##   instant     dteday season yr mnth holiday weekday workingday weathersit
## 1       1 2011-01-01      1  0    1       0       6          0          2
## 2       2 2011-01-02      1  0    1       0       0          0          2
## 3       3 2011-01-03      1  0    1       0       1          1          1
## 4       4 2011-01-04      1  0    1       0       2          1          1
## 5       5 2011-01-05      1  0    1       0       3          1          1
## 6       6 2011-01-06      1  0    1       0       4          1          1
##       temp    atemp      hum windspeed casual registered  cnt
## 1 0.344167 0.363625 0.805833 0.1604460    331        654  985
## 2 0.363478 0.353739 0.696087 0.2485390    131        670  801
## 3 0.196364 0.189405 0.437273 0.2483090    120       1229 1349
## 4 0.200000 0.212122 0.590435 0.1602960    108       1454 1562
## 5 0.226957 0.229270 0.436957 0.1869000     82       1518 1600
## 6 0.204348 0.233209 0.518261 0.0895652     88       1518 1606

Rename variables

bikedata  <- bikedata %>%
    rename(
        temperature = atemp, 
        month = mnth,
        weathersituation = weathersit,
        humidity = hum,
        count = cnt
    )

Un-normalize variables

See variable description on UCI web page.

bikedata <- bikedata %>%
    mutate(
        temperature = 66 * temperature + 16,
        windspeed = 67 * windspeed,
        humidity = 100 * humidity
    )

Show trend

ggplot(bikedata, aes(dteday, count)) +
    geom_line() + 
    scale_x_date(labels = scales::date_format("%b %y")) + 
    xlab("date") + 
    ylab("rental count") + 
    stat_smooth(method = "lm", se = FALSE, linetype = "dashed") + 
    theme(plot.title = element_text(lineheight = 0.8, size = 20)) +
    theme(text = element_text(size = 18))

Remove trend

lm_trend <- lm(count ~ instant, data = bikedata)
trend <- predict(lm_trend)
bikedata <- mutate(bikedata, count = count / trend)
ggplot(bikedata, aes(dteday, count)) + 
    geom_line() + 
    scale_x_date(labels = scales::date_format("%b %y")) + 
    xlab("date") + 
    ylab("detrended rental count") + 
    theme(plot.title = element_text(lineheight = 0.8, size = 20)) + 
    theme(text = element_text(size = 18))

Drop useless variables

bikedata <- bikedata %>%
    select(-instant, -dteday, -yr) %>%  # time indices
    select(-casual, -registered) %>%    # casual + registered = count
    select(-holiday) %>%                # we use 'workingday' instead
    select(-temp)                       # we use 'temperature' (feeling temperature)

Declare discrete variables as ordered

disc_vars <- c("season", "month", "weekday", "workingday", "weathersituation")
bikedata <- bikedata %>%
    mutate(weekday = ifelse(weekday == 0, 7, weekday)) %>%  # sun at end of week
    purrr::modify_at(disc_vars, as.ordered)

D-vine regression model

Fit model

fit <- vinereg(
  count ~ ., 
  data = bikedata, 
  family = c("onepar", "tll"), 
  selcrit = "aic"
)
fit
## D-vine regression model: count | temperature, humidity, windspeed, month, season, weathersituation, workingday, weekday 
## nobs = 731, edf = 57.55, cll = 414.62, caic = -714.15, cbic = -449.74
summary(fit)
##                var       edf         cll        caic        cbic       p_value
## 1            count  9.596830 -198.076002  415.345665  459.437472            NA
## 2      temperature 21.973257  415.792064 -787.637614 -686.683385 1.101456e-161
## 3         humidity 17.935922  118.945590 -202.019336 -119.614295  2.152571e-40
## 4        windspeed  1.000000   22.862611  -43.725223  -39.130810  1.360595e-11
## 5            month  1.000000   13.914960  -25.829921  -21.235508  1.324616e-07
## 6           season  1.000000   11.353496  -20.706992  -16.112578  1.886801e-06
## 7 weathersituation  1.000000   12.049340  -22.098679  -17.504266  9.152305e-07
## 8       workingday  3.044168   13.401411  -20.714486   -6.728321  6.860982e-06
## 9          weekday  1.000000    4.381521   -6.763042   -2.168628  3.073960e-03

In-sample predictions

alpha_vec <- c(0.1, 0.5, 0.9)
pred <- fitted(fit, alpha_vec)

Marginal effects

plot_marginal_effects(
    covs = select(bikedata, temperature), 
    preds = pred
)

plot_marginal_effects(covs = select(bikedata, humidity), preds = pred) +
    xlim(c(25, 100))

plot_marginal_effects(covs = select(bikedata, windspeed), preds = pred) 

month_labs <- c("Jan","", "Mar", "", "May", "", "Jul", "", "Sep", "", "Nov", "")
plot_marginal_effects(covs = select(bikedata, month), preds = pred) +
        scale_x_discrete(limits = 1:12, labels = month_labs)

plot_marginal_effects(covs = select(bikedata, weathersituation), 
                      preds = pred) +
    scale_x_discrete(limits = 1:3,labels = c("good", "medium", "bad"))

weekday_labs <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
plot_marginal_effects(covs = select(bikedata, weekday), preds = pred) +
    scale_x_discrete(limits = 1:7, labels = weekday_labs)

plot_marginal_effects(covs = select(bikedata, workingday), preds = pred) +
    scale_x_discrete(limits = 0:1, labels = c("no", "yes")) +
    geom_smooth(method = "lm", se = FALSE) +
    xlim(c(0, 1))

season_labs <- c("spring", "summer", "fall", "winter")
plot_marginal_effects(covs = select(bikedata, season), preds = pred) +
    scale_x_discrete(limits = 1:4, labels = season_labs)

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.