user_sample

The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

user_sample_3

Prerequisites

Install the package for users as described in the README.

Steps

Adapt data_dir to the location of your data (on Windows, e.g. //de-inf-008/$/Eigene Dateien/ubair-master/data/).

library(ubair)
library(ggplot2)
# Directory in which the input data is stored. The relative path below is
# only an example; adapt it to your own file system.
data_dir <- "../../Daten/user_sample_data/"
# Load the data from data_dir into `data`.
# This might take a few seconds for large files.
data <- load_uba_data_from_dir(data_dir = data_dir)

Set sample variables.

# Target variable; written to params$target further below.
target <- "NO2"
# Stations to analyse, as a named list: display name -> station code.
stations <- list(
  Lünen = "DENW006", Bottrop = "DENW021",
  Köln = "DENW212"
)
# Meteorological variables; written to params$meteo_variables further below.
# NOTE(review): presumably temperature, relative humidity, wind speed, wind
# direction and air pressure - confirm against the data documentation.
meteo_variables <- c("TMP", "RFE", "WIG", "WIR", "LDR")

# Dates for the 9-Euro-Ticket effect.
# application_start / application_end are passed to
# split_data_counterfactual(); date_effect_start marks the vertical line in
# the plots.
application_start <- lubridate::ymd("20220301") # = start reference time
date_effect_start <- lubridate::ymd_hm("20220601 00:00")
application_end <- lubridate::ymd("20220831") # = end effect time

buffer <- 0 # number of data points to be ignored before effect

# Passed to run_counterfactual() as detrending_function and model_type.
trend <- "linear"
model_type <- "lightgbm"

# Days of data used for the mean in the prediction results; the rolling mean
# of the bias below uses 24 * window_size data points.
window_size <- 14

Create a params.yaml. Either

copy the default params.yaml to your file system, adapt it and load it, or
load the default and adapt the params programmatically.

# Load the parameters (load_params() called without arguments).
params <- load_params()
# Adapt params programmatically: override target and meteorological
# variables with the sample settings defined above.
params$target <- target
params$meteo_variables <- meteo_variables

Prepare data of each station for training and run counterfactual. See function documentation for further details.

# Run the counterfactual workflow for every station and collect the
# predictions of all stations in one data.table (`results_all`).
#
# The per-station predictions are stored in a pre-allocated list and bound
# once after the loop, instead of growing `results_all` with rbind() in every
# iteration (which copies the accumulated table each time).
results_list <- vector("list", length(stations))
names(results_list) <- names(stations)
for (station_name in names(stations)) {
  # `[[` extracts the station code as a plain character string; `[` would
  # return a one-element list.
  station_code <- stations[[station_name]]

  env_data <- clean_data(data, station = station_code)
  dt_prepared <- prepare_data_for_modelling(env_data, params)
  # Keep only rows without missing values.
  dt_prepared <- dt_prepared[complete.cases(dt_prepared)]
  split_data <- split_data_counterfactual(
    dt_prepared,
    application_start = application_start,
    application_end = application_end
  )
  res <- run_counterfactual(split_data,
    params,
    detrending_function = trend,
    model_type = model_type,
    alpha = 0.9,
    log_transform = TRUE
  )
  predictions <- res$prediction
  # Tag the predictions with station and model information so the stations
  # can be told apart after binding.
  # NOTE(review): the right-hand sides resolve to the loop/global variables
  # only as long as `predictions` has no columns of the same name - confirm.
  predictions[, station_name := station_name]
  predictions[, station := station_code]
  predictions[, model := model_type]
  predictions[, trend := trend]
  results_list[[station_name]] <- predictions
}
# use.names = TRUE matches columns by name, as rbind() on data.tables does.
results_all <- data.table::rbindlist(results_list, use.names = TRUE)
#> Warning in log(dt_train_new$value): NaNs wurden erzeugt
#> [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000316 seconds.
#> You can set `force_row_wise=true` to remove the overhead.
#> And if memory is not enough, you can set `force_col_wise=true`.
#> [LightGBM] [Info] Total Bins 1549
#> [LightGBM] [Info] Number of data points in the train set: 60472, number of used features: 8
#> [LightGBM] [Info] Start training from score -0.000000
#> Warning in log(dt_train_new$value): NaNs wurden erzeugt
#> [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000346 seconds.
#> You can set `force_row_wise=true` to remove the overhead.
#> And if memory is not enough, you can set `force_col_wise=true`.
#> [LightGBM] [Info] Total Bins 1550
#> [LightGBM] [Info] Number of data points in the train set: 60133, number of used features: 8
#> [LightGBM] [Info] Start training from score -0.000000
#> [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000315 seconds.
#> You can set `force_row_wise=true` to remove the overhead.
#> And if memory is not enough, you can set `force_col_wise=true`.
#> [LightGBM] [Info] Total Bins 1552
#> [LightGBM] [Info] Number of data points in the train set: 60494, number of used features: 8
#> [LightGBM] [Info] Start training from score 0.000000

Plot bias (prediction - observation) to compare multiple stations

# Bias = prediction minus observation.
results_all[, bias := prediction - value]
# Make sure `station` is a plain character column before it is used for the
# label and as grouping variable.
results_all[, station := as.character(unlist(station))]
results_all[, station_label := paste0(station_name, " (", station, ")")]
# Centred rolling mean of the bias per station over 24 * window_size data
# points. The first and last points of each station have no full window and
# stay NA, which is why geom_line() reports removed rows when plotting.
# NOTE(review): 24 * window_size equals window_size days only for hourly
# data - confirm the temporal resolution.
results_all[, smoothed_bias := data.table::frollmean(bias,
  n = 24 * window_size,
  align = "center"
),
by = station
]
bias_plot <- ggplot2::ggplot(
  results_all,
  ggplot2::aes(
    x = date,
    y = smoothed_bias,
    color = station_label,
    group = station_label
  )
) +
  ggplot2::geom_line() +
  ggplot2::labs(
    # paste0() avoids the stray spaces paste() would insert around
    # window_size ("( 14 -day mean)").
    title = paste0(
      target, " Bias Over Time by Station (", window_size, "-day mean)"
    ),
    x = "Date",
    y = paste0(target, " Bias \n(prediction - observation) [µg/m³]"),
    color = "Station"
  ) +
  # Dash-dotted vertical line at the start of the effect period.
  ggplot2::geom_vline(
    xintercept = date_effect_start, linetype = 4,
    colour = "black"
  ) +
  ggplot2::theme_bw() +
  ggplot2::scale_x_datetime(
    date_minor_breaks = "1 month",
    date_breaks = "2 month"
  ) +
  ggplot2::theme(legend.position = "bottom")
bias_plot
#> Warning: Removed 1005 rows containing missing values or values outside the scale range
#> (`geom_line()`).

plot of chunk bias_plot

Plot counterfactual for Köln.

# Display name of the station to plot (must match a name in `stations`).
station_name_plot <- "Köln"
# Plot the counterfactual for the selected station only. Inside
# dplyr::filter(), `station_name` refers to the column of results_all.
# All optional arguments are passed by name so the call does not depend on
# the position of date_effect_start in the function signature.
counterfactual_plot <- plot_counterfactual(
  dplyr::filter(results_all, station_name == station_name_plot),
  params,
  window_size = window_size,
  date_effect_start = date_effect_start,
  buffer = buffer
)
counterfactual_plot

plot of chunk counterfactual_plot_3

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.