The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

slideimp

library(slideimp)
set.seed(1234)

Comparing Methods Using a Shared Cross-Validation Split with tune_imp()

# 20 rows, 1000 columns, all columns have at least some NA
sim_obj <- sim_mat(n = 20, p = 1000, perc_col_na = 1)
obj <- sim_obj$input
obj[1:4, 1:4]
#>          feature1  feature2  feature3  feature4
#> sample1 0.3486482 0.7385414 0.4077444 0.1607935
#> sample2 0.5338935 0.4724364 0.9663621 0.3722729
#> sample3 0.7185848 0.7351035 0.6724479 0.3162537
#> sample4 0.1734418        NA 0.0000000 0.0000000
na_loc <- sample_na_loc(obj, n_cols = 200, n_rows = 5, n_reps = 5)
length(na_loc)
#> [1] 5
na_loc[[1]][1:6, ]
#>      row col
#> [1,]  18 661
#> [2,]  17 661
#> [3,]  20 661
#> [4,]  15 661
#> [5,]   6 661
#> [6,]  10 293
# This custom function imputes missing values with random normal values and takes
# `mean` and `sd` as params
rnorm_imp <- function(obj, mean, sd) {
  na <- is.na(obj)
  obj[na] <- rnorm(sum(na), mean = mean, sd = sd) # <- impute values with rnorm
  return(obj) # <- return an imputed object with the same dim as obj
}

pca_tune <- tune_imp(
  obj,
  .f = "pca_imp",
  na_loc = na_loc,
  parameters = data.frame(ncp = 10)
)
#> Tuning `pca_imp()`
#> Step 1/2: Resolving NA locations
#> Running mode: sequential
#> Step 2/2: Tuning

knn_tune <- tune_imp(
  obj,
  .f = "knn_imp",
  na_loc = na_loc,
  parameters = data.frame(k = 10)
)
#> Tuning `knn_imp()`
#> Step 1/2: Resolving NA locations
#> Running mode: sequential
#> Step 2/2: Tuning

rnorm_tune <- tune_imp(
  obj,
  .f = rnorm_imp,
  na_loc = na_loc,
  parameters = data.frame(mean = 0, sd = 1) # must match with arguments of `rnorm_imp`
)
#> Tuning custom function
#> Step 1/2: Resolving NA locations
#> Running mode: sequential
#> Step 2/2: Tuning
mean(compute_metrics(pca_tune, metrics = "rmse")$.estimate)
#> [1] 0.2045949
mean(compute_metrics(knn_tune, metrics = "rmse")$.estimate)
#> [1] 0.1962824
mean(compute_metrics(rnorm_tune, metrics = "rmse")$.estimate)
#> [1] 1.108204

Group-Wise Imputation with Small-Group Padding and Group-Wise Parameters with group_imp()

sim_obj <- sim_mat(n = 20, p = 50, n_col_groups = 2)

# Matrix to be imputed
obj <- sim_obj$input
obj[1:5, 1:4]
#>           feature1  feature2  feature3  feature4
#> sample1 0.03031619 0.1352710 0.4653579 0.1143298
#> sample2 0.50962966 0.2313022 0.9737881 0.5198844
#> sample3 0.69068459 0.5020337 0.7863144 0.9997625
#> sample4 0.40710170 0.4094603 0.3228294 0.2529592
#> sample5 0.73113251 0.4108405 0.6385945        NA

# Metadata, i.e., which features belong to which group
meta <- sim_obj$col_group
meta[1:5, ]
#>    feature  group
#> 1 feature1 group1
#> 2 feature2 group1
#> 3 feature3 group1
#> 4 feature4 group2
#> 5 feature5 group2

# We put feature 1 in `group3`
meta[1, 2] <- "group3"
meta[1:5, ]
#>    feature  group
#> 1 feature1 group3
#> 2 feature2 group1
#> 3 feature3 group1
#> 4 feature4 group2
#> 5 feature5 group2
set.seed(1234)
group_imp_df <- prep_groups(colnames(obj), group = meta, min_group_size = 10)
group_imp_df$parameters <- list(list(k = 3), list(k = 4), list(k = 5))
group_imp_df
#> # slideimp table: 3 x 4
#>   group          feature             aux parameters
#>  group1 <character [26]> <character [0]> <list [1]>
#>  group2 <character [23]> <character [0]> <list [1]>
#>  group3  <character [1]> <character [9]> <list [1]>
knn_results <- group_imp(obj, group = group_imp_df, cores = 4, k = 10)
#> Imputing 3 groups using KNN.
#> Running mode: threaded (4 cores)
print(knn_results, p = 4)
#> Method: group_imp (KNN imputation)
#> Dimensions: 20 x 50
#> 
#>           feature1  feature2  feature3  feature4
#> sample1 0.03031619 0.1352710 0.4653579 0.1143298
#> sample2 0.50962966 0.2313022 0.9737881 0.5198844
#> sample3 0.69068459 0.5020337 0.7863144 0.9997625
#> sample4 0.40710170 0.4094603 0.3228294 0.2529592
#> sample5 0.73113251 0.4108405 0.6385945 0.4472338
#> sample6 0.48993078 0.4950764 0.6553896 0.6737998
#> # Showing 6 of 20 rows and 4 of 50 columns

Sliding Window Imputation for WGBS/EM-seq Data with slide_imp()

Select window_size, overlap_size, and PCA/K-NN Parameters

set.seed(1234)
sample_names <- paste0("S", 1:10)
n_sites <- 1000

# Simulate positions with 50–500 bp between each site
distances_between <- sample(50:500, size = n_sites, replace = TRUE)
locations <- cumsum(distances_between) # <- important, location vector

methyl <- data.frame(
  chr = "chr1",
  start = locations,
  end = locations,
  strand = "+"
)

for (i in seq_along(sample_names)) {
  methyl[[paste0("numCs", i)]] <- sample.int(100, size = n_sites, replace = TRUE)
  methyl[[paste0("numTs", i)]] <- sample.int(100, size = n_sites, replace = TRUE)
  methyl[[paste0("coverage", i)]] <- methyl[[paste0("numCs", i)]] + methyl[[paste0("numTs", i)]]
}

methyl[1:5, 1:10]
#>    chr start  end strand numCs1 numTs1 coverage1 numCs2 numTs2 coverage2
#> 1 chr1   333  333      +     83     68       151     84     36       120
#> 2 chr1   718  718      +     82      2        84     54     99       153
#> 3 chr1  1173 1173      +     48      2        50     44     16        60
#> 4 chr1  1323 1323      +     70     48       118     48     56       104
#> 5 chr1  1483 1483      +     88     15       103     83     19       102
numCs_matrix <- as.matrix(methyl[, paste0("numCs", seq_along(sample_names))])
cov_matrix <- as.matrix(methyl[, paste0("coverage", seq_along(sample_names))])
beta_matrix <- numCs_matrix / cov_matrix

colnames(beta_matrix) <- sample_names
rownames(beta_matrix) <- methyl$start

beta_matrix <- t(beta_matrix)
# Set 10% of the data to missing
set.seed(1234)
beta_matrix[sample.int(length(beta_matrix), floor(length(beta_matrix) * 0.1))] <- NA
beta_matrix[1:4, 1:4]
#>           333       718       1173      1323
#> S1 0.54966887 0.9761905 0.96000000 0.5932203
#> S2 0.70000000 0.3529412 0.73333333 0.4615385
#> S3 0.06521739 0.7053571         NA 0.6507937
#> S4 0.69444444 0.2846154 0.04347826 0.7636364
params <- expand.grid(ncp = c(2, 4), window_size = c(5000, 10000))
params$overlap_size <- 1000
params$min_window_n <- 20 # windows with less than 20 columns are dropped

# Increase n_reps from 2 in actual analyses and use another chromosome (i.e., chr22)
tune_slide_pca <- tune_imp(
  obj = beta_matrix,
  parameters = params,
  .f = "slide_imp",
  n_reps = 2,
  location = locations
)
#> Tuning `slide_imp()`
#> Step 1/2: Resolving NA locations
#> ℹ Using default `num_na` = 500 (~5% of cells).
#>   Increase for more reliability or decrease if missing is dense.
#> Running mode: sequential
#> Step 2/2: Tuning

metrics <- compute_metrics(tune_slide_pca)

aggregate(.estimate ~ .metric + ncp + window_size, data = metrics, FUN = mean)
#>   .metric ncp window_size .estimate
#> 1     mae   2        5000 0.2251033
#> 2    rmse   2        5000 0.2830913
#> 3     mae   4        5000 0.2391354
#> 4    rmse   4        5000 0.2976971
#> 5     mae   2       10000 0.2116169
#> 6    rmse   2       10000 0.2639824
#> 7     mae   4       10000 0.2220744
#> 8    rmse   4       10000 0.2754316
slide_imp(
  obj = beta_matrix,
  location = locations,
  window_size = 5000,
  overlap_size = 1000,
  ncp = 2,
  min_window_n = 20,
  dry_run = TRUE # <- dry_run to inspect the windows
)
#> Dropping 43 window(s) with fewer than 20 columns.
#> # slideimp table: 24 x 4
#>  start end window_n  subset_local
#>     15  34       20 <double [20]>
#>    104 124       21 <double [21]>
#>    188 209       22 <double [22]>
#>    205 225       21 <double [21]>
#>    222 241       20 <double [20]>
#>    239 258       20 <double [20]>
#>    254 273       20 <double [20]>
#>    369 388       20 <double [20]>
#>    385 404       20 <double [20]>
#>    402 424       23 <double [23]>
#> # ... with 14 more rows
slide_imp(
  obj = beta_matrix,
  location = locations,
  window_size = 5000,
  overlap_size = 1000,
  ncp = 2,
  min_window_n = 20,
  dry_run = FALSE,
  .progress = FALSE
)
#> Method: slide_imp (PCA imputation)
#> Dimensions: 10 x 1000
#> Note: requested columns still contain NA values
#> 
#>           333       718       1173      1323      1483       1925
#> S1 0.54966887 0.9761905 0.96000000 0.5932203        NA 0.08695652
#> S2 0.70000000 0.3529412 0.73333333 0.4615385 0.8137255 0.45454545
#> S3 0.06521739 0.7053571         NA 0.6507937 0.4393064 0.37735849
#> S4 0.69444444 0.2846154 0.04347826 0.7636364 0.9595960 0.54621849
#> S5 0.83505155 0.5777778 0.47517730        NA 0.7368421 0.21666667
#> S6 0.26612903 0.3451327 0.53072626 0.5000000 0.6465517 0.37288136
#> # Showing 6 of 10 rows and 6 of 1000 columns

Subset and Flanking Mode

slide_imp(
  obj = beta_matrix,
  location = locations,
  window_size = 5000,
  ncp = 2,
  min_window_n = 20,
  subset = c("1323", "33810"),
  flank = TRUE,
  dry_run = TRUE
)
#> # slideimp table: 2 x 5
#>  start end window_n target  subset_local
#>      1  22       22      4 <integer [1]>
#>    101 138       38    122 <integer [1]>

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.