The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

Run Time Recording

# Load relevant libraries
library(SurprisalAnalysis)
library(ggplot2)
library(peakRAM)

#setting seed to ensure reproducibility
set.seed(123)

#params
n_genes       <- 1000
sample_counts <- c(5, 10, 20, 50, 100)
n_reps        <- 3


#' Function records run time and RAM peak usage
#'
#' @param n_genes number of genes
#' @param n_samples
#'
#' @return run time and RAM peak usage
#' @export
time_ram_single_run <- function(n_genes, n_samples) {
  elapsed <- NA_real_
  pr <- peakRAM({

    expr_matrix <- matrix(
      rlnorm(n = n_genes * n_samples, meanlog = 2, sdlog = 1),
      nrow = n_genes,
      ncol = n_samples
    )
    expr_df <- as.data.frame(expr_matrix)
    rownames(expr_df) <- paste0("Gene", seq_len(n_genes))
    colnames(expr_df) <- paste0("Sample", seq_len(n_samples))

    t0 <- Sys.time()
    res <- surprisal_analysis(expr_df)
    t1 <- Sys.time()
    elapsed <- as.numeric(difftime(t1, t0, units = "secs"))
  })
  list(
    time_sec = elapsed,
    peak_ram_mib = pr$Peak_RAM_Used_MiB[1]
  )
}


#store info

results_df <- data.frame(
  n_samples               = integer(length(sample_counts)),
  mean_time               = numeric(length(sample_counts)),
  sd_time                 = numeric(length(sample_counts)),
  ci_lower                = numeric(length(sample_counts)),
  ci_upper                = numeric(length(sample_counts)),
  mean_peak_ram_mib       = numeric(length(sample_counts)),
  sd_peak_ram_mib         = numeric(length(sample_counts)),
  ci_lower_peak_ram_mib   = numeric(length(sample_counts)),
  ci_upper_peak_ram_mib   = numeric(length(sample_counts)),
  stringsAsFactors = FALSE
)


ci_halfwidth_fun <- function(x, n) {
  sdev <- sd(x)
  se   <- sdev / sqrt(n)
  qt(0.975, df = n - 1) * se
}

for (i in seq_along(sample_counts)) {
  s <- sample_counts[i]
  rep_times <- numeric(n_reps)
  rep_ram   <- numeric(n_reps)

  for (r in seq_len(n_reps)) {
    gc()
    tr <- time_ram_single_run(n_genes = n_genes, n_samples = s)
    rep_times[r] <- tr$time_sec
    rep_ram[r]   <- tr$peak_ram_mib
  }

  mu_time <- mean(rep_times)
  sd_time <- sd(rep_times)
  hw_time <- ci_halfwidth_fun(rep_times, n_reps)

  mu_ram  <- mean(rep_ram)
  sd_ram  <- sd(rep_ram)
  hw_ram  <- ci_halfwidth_fun(rep_ram, n_reps)

  results_df[i, ] <- list(
    s,
    mu_time, sd_time, mu_time - hw_time, mu_time + hw_time,
    mu_ram,  sd_ram,  mu_ram  - hw_ram,  mu_ram  + hw_ram
  )
  print(i)
  message(sprintf(
    "n_samples = %3d : time = %.4f s (sd=%.4f, 95%% CI=[%.4f, %.4f]);  peak RAM = %.2f MiB (sd=%.2f, 95%% CI=[%.2f, %.2f])",
    s, mu_time, sd_time, mu_time - hw_time, mu_time + hw_time,
    mu_ram, sd_ram, mu_ram - hw_ram, mu_ram + hw_ram
  ))
}

print(results_df)

model_cubic <- lm(mean_time ~ poly(n_samples, 3), data = results_df)
summary(model_cubic)

results_df$pred_cubic <- predict(model_cubic, newdata = results_df)

ggplot(results_df, aes(x = n_samples, y = mean_time)) +
  geom_ribbon(aes(ymin = mean_time - sd_time, ymax = mean_time + sd_time),
              fill = "steelblue", alpha = 0.2) +
  geom_point(color = "steelblue4", size = 2, shape=8, stroke=1.5) +
  geom_line(color = "steelblue4") +
  stat_smooth(method = "lm",
              formula = y ~ poly(x, 3),
              se = FALSE,
              color = "firebrick",
              linetype = "dashed",
              size = 1) +
  labs(
    x = "Number of samples",
    y = "Average elapsed time (seconds)",
    title = sprintf("SurprisalAnalysis Runtime vs. Number of Samples\n(gene count = %d; mean ± 95%% CI over %d runs)", n_genes, n_reps), tag="A"
  ) +
  theme_minimal(base_size = 12)

is_bytes <- max(results_df$mean_peak_ram_mib, na.rm = TRUE) > 1e5
scale_factor <- if (is_bytes) 1/(1024^2) else 1

use_ci <- all(c("ci_lower_peak_ram_mib", "ci_upper_peak_ram_mib") %in% names(results_df))

df_plot <- within(results_df, {
  mean_ram_plot  <- mean_peak_ram_mib * scale_factor
  lower_ram_plot <- (if (use_ci) ci_lower_peak_ram_mib else mean_peak_ram_mib - sd_peak_ram_mib) * scale_factor
  upper_ram_plot <- (if (use_ci) ci_upper_peak_ram_mib else mean_peak_ram_mib + sd_peak_ram_mib) * scale_factor
})


ggplot(df_plot, aes(x = n_samples, y = mean_ram_plot)) +
  geom_ribbon(aes(ymin = lower_ram_plot, ymax = upper_ram_plot),
              fill = "steelblue", alpha = 0.2) +
  geom_point(color = "steelblue4", size = 2, shape=8, stroke = 1.5) +
  geom_line(color = "steelblue4") +
  scale_x_continuous(breaks = sample_counts) +
  labs(
    title = sprintf("SurprisalAnalysis Peak RAM vs. Number of Samples\n(gene count = %d; mean ± 95%% CI over %d runs)", n_genes, n_reps),
    x = "Number of Samples",
    y = "Peak RAM (MiB)", tag="B"
  ) +
  theme_minimal(base_size = 12)

#write.csv(results_df, "~/Downloads/runtime_altering_samples_try_2.csv")

####Altering the number of genes


#params
n_samples_fixed <- 50
gene_counts     <- c(100, 500, 1000, 2000, 5000)
n_reps          <- 3


results_genes_df <- data.frame(
  n_genes                 = integer(length(gene_counts)),
  mean_time               = numeric(length(gene_counts)),
  sd_time                 = numeric(length(gene_counts)),
  ci_lower                = numeric(length(gene_counts)),
  ci_upper                = numeric(length(gene_counts)),
  mean_peak_ram_mib       = numeric(length(gene_counts)),
  sd_peak_ram_mib         = numeric(length(gene_counts)),
  ci_lower_peak_ram_mib   = numeric(length(gene_counts)),
  ci_upper_peak_ram_mib   = numeric(length(gene_counts)),
  stringsAsFactors = FALSE
)

ci_halfwidth_fun <- function(x, n) {
  sdev <- sd(x); se <- sdev / sqrt(n); qt(0.975, df = n - 1) * se
}


for (i in seq_along(gene_counts)) {
  g <- gene_counts[i]
  rep_times <- numeric(n_reps)
  rep_ram   <- numeric(n_reps)

  for (r in seq_len(n_reps)) {
    gc()
    tr <- time_ram_single_run(n_genes = g, n_samples = n_samples_fixed)
    rep_times[r] <- tr$time_sec
    rep_ram[r]   <- tr$peak_ram_mib
  }

  mu_time <- mean(rep_times); sd_time <- sd(rep_times); hw_time <- ci_halfwidth_fun(rep_times, n_reps)
  mu_ram  <- mean(rep_ram);   sd_ram  <- sd(rep_ram);  hw_ram  <- ci_halfwidth_fun(rep_ram,   n_reps)

  results_genes_df[i, ] <- list(
    g,
    mu_time, sd_time, mu_time - hw_time, mu_time + hw_time,
    mu_ram,  sd_ram,  mu_ram  - hw_ram,  mu_ram  + hw_ram
  )

  message(sprintf(
    "n_genes = %5d : time = %.4f s (sd=%.4f, 95%% CI=[%.4f, %.4f]);  peak RAM = %.2f MiB (sd=%.2f, 95%% CI=[%.2f, %.2f])",
    g, mu_time, sd_time, mu_time - hw_time, mu_time + hw_time,
    mu_ram, sd_ram, mu_ram - hw_ram, mu_ram + hw_ram
  ))
}

print(results_genes_df)

ggplot(results_genes_df, aes(x = n_genes, y = mean_time)) +
  geom_ribbon(aes(ymin = ci_lower, ymax = ci_upper),
              fill = "steelblue", alpha = 0.2) +
  geom_point(color = "steelblue4", size = 2, shape = 8, stroke = 1.5) +
  geom_line(color = "steelblue4") +
  geom_smooth(method = "lm", se = FALSE, color = "firebrick", size = 1) +
  scale_x_continuous(breaks = gene_counts) +
  labs(
    x = "Number of genes",
    y = "Average elapsed time (seconds)",
    title = sprintf(
      "SurprisalAnalysis Runtime vs. Number of Genes\n(sample count = %d; mean ± 95%% CI over %d runs)",
      n_samples_fixed, n_reps
    ),
    tag = "C"
  ) +
  theme_minimal(base_size = 12)

is_bytes_g <- max(results_genes_df$mean_peak_ram_mib, na.rm = TRUE) > 1e5
scale_factor_g <- if (is_bytes_g) 1/(1024^2) else 1
use_ci_g <- all(c("ci_lower_peak_ram_mib", "ci_upper_peak_ram_mib") %in% names(results_genes_df))

df_plot_genes <- within(results_genes_df, {
  mean_ram_plot_g  <- mean_peak_ram_mib * scale_factor_g
  lower_ram_plot_g <- (if (use_ci_g) ci_lower_peak_ram_mib else mean_peak_ram_mib - sd_peak_ram_mib) * scale_factor_g
  upper_ram_plot_g <- (if (use_ci_g) ci_upper_peak_ram_mib else mean_peak_ram_mib + sd_peak_ram_mib) * scale_factor_g
})


ggplot(df_plot_genes, aes(x = n_genes, y = mean_ram_plot_g)) +
  geom_ribbon(aes(ymin = lower_ram_plot_g, ymax = upper_ram_plot_g),
              fill = "steelblue", alpha = 0.2) +
  geom_point(color = "steelblue4", size = 2, shape = 8, stroke = 1.5) +
  geom_line(color = "steelblue4") +
  scale_x_continuous(breaks = gene_counts) +
  labs(
    title = sprintf("SurprisalAnalysis Peak RAM vs. Number of Genes\n(sample count = %d; mean ± 95%% CI over %d runs)", n_samples_fixed, n_reps),
    x = "Number of genes",
    y = "Peak RAM (MiB)",
    tag = "D"
  ) +
  theme_minimal(base_size = 12)





#write.csv(results_genes_df, "~/Downloads/runtime_altering_genes_try_2.csv")

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.