MDS + Procrustes Sensitivity

The hardware and bandwidth for this mirror are donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our WordPress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

library(DataFusionGDM)
library(ggplot2)
library(vegan)

# Sweep configuration: the shared-population counts to try, how many repeated
# runs to make per count, and the base value each run's seed is derived from.
k_values <- c(4, 8, 12)
num_tests <- 3
seed_base <- 42

# Build the synthetic full distance matrix in memory (no file I/O); only the
# `distance_matrix` element of the simulation result is kept.
full_matrix <- simulate_genetic_distances(
  n_pops = 40,
  verbose = FALSE,
  seed = 42
)$distance_matrix

# Derive two perturbed, relabelled copies (A and B) of a full distance matrix.
#
# Arguments:
#   full_matrix       Square numeric matrix with row names (population names).
#                     Column names, when present, must be drawn from the row
#                     names; when absent the row names are used for both axes.
#   k                 Number of populations treated as shared between A and B
#                     (scalar, 1 <= k <= number of populations).
#   bias_mu           Constant offset added to every off-diagonal entry of B.
#   noise_sd          Amplitude of the deterministic sin/cos perturbation.
#   randomize_shared  If TRUE, the shared populations are picked by a
#                     seed-dependent deterministic ordering; otherwise the
#                     first k populations are used.
#   seed              Integer offset that drives both the ordering and the
#                     perturbation pattern (no RNG state is touched).
#
# Returns a list with:
#   A, B  perturbed matrices with zero diagonals; shared populations are
#         labelled "S_<name>" in both, the rest "A_<name>" / "B_<name>".
#   np    number of populations.
#   k     the k that was requested.
prepare_matrices_from_matrix <- function(full_matrix, k, bias_mu = 0.1,
                                         noise_sd = 0.05,
                                         randomize_shared = TRUE, seed = 42) {
  all_pop_names <- rownames(full_matrix)
  np <- length(all_pop_names)
  stopifnot(
    is.matrix(full_matrix),
    nrow(full_matrix) == ncol(full_matrix),
    !is.null(all_pop_names),
    length(k) == 1L,
    k >= 1, k <= np
  )

  # Without column names the original lookup produced an empty label vector
  # and the output lost its column labels; fall back to the row names.
  col_pop_names <- colnames(full_matrix)
  if (is.null(col_pop_names)) {
    col_pop_names <- all_pop_names
  }
  stopifnot(all(col_pop_names %in% all_pop_names))

  if (randomize_shared) {
    # Deterministic pseudo-shuffle: rank populations by sin(index + seed).
    ordering <- order(sin(seq_len(np) + seed), decreasing = TRUE)
    shared_pop_names <- all_pop_names[ordering[seq_len(k)]]
  } else {
    shared_pop_names <- all_pop_names[seq_len(k)]
  }

  # Copy the lower triangle onto the upper one so that the perturbation, and
  # therefore the perturbed distance matrix, stays symmetric.
  mirror_lower <- function(m) {
    upper <- upper.tri(m)
    m[upper] <- t(m)[upper]
    m
  }

  base_index <- seq_along(full_matrix) + seed
  noise_A <- mirror_lower(matrix(sin(base_index) * noise_sd, nrow = np))
  noise_B <- mirror_lower(
    matrix(cos(base_index) * noise_sd + bias_mu, nrow = np)
  )

  # abs() reflects any entry pushed below zero; self-distances are reset to 0.
  A <- abs(full_matrix + noise_A)
  B <- abs(full_matrix + noise_B)
  diag(A) <- 0
  diag(B) <- 0

  # Name -> label lookups: "S_" for shared populations, "A_"/"B_" otherwise.
  is_shared <- all_pop_names %in% shared_pop_names
  label_A <- setNames(
    paste0(ifelse(is_shared, "S_", "A_"), all_pop_names),
    all_pop_names
  )
  label_B <- setNames(
    paste0(ifelse(is_shared, "S_", "B_"), all_pop_names),
    all_pop_names
  )

  # unname() keeps the lookup vector's names from leaking into the dimnames.
  dimnames(A) <- list(unname(label_A), unname(label_A[col_pop_names]))
  dimnames(B) <- list(unname(label_B), unname(label_B[col_pop_names]))

  list(A = A, B = B, np = np, k = k)
}

# Sensitivity sweep: for every k in `k_values`, run `num_tests` repetitions
# with distinct seeds and record the mean squared difference between A and B
# before (`the_prior`) and after (`the_post`) calibration, plus the relative
# improvement in percent.
#
# Per-run rows are collected in a preallocated list and bound once at the end
# rather than growing `results` with rbind() on every iteration.
run_results <- vector("list", length(k_values) * num_tests)
run_idx <- 0L
for (k in k_values) {
  for (test_id in seq_len(num_tests)) {
    # Seed differs for every (k, test_id) pair.
    test_seed <- seed_base + test_id * 100 + k
    matrices <- prepare_matrices_from_matrix(full_matrix, k = k, seed = test_seed)
    A <- matrices$A
    B <- matrices$B

    # perform_mds() returns coordinate sets X and Y and a dimension count
    # d_opt (presumably the embedding dimension it selected -- confirm
    # against the package documentation).
    mds <- perform_mds(A, B)
    X <- mds$X
    Y <- mds$Y
    d_opt <- mds$d_opt

    # Labels present in both matrices, i.e. the "S_"-prefixed populations.
    pop_common <- intersect(rownames(A), rownames(B))

    # seq_len() + drop = FALSE keep these as matrices even when d_opt is 1
    # (and select nothing, instead of column 1, if d_opt is 0).
    dims <- seq_len(d_opt)
    X_sub <- X[pop_common, dims, drop = FALSE]
    Y_sub <- Y[pop_common, dims, drop = FALSE]

    # Presumably fits the transform on the shared rows and applies it to all
    # of Y -- confirm against apply_procrustes() documentation.
    Yt <- apply_procrustes(X_sub, Y_sub, Y)
    B_cal <- coords_to_distances(Yt)

    the_prior <- mean((A - B)^2)
    the_post <- mean((A - B_cal)^2)
    improvement <- (the_prior - the_post) / the_prior * 100

    run_idx <- run_idx + 1L
    run_results[[run_idx]] <- data.frame(
      k = k, test_id = test_id, the_prior = the_prior,
      the_post = the_post, improvement = improvement
    )
  }
}
results <- do.call(rbind, run_results)

# Per-k mean and standard deviation of each metric. aggregate() returns one
# matrix-valued column per metric; do.call(data.frame, ...) flattens them into
# plain columns named "<metric>.mean" and "<metric>.sd".
agg <- do.call(
  data.frame,
  aggregate(
    cbind(the_prior, the_post, improvement) ~ k,
    data = results,
    FUN = function(values) c(mean = mean(values), sd = sd(values))
  )
)

# Mean improvement against k (line + points) with a translucent band spanning
# one standard deviation either side of the mean.
p <- ggplot(data = agg, mapping = aes(x = k)) +
  geom_line(mapping = aes(y = improvement.mean), colour = "blue") +
  geom_point(mapping = aes(y = improvement.mean), colour = "blue") +
  geom_ribbon(
    mapping = aes(
      ymin = improvement.mean - improvement.sd,
      ymax = improvement.mean + improvement.sd
    ),
    fill = "blue",
    alpha = 0.2
  ) +
  labs(
    title = "Calibration improvement vs shared k",
    x = "Shared populations (k)",
    y = "% Improvement"
  ) +
  theme_minimal()
print(p)

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats are visible at Monitor.