Simulation Study with penetrance

The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

BayesMendel Lab

Goal

Here we apply the penetrance package to simulated families where the data-generating penetrance function is known and illustrate the estimation of the parameters of the Weibull distribution.

Simulated Data

The data used for the simulation is available as sim_fam.Rdata. Recall the parameterization of the Weibull distribution.For the function \(f(x; \alpha, \beta, \delta, \gamma)\) we define:

\[ f(x; \alpha, \beta, \delta, \gamma) = \begin{cases} \gamma \left( \frac{\alpha}{\beta} \left( \frac{x - \delta}{\beta} \right)^{\alpha - 1} e^{-\left( \frac{x - \delta}{\beta} \right)^\alpha } \right), & x \geq \delta \\ 0, & x < \delta \end{cases} \]

And for the cumulative distribution function \(F(x; \alpha, \beta, \delta, \gamma)\):

\[ F(x; \alpha, \beta, \delta, \gamma) = \begin{cases} \gamma \left( 1 - e^{-\left( \frac{x - \delta}{\beta} \right)^\alpha } \right), & x \geq \delta \\ 0, & x < \delta \end{cases} \]

The data-generating penetrance was constructed using the following parameters for the Weibull distribution.

# Create generating_penetrance data frame
age <- 1:94

# Calculate Weibull distribution for Females
alpha <- 2 
beta  <- 50 
gamma <- 0.6
delta <- 15
penetrance.mod.f <- dweibull(age - delta, alpha, beta) * gamma

# Calculate Weibull distribution for Males
alpha <- 2 
beta  <- 50 
gamma <- 0.6
delta <- 30
penetrance.mod.m <- dweibull(age - delta, alpha, beta) * gamma

generating_penetrance <- data.frame(
    Age = age,
    Female = penetrance.mod.f,
    Male = penetrance.mod.m
)

The families were simulated using the PedUtils Rpackage.

dat <- simulated_families

Simple simulation

Then we run the estimation using the default settings.

# Set the random seed
set.seed(2024)

# Set the prior
prior_params <- list(
    asymptote = list(g1 = 1, g2 = 1),
    threshold = list(min = 5, max = 40),
    median = list(m1 = 2, m2 = 2),
    first_quartile = list(q1 = 6, q2 = 3)
)

# Set the prevalence
prevMLH1 <- 0.001

# We use the default baseline (non-carrier) penetrance
print(baseline_data_default)

# We run the estimation procedure with one chain and 20k iterations  
out_sim <- penetrance(
    pedigree  = dat, twins = NULL, n_chains = 1, n_iter_per_chain = 20000, 
    ncores = 2, baseline_data = baseline_data_default , prev = prevMLH1, 
    prior_params = prior_params, burn_in = 0.1, median_max = TRUE,  
    ageImputation = FALSE, removeProband = FALSE
)

Comparison of estimated curves vs. data-generating curves

We then plot the respective penetrance curves from the data-generating distribution defined above and the penetrance curves (including credible intervals) from the estimation procedure.

# Function to calculate Weibull cumulative density
weibull_cumulative <- function(x, alpha, beta, threshold, asymptote) {
  pweibull(x - threshold, shape = alpha, scale = beta) * asymptote
}

# Function to plot the penetrance and compare with simulated data
plot_penetrance_comparison <- function(data, generating_penetrance, prob, max_age, sex) {
  if (prob <= 0 || prob >= 1) {
    stop("prob must be between 0 and 1")
  }

  # Calculate Weibull parameters for the given sex
  params <- if (sex == "Male") {
    calculate_weibull_parameters(
      data$median_male_results,
      data$first_quartile_male_results,
      data$threshold_male_results
    )
  } else if (sex == "Female") {
    calculate_weibull_parameters(
      data$median_female_results,
      data$first_quartile_female_results,
      data$threshold_female_results
    )
  } else {
    stop("Invalid sex. Please choose 'Male' or 'Female'.")
  }

  alphas <- params$alpha
  betas <- params$beta
  thresholds <- if (sex == "Male") data$threshold_male_results else data$threshold_female_results
  asymptotes <- if (sex == "Male") data$asymptote_male_results else data$asymptote_female_results

  x_values <- seq(1, max_age)

  # Calculate cumulative densities for the specified sex
  cumulative_density <- mapply(function(alpha, beta, threshold, asymptote) {
    pweibull(x_values - threshold, shape = alpha, scale = beta) * asymptote
  }, alphas, betas, thresholds, asymptotes, SIMPLIFY = FALSE)

  distributions_matrix <- matrix(unlist(cumulative_density), nrow = length(x_values), byrow = FALSE)
  mean_density <- rowMeans(distributions_matrix, na.rm = TRUE)

  # Calculate credible intervals
  lower_prob <- (1 - prob) / 2
  upper_prob <- 1 - lower_prob
  lower_ci <- apply(distributions_matrix, 1, quantile, probs = lower_prob)
  upper_ci <- apply(distributions_matrix, 1, quantile, probs = upper_prob)

  # Recover the data-generating penetrance
  cumulative_generating_penetrance <- cumsum(generating_penetrance[[sex]])

  # Create data frame for plotting
  age_values <- seq_along(cumulative_generating_penetrance)
  min_length <- min(length(cumulative_generating_penetrance), length(mean_density))

  plot_df <- data.frame(
    age = age_values[1:min_length],
    cumulative_generating_penetrance = cumulative_generating_penetrance[1:min_length],
    mean_density = mean_density[1:min_length],
    lower_ci = lower_ci[1:min_length],
    upper_ci = upper_ci[1:min_length]
  )

  # Plot the cumulative densities with credible intervals
  p <- ggplot(plot_df, aes(x = age)) +
    geom_line(aes(y = cumulative_generating_penetrance, color = "Data-generating penetrance"), linewidth = 1, linetype = "solid", na.rm = TRUE) +
    geom_line(aes(y = mean_density, color = "Estimated penetrance"), linewidth = 1, linetype = "dotted", na.rm = TRUE) +
    geom_ribbon(aes(ymin = lower_ci, ymax = upper_ci), alpha = 0.2, fill = "red", na.rm = TRUE) +
    labs(title = paste("Cumulative Density Comparison for", sex),
         x = "Age",
         y = "Cumulative Density") +
    theme_minimal() +
    scale_color_manual(values = c("Data-generating penetrance" = "blue", 
                                  "Estimated penetrance" = "red")) +
    scale_y_continuous(labels = scales::percent) +
    theme(legend.title = element_blank())

  print(p)

  # Calculate Mean Squared Error (MSE)
  mse <- mean((plot_df$cumulative_generating_penetrance - plot_df$mean_density)^2, na.rm = TRUE)
  cat("Mean Squared Error (MSE):", mse, "\n")

  # Calculate Confidence Interval Coverage
  coverage <- mean((plot_df$cumulative_generating_penetrance >= plot_df$lower_ci) & 
                   (plot_df$cumulative_generating_penetrance <= plot_df$upper_ci), na.rm = TRUE)
  cat("Confidence Interval Coverage:", coverage, "\n")
}

# Plot for Female
plot_penetrance_comparison(
  data = out_sim$combined_chains,
  generating_penetrance = generating_penetrance,
  prob = 0.95,
  max_age = 94,
  sex = "Female"
)

## Mean Squared Error (MSE): 0.002250424 
## Confidence Interval Coverage: 0.6702128

# Plot for Male
plot_penetrance_comparison(
  data = out_sim$combined_chains,
  generating_penetrance = generating_penetrance,
  prob = 0.95,
  max_age = 94,
  sex = "Male"
)

## Mean Squared Error (MSE): 0.000929737 
## Confidence Interval Coverage: 1

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.