## ----include = FALSE----------------------------------------------------------
# Global knitr chunk options applied to every chunk in this document.
knitr::opts_chunk$set(
  collapse   = TRUE,
  comment    = "#>",
  fig.width  = 7,
  fig.height = 5
)

## ----message = FALSE----------------------------------------------------------
library(spqrp)

# Fetch the two example objects by name via spqrp_example_data().
df <- spqrp_example_data("input_cohort_df")
ranking <- spqrp_example_data("protein_ranking")

# Preview both inputs: the first rows of `df`, then `ranking` in full.
head(df)
ranking

## -----------------------------------------------------------------------------
# Cluster the example cohort using the example protein ranking.
#
# Optional arguments hinted at by the original author (add them inside the
# call, comma-separated, when needed):
#   quiet = FALSE
#   save_path = "Mock_data_clustering.png"
#
# The last argument deliberately has no trailing comma: in R, `f(a = 1, )`
# passes an extra empty argument, which fails for functions without a spare
# formal to absorb it.
res <- run_clustering(
  df = df,
  ranking = ranking,
  n_neighbors = 1L,
  max_component_size = 2L,
  metric = "manhattan",
  method = "PCA", # switch to "UMAP" once a cohort is large enough
  plot_name = "Mock ranking on Mock data"
)

## -----------------------------------------------------------------------------
# Inspect the components of the clustering result `res` one at a time; each
# bare expression is auto-printed when evaluated at top level.
head(res$cluster_assignments, 8)  # only the first 8 entries
res$uncertain_samples
res$error_candidate_samples
res$plot
res$transitive_results

## -----------------------------------------------------------------------------
# Run the distance evaluation on the example cohort with the example ranking.
# No trailing comma after the last argument: `f(..., n = 4L, )` would pass an
# extra empty argument to the function.
result <- perform_distance_evaluation_on_ranked_proteins(
  df = df,
  top_importance_df = ranking,
  metric = "manhattan",
  p = 0.989,
  n = 4L
)

# Show the cutoff, a fixed subset of the evaluation metrics, and the plot.
result$cutoff
result$eval_metrics[c("TP", "FP", "FN", "TN", "Precision", "Sensitivity", "F1")]
result$plot

## -----------------------------------------------------------------------------
# classifier_backend is left at its default, "randomForest" — closest to
# imblearn's BalancedRandomForestClassifier. Pass "ranger" for a faster (but
# more divergent from original Python package) backend.
results <- train_with_normalise(
  df,
  plate_corrected = FALSE, # mock data has no plate column
  outlier_removal = FALSE  # skip on tiny data
)

# Extract the ranking from the training results and print it.
new_ranking <- retrieve_ranking(results)
new_ranking

## -----------------------------------------------------------------------------
# Drop flagged samples in one step.
filtered <- remove_outlier_samples(df, contamination = "auto")

# Samples flagged as anomalous.
filtered$outlier_list

# Per-sample anomaly scores.
filtered$anomaly_df

# Input df minus the flagged samples (first rows only).
head(filtered$df)

## ----eval = FALSE-------------------------------------------------------------
# # Inspect the score distribution before deciding on a cutoff:
# filtered$anomaly_plot
# 
# # Or call the underlying detector directly to keep the original df and
# # only act on the flag list — useful when you want to surface candidates
# # without auto-removing them:
# forest <- by_isolation_forest(df, impute_median = TRUE,
#                                contamination = 0.05)  # top 5% by score
# forest$outlier_list

## -----------------------------------------------------------------------------
# Re-run the clustering on the outlier-filtered data with the newly derived
# ranking. This overwrites the earlier `res`.
#
# Optional arguments hinted at by the original author (add them inside the
# call, comma-separated, when needed):
#   quiet = FALSE
#   save_path = "filtered_mock_data_clustering.png"
#
# The last argument deliberately has no trailing comma: in R, `f(a = 1, )`
# passes an extra empty argument, which fails for functions without a spare
# formal to absorb it.
#
# NOTE(review): plot_name is identical to the title used for the first
# clustering run even though the inputs differ — consider a distinct title.
res <- run_clustering(
  df = filtered$df,
  ranking = new_ranking,
  n_neighbors = 1L,
  max_component_size = 2L,
  metric = "manhattan",
  method = "PCA", # switch to "UMAP" once a cohort is large enough
  plot_name = "Mock ranking on Mock data"
)

# Inspect the components of the re-run clustering result; each bare
# expression is auto-printed when evaluated at top level.
head(res$cluster_assignments, 8)  # only the first 8 entries
res$uncertain_samples
res$error_candidate_samples
res$plot

## -----------------------------------------------------------------------------
# Repeat the distance evaluation on the outlier-filtered data with the newly
# derived ranking. This overwrites the earlier `result`.
# No trailing comma after the last argument: `f(..., n = 4L, )` would pass an
# extra empty argument to the function.
result <- perform_distance_evaluation_on_ranked_proteins(
  df = filtered$df,
  top_importance_df = new_ranking,
  metric = "manhattan",
  p = 0.989,
  n = 4L
)

# Show the cutoff, a fixed subset of the evaluation metrics, and the plot.
result$cutoff
result$eval_metrics[c("TP", "FP", "FN", "TN", "Precision", "Sensitivity", "F1")]
result$plot

## ----eval = FALSE-------------------------------------------------------------
# df_raw <- spqrp_example_data("input_cohort_df")
# 
# # Zeros become NA so missingness is explicit
# df_raw$Intensity[df_raw$Intensity == 0] <- NA
# 
# df_pp <- df_raw |>
#   log_transform() |>
#   filter_by_occurrence(0.7)
# 
# # Per-sample median normalization (returns list(data, plot))
# norm <- normalize_medianintensity(df_pp, plot = FALSE)
# df_pp <- norm$data
# 
# # Plate-effect residualisation if a `plate` column exists
# df_pp <- plate_correct_residuals_by_protein(df_pp)