The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
This vignette demonstrates the PONG2 R API using the built-in example
dataset. PONG2 is primarily a command-line tool
(pong2 impute, pong2 train) — see the other
vignettes for CLI usage. The R API demonstrated here is the underlying
interface used by the CLI.
library(PONG2)
#> ** initializing environment
#> PONG2 (Genotype Imputation with Attribute Bagging): v2.0.0
#> Supported by Streaming SIMD Extensions 2 (SSE2)
packageVersion("PONG2")
#> [1] '1.0.1'PONG2 includes a built-in example dataset with 50 samples and 200 SNPs in the KIR region (chr19), along with a pre-trained KIR3DL1 model.
# Load built-in example data
data(PONG2_example)
# SNP genotype data
cat("SNP genotype data:\n")
#> SNP genotype data:
cat(" Samples:", ncol(example_snp$genotype), "\n")
#> Samples: 50
cat(" SNPs: ", nrow(example_snp$genotype), "\n")
#> SNPs: 200
cat(" Assembly:", example_snp$assembly, "\n")
#> Assembly: hg19
# KIR allele table
cat("\nKIR allele table:\n")
#>
#> KIR allele table:
cat(" Locus: ", example_kir$locus, "\n")
#> Locus: KIR3DL1
cat(" Alleles:", length(example_kir$allele), "\n")
#> Alleles: 0
cat(" Samples:", length(example_kir$value$sample.id), "\n")
#> Samples: 50
head(example_kir$value, 3)
#> sample.id allele1 allele2
#> 1 S1 KIR3DL1*015 KIR3DL1*002
#> 2 S2 KIR3DL1*007 KIR3DL1*007
#> 3 S3 KIR3DL1*015 KIR3DL1*002# Load the pre-trained example model
model <- hlaModelFromObj(example_mobj)
cat("Model summary:\n")
#> Model summary:
cat(" KIR locus: ", model$hla.locus, "\n")
#> KIR locus: KIR3DL1
cat(" Classifiers: ", length(example_mobj$classifiers), "\n")
#> Classifiers: 10
cat(" Alleles: ", length(model$hla.allele), "\n")
#> Alleles: 5
# Predict KIR genotypes on example SNP data
pred <- kirPredict(
object = model,
snp = example_snp,
type = "response+prob",
verbose = FALSE
)
# View predictions
cat("\nFirst 5 predictions:\n")
#>
#> First 5 predictions:
head(pred$value, 5)
#> sample.id allele1 allele2 prob
#> 1 S1 KIR3DL1*002 KIR3DL1*015 0.4767310
#> 2 S2 KIR3DL1*007 KIR3DL1*007 0.4504548
#> 3 S3 KIR3DL1*002 KIR3DL1*015 0.6123777
#> 4 S4 KIR3DL1*002 KIR3DL1*015 0.6277634
#> 5 S5 KIR3DL1*002 KIR3DL1*005 0.6243871
# Call rate
called <- sum(!is.na(pred$value$allele1))
call_rate <- round(called / nrow(pred$value) * 100, 1)
cat("\nCall rate:", call_rate, "%\n")
#>
#> Call rate: 100 %
# Clean up model handle
hlaClose(model)# Set up parallel cluster
cl <- parallel::makeCluster(2)
# Train a small KIR3DL1 model using example data
model_trained <- kirParallelAttrBagging(
cl = cl,
hla = example_kir,
snp = example_snp,
nclassifier = 5,
verbose = FALSE
)
parallel::stopCluster(cl)
# View trained model summary
print(model_trained)
#> Gene: KIR3DL1
#> Training dataset: 50 samples X 200 SNPs
#> # of KIR3DL1/S1 alleles: 5
#> # of individual classifiers: 5
#> total # of SNPs used: 34
#> average # of SNPs in an individual classifier: 7.80, sd: 4.02, min: 2, max: 12
#> average # of haplotypes in an individual classifier: 100.80, sd: 73.52, min: 16, max: 181
#> average out-of-bag accuracy: 61.96%, sd: 7.55%, min: 52.27%, max: 71.05%
#> Genome assembly: hg19
# Save model as R object
mobj_trained <- hlaModelToObj(model_trained)
cat("\nModel saved with", length(mobj_trained$classifiers),
"classifiers\n")
#>
#> Model saved with 5 classifiers
# Clean up
hlaClose(model_trained)# Split example data into train and test
set.seed(42)
n <- length(example_kir$value$sample.id)
idx <- sample(1:n, floor(n * 0.7))
train_samples <- example_kir$value$sample.id[idx]
test_samples <- example_kir$value$sample.id[-idx]
# Subset KIR and SNP data
train_kir <- hlaAlleleSubset(example_kir,
match(train_samples, example_kir$value$sample.id))
test_kir <- hlaAlleleSubset(example_kir,
match(test_samples, example_kir$value$sample.id))
# Train model on training set
cl <- parallel::makeCluster(2)
model_eval <- kirParallelAttrBagging(
cl = cl,
hla = train_kir,
snp = example_snp,
nclassifier = 5,
verbose = FALSE
)
parallel::stopCluster(cl)
# Subset SNP data to test samples
test_snp_idx <- match(test_samples, example_snp$sample.id)
test_snp <- hlaGenoSubset(example_snp, samp.sel = test_snp_idx)
# Predict on test set
pred_eval <- kirPredict(
object = model_eval,
snp = test_snp,
type = "response+prob",
verbose = FALSE
)
# Compare predictions to truth
comp <- hlaCompareAllele(
TrueHLA = test_kir,
PredHLA = pred_eval,
call.threshold = 0.5,
verbose = FALSE
)
cat("Evaluation results:\n")
#> Evaluation results:
cat(" Haplotype accuracy:",
round(comp$overall$acc.haplo * 100, 1), "%\n")
#> Haplotype accuracy: NaN %
cat(" Genotype accuracy: ",
round(comp$overall$acc.ind * 100, 1), "%\n")
#> Genotype accuracy: NaN %
cat(" Call rate: ",
round(comp$overall$call.rate * 100, 1), "%\n")
#> Call rate: 0 %
# Clean up
hlaClose(model_eval)For the complete command-line workflow used in the manuscript, see:
vignette("PONG2-basics") — installation and CLI quick
startvignette("PONG2-imputation") — full imputation
workflowvignette("PONG2-training") — custom model trainingOr run directly from the terminal:
# These commands run from the terminal after adding pong2 to PATH
system("pong2 impute -i example/chr19 -o output -l KIR3DL1 -a hg19")
system("pong2 train --bfile example/chr19 --kfile example/kir_call.csv \
--output output --locus KIR3DL1 --assembly hg19")sessionInfo()
#> R version 4.5.2 (2025-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.5 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
#>
#> Random number generation:
#> RNG: L'Ecuyer-CMRG
#> Normal: Inversion
#> Sample: Rejection
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] PONG2_1.0.1
#>
#> loaded via a namespace (and not attached):
#> [1] digest_0.6.39 R6_2.6.1 fastmap_1.2.0 xfun_0.54
#> [5] cachem_1.1.0 parallel_4.5.2 knitr_1.51 htmltools_0.5.9
#> [9] rmarkdown_2.31 lifecycle_1.0.5 cli_3.6.6 sass_0.4.10
#> [13] jquerylib_0.1.4 compiler_4.5.2 rstudioapi_0.17.1 tools_4.5.2
#> [17] evaluate_1.0.5 bslib_0.10.0 yaml_2.3.12 otel_0.2.0
#> [21] jsonlite_2.0.0 rlang_1.1.7These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.