PONG2 R API: Direct R Usage with Example Data

The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

Norman Lab

Overview

This vignette demonstrates the PONG2 R API using the built-in example dataset. PONG2 is primarily a command-line tool (pong2 impute, pong2 train) — see the other vignettes for CLI usage. The R API demonstrated here is the underlying interface used by the CLI.

1. Installation and Setup

# From CRAN
install.packages("PONG2")

# From GitHub
remotes::install_github("NormanLabUCD/PONG2")

library(PONG2)
#> ** initializing environment
#> PONG2 (Genotype Imputation with Attribute Bagging): v2.0.0
#> Supported by Streaming SIMD Extensions 2 (SSE2)
packageVersion("PONG2")
#> [1] '1.0.1'

2. Example Data

PONG2 includes a built-in example dataset with 50 samples and 200 SNPs in the KIR region (chr19), along with a pre-trained KIR3DL1 model.

# Load built-in example data
data(PONG2_example)

# SNP genotype data
cat("SNP genotype data:\n")
#> SNP genotype data:
cat("  Samples:", ncol(example_snp$genotype), "\n")
#>   Samples: 50
cat("  SNPs:   ", nrow(example_snp$genotype), "\n")
#>   SNPs:    200
cat("  Assembly:", example_snp$assembly, "\n")
#>   Assembly: hg19

# KIR allele table
cat("\nKIR allele table:\n")
#> 
#> KIR allele table:
cat("  Locus:  ", example_kir$locus, "\n")
#>   Locus:   KIR3DL1
cat("  Alleles:", length(example_kir$allele), "\n")
#>   Alleles: 0
cat("  Samples:", length(example_kir$value$sample.id), "\n")
#>   Samples: 50
head(example_kir$value, 3)
#>   sample.id     allele1     allele2
#> 1        S1 KIR3DL1*015 KIR3DL1*002
#> 2        S2 KIR3DL1*007 KIR3DL1*007
#> 3        S3 KIR3DL1*015 KIR3DL1*002

3. KIR Genotype Prediction

# Load the pre-trained example model
model <- hlaModelFromObj(example_mobj)

cat("Model summary:\n")
#> Model summary:
cat("  KIR locus:   ", model$hla.locus, "\n")
#>   KIR locus:    KIR3DL1
cat("  Classifiers: ", length(example_mobj$classifiers), "\n")
#>   Classifiers:  10
cat("  Alleles:     ", length(model$hla.allele), "\n")
#>   Alleles:      5

# Predict KIR genotypes on example SNP data
pred <- kirPredict(
  object  = model,
  snp     = example_snp,
  type    = "response+prob",
  verbose = FALSE
)

# View predictions
cat("\nFirst 5 predictions:\n")
#> 
#> First 5 predictions:
head(pred$value, 5)
#>   sample.id     allele1     allele2      prob
#> 1        S1 KIR3DL1*002 KIR3DL1*015 0.4767310
#> 2        S2 KIR3DL1*007 KIR3DL1*007 0.4504548
#> 3        S3 KIR3DL1*002 KIR3DL1*015 0.6123777
#> 4        S4 KIR3DL1*002 KIR3DL1*015 0.6277634
#> 5        S5 KIR3DL1*002 KIR3DL1*005 0.6243871

# Call rate
called    <- sum(!is.na(pred$value$allele1))
call_rate <- round(called / nrow(pred$value) * 100, 1)
cat("\nCall rate:", call_rate, "%\n")
#> 
#> Call rate: 100 %

# Clean up model handle
hlaClose(model)

4. Model Training

# Set up parallel cluster
cl <- parallel::makeCluster(2)

# Train a small KIR3DL1 model using example data
model_trained <- kirParallelAttrBagging(
  cl          = cl,
  hla         = example_kir,
  snp         = example_snp,
  nclassifier = 5,
  verbose     = FALSE
)

parallel::stopCluster(cl)

# View trained model summary
print(model_trained)
#> Gene: KIR3DL1
#> Training dataset: 50 samples X 200 SNPs
#>  # of KIR3DL1/S1 alleles: 5
#>  # of individual classifiers: 5
#>  total # of SNPs used: 34
#>  average # of SNPs in an individual classifier: 7.80, sd: 4.02, min: 2, max: 12
#>  average # of haplotypes in an individual classifier: 100.80, sd: 73.52, min: 16, max: 181
#>  average out-of-bag accuracy: 61.96%, sd: 7.55%, min: 52.27%, max: 71.05%
#> Genome assembly: hg19

# Save model as R object
mobj_trained <- hlaModelToObj(model_trained)
cat("\nModel saved with", length(mobj_trained$classifiers), 
    "classifiers\n")
#> 
#> Model saved with 5 classifiers

# Clean up
hlaClose(model_trained)

5. Model Evaluation

# Split example data into train and test
set.seed(42)
n       <- length(example_kir$value$sample.id)
idx     <- sample(1:n, floor(n * 0.7))
train_samples <- example_kir$value$sample.id[idx]
test_samples  <- example_kir$value$sample.id[-idx]

# Subset KIR and SNP data
train_kir <- hlaAlleleSubset(example_kir,
               match(train_samples, example_kir$value$sample.id))
test_kir  <- hlaAlleleSubset(example_kir,
               match(test_samples, example_kir$value$sample.id))

# Train model on training set
cl <- parallel::makeCluster(2)
model_eval <- kirParallelAttrBagging(
  cl          = cl,
  hla         = train_kir,
  snp         = example_snp,
  nclassifier = 5,
  verbose     = FALSE
)
parallel::stopCluster(cl)

# Subset SNP data to test samples
test_snp_idx <- match(test_samples, example_snp$sample.id)
test_snp     <- hlaGenoSubset(example_snp, samp.sel = test_snp_idx)

# Predict on test set
pred_eval <- kirPredict(
  object  = model_eval,
  snp     = test_snp,
  type    = "response+prob",
  verbose = FALSE
)

# Compare predictions to truth
comp <- hlaCompareAllele(
  TrueHLA        = test_kir,
  PredHLA        = pred_eval,
  call.threshold = 0.5,
  verbose        = FALSE
)

cat("Evaluation results:\n")
#> Evaluation results:
cat("  Haplotype accuracy:", 
    round(comp$overall$acc.haplo * 100, 1), "%\n")
#>   Haplotype accuracy: NaN %
cat("  Genotype accuracy: ", 
    round(comp$overall$acc.ind * 100, 1), "%\n")
#>   Genotype accuracy:  NaN %
cat("  Call rate:         ", 
    round(comp$overall$call.rate * 100, 1), "%\n")
#>   Call rate:          0 %

# Clean up
hlaClose(model_eval)

6. CLI Usage

For the complete command-line workflow used in the manuscript, see:

vignette("PONG2-basics") — installation and CLI quick start
vignette("PONG2-imputation") — full imputation workflow
vignette("PONG2-training") — custom model training

Or run directly from the terminal:

# These commands run from the terminal after adding pong2 to PATH
system("pong2 impute -i example/chr19 -o output -l KIR3DL1 -a hg19")
system("pong2 train --bfile example/chr19 --kfile example/kir_call.csv \
        --output output --locus KIR3DL1 --assembly hg19")

Session Info

sessionInfo()
#> R version 4.5.2 (2025-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.5 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0
#> 
#> Random number generation:
#>  RNG:     L'Ecuyer-CMRG 
#>  Normal:  Inversion 
#>  Sample:  Rejection 
#>  
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] PONG2_1.0.1
#> 
#> loaded via a namespace (and not attached):
#>  [1] digest_0.6.39     R6_2.6.1          fastmap_1.2.0     xfun_0.54        
#>  [5] cachem_1.1.0      parallel_4.5.2    knitr_1.51        htmltools_0.5.9  
#>  [9] rmarkdown_2.31    lifecycle_1.0.5   cli_3.6.6         sass_0.4.10      
#> [13] jquerylib_0.1.4   compiler_4.5.2    rstudioapi_0.17.1 tools_4.5.2      
#> [17] evaluate_1.0.5    bslib_0.10.0      yaml_2.3.12       otel_0.2.0       
#> [21] jsonlite_2.0.0    rlang_1.1.7

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.