sim1000G: Simulating family data from 1000 genomes haplotypes

Apostolos Dimitromanolakis

2017-07-19

Introduction and examples

output: prettydoc::html_pretty: theme: cayman highlight: github

Installing the package and dependencies (if installing from source)

sim1000G depends on two R packages, stringr and hapsim. stringr is available on CRAN and can be installed as usual. hapsim can be installed from its source package.

# stringr is installed from CRAN
install.packages("stringr")
# hapsim is installed from a local source tarball. `type = "source"` is the
# documented way to request a source install; install.packages() has no
# `source` argument.
install.packages("hapsim_0.31.tar.gz", repos = NULL, type = "source")

The sim1000G package can then be installed:

# Install sim1000G from a local source tarball. `type = "source"` is the
# documented way to request a source install; install.packages() has no
# `source` argument.
install.packages("sim1000G_1.04.tar.gz", repos = NULL, type = "source")

Genetic Map

The genetic map we use is from the HapMap 2010 release, lifted over to GRCh37 coordinates. It was downloaded from:

ftp://ftp.ncbi.nlm.nih.gov/hapmap/recombination/2011-01_phaseII_B37/

Reading a VCF file and starting the simulation

Before starting the simulator, a VCF file of the region of interest is needed. The VCF file is used to provide the haplotypes that will be used for the simulator.

For this example we use an unfiltered region from 1000 genomes Phase III sequencing data VCF, chromosome 4, CEU samples.

We also need to initialize all the simulator internal structures with the command startSimulation.

The following parameters can be set:

library(sim1000G)
## Loading required package: hapsim
## Loading required package: MASS
## Loading required package: stringr
# Fetch the example VCF region and save it as region.vcf.gz in the working directory
download.file("https://adimitromanolakis.github.io/sim1000G/data/region.vcf.gz", destfile = "region.vcf.gz")


# Read the VCF, keeping at most 100 variants (the log below reports 100 variants
# after filtering). min_maf / max_maf are presumably minor-allele-frequency
# bounds, with NA meaning no upper bound -- confirm against ?readVCF
vcf = readVCF("region.vcf.gz", maxNumberOfVariants = 100 , min_maf = 0.02 , max_maf = NA)


# Load the genetic map for chromosome 4.
# NOTE(review): the commented-out call presumably downloads the map first if it
# is not available locally -- confirm
#downloadGeneticMap(4)
readGeneticMap( chromosome = 4 )

# Initialise the simulator's internal structures from the VCF.
# NOTE(review): totalNumberOfIndividuals presumably caps how many individuals
# can be simulated -- confirm against ?startSimulation
startSimulation(vcf, totalNumberOfIndividuals = 1200)
## [#.......] Reading VCF file..
## [##......] Chromosome:   4  Mbp:  77.35631  Region Size:  348.083 kb  Num of variants: 9038 
## [###.....] Filtering and thinning variants
## [##......] Chromosome:   4  Mbp:  77.35692  Region Size:  338.941 kb  Num of variants: 100 (after filtering)
##       -> Genetic map has 211115 entries
## [1] 0
## [#####...] Creating SIM object
## [#####...] Haplodata object created

Generating unrelated individuals

New founder individuals are generated with the function SIM$addUnrelatedIndividual(). The function returns the index of the individual generated.

After the individual is generated, its haplotypes are available in the arrays SIM$gt1[i,] and SIM$gt2[i,].

An example generating 30 individuals is shown below; the genotype matrix is then built from the first 20 of them.

# Reset the simulator before generating a new set of individuals
SIM$reset()


# Generate 30 founders; id[i] holds the index returned for the i-th individual.
# The vector is preallocated rather than grown inside the loop.
id = numeric(30)
for (i in seq_len(30)) id[i] = SIM$addUnrelatedIndividual()

# Show haplotype 1 of first 5 individuals
#print(SIM$gt1[1:5,1:6])

# Show haplotype 2 of first 5 individuals
#print(SIM$gt2[1:5,1:6])



# Genotypes of the first 20 individuals: element-wise sum of the two haplotype arrays
genotypes = SIM$gt1[1:20, ] + SIM$gt2[1:20, ]

print(dim(genotypes))

str(genotypes)

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Heatmap of the squared pairwise correlations between the columns of `genotypes`.
# Rowv = F / Colv = F keep rows and columns in their original order; because the
# `dendrogram` argument is left at its default, heatmap.2 emits the
# "Discrepancy" warnings shown in the output below.
heatmap.2(cor(genotypes)^2, col=rev(heat.colors(100)) , trace="none",Rowv=F,Colv=F)
## Warning in cor(genotypes): the standard deviation is zero
## Warning in heatmap.2(cor(genotypes)^2, col = rev(heat.colors(100)), trace =
## "none", : Discrepancy: Rowv is FALSE, while dendrogram is `both'. Omitting
## row dendogram.
## Warning in heatmap.2(cor(genotypes)^2, col = rev(heat.colors(100)), trace
## = "none", : Discrepancy: Colv is FALSE, while dendrogram is `column'.
## Omitting column dendogram.

## Generate new individual pool n=200
## Adding individual  1  from pool
## Adding individual  2  from pool
## Adding individual  3  from pool
## Adding individual  4  from pool
## Adding individual  5  from pool
## Adding individual  6  from pool
## Adding individual  7  from pool
## Adding individual  8  from pool
## Adding individual  9  from pool
## Adding individual  10  from pool
## Adding individual  11  from pool
## Adding individual  12  from pool
## Adding individual  13  from pool
## Adding individual  14  from pool
## Adding individual  15  from pool
## Adding individual  16  from pool
## Adding individual  17  from pool
## Adding individual  18  from pool
## Adding individual  19  from pool
## Adding individual  20  from pool
## Adding individual  21  from pool
## Adding individual  22  from pool
## Adding individual  23  from pool
## Adding individual  24  from pool
## Adding individual  25  from pool
## Adding individual  26  from pool
## Adding individual  27  from pool
## Adding individual  28  from pool
## Adding individual  29  from pool
## Adding individual  30  from pool
## [1]  20 100
##  num [1:20, 1:100] 0 1 0 1 0 0 0 0 0 0 ...

Simulating genotypes within families

For related individuals in pedigrees, we simulate meiotic recombination by using the function SIM$mate(i,j).

Below we show an example of how to simulate multiple families with 2 offspring each (the code shown generates 10 families; increase the count to simulate 100).

In addition we write the output to a PED/MAP file in plink format, for further analysis.

# Simulate one family with 2 offspring

fam = newFamilyWithOffspring("fam1",2)
print(fam)



# Simulate 100 families
 
SIM$reset()


## For testing the IBD, we set the cM positions so that the region spans 0-4000 cM
## Remove for normal use
## (`length.out` is spelled out instead of relying on partial argument matching)
SIM$cm = seq(0, 4000, length.out = length(SIM$cm))



# Simulate `nfamilies` families with 2 offspring each and return them
# row-bound into a single object.
#
# Arguments:
#   nfamilies  number of families to simulate. Defaults to 10, the value that
#              was previously hard-coded (despite the function's name); pass
#              100 to actually simulate 100 families.
#
# Each family is created with newFamilyWithOffspring(x, 2), where x (the
# family's position in 1..nfamilies) is used as the family id.
# Returns NULL when nfamilies is 0.
time100families = function(nfamilies = 10) {

    families = lapply(seq_len(nfamilies), function(x) newFamilyWithOffspring(x, 2))

    do.call(rbind, families)
}

fam <- time100families() 


writePED(vcf, fam,"/tmp/out")
## Adding individual  31  from pool
## Adding individual  32  from pool
## Adding individual  33  from specified genotypes
## Adding individual  34  from specified genotypes
##    fid id father mother sex gtindex
## 1 fam1  1      0      0   1      31
## 2 fam1  2      0      0   2      32
## 3 fam1 11      1      2   1      33
## 4 fam1 12      1      2   1      34
## Adding individual  1  from pool
## Adding individual  2  from pool
## Adding individual  3  from specified genotypes
## Adding individual  4  from specified genotypes
## Adding individual  5  from pool
## Adding individual  6  from pool
## Adding individual  7  from specified genotypes
## Adding individual  8  from specified genotypes
## Adding individual  9  from pool
## Adding individual  10  from pool
## Adding individual  11  from specified genotypes
## Adding individual  12  from specified genotypes
## Adding individual  13  from pool
## Adding individual  14  from pool
## Adding individual  15  from specified genotypes
## Adding individual  16  from specified genotypes
## Adding individual  17  from pool
## Adding individual  18  from pool
## Adding individual  19  from specified genotypes
## Adding individual  20  from specified genotypes
## Adding individual  21  from pool
## Adding individual  22  from pool
## Adding individual  23  from specified genotypes
## Adding individual  24  from specified genotypes
## Adding individual  25  from pool
## Adding individual  26  from pool
## Adding individual  27  from specified genotypes
## Adding individual  28  from specified genotypes
## Adding individual  29  from pool
## Adding individual  30  from pool
## Adding individual  31  from specified genotypes
## Adding individual  32  from specified genotypes
## Adding individual  33  from pool
## Adding individual  34  from pool
## Adding individual  35  from specified genotypes
## Adding individual  36  from specified genotypes
## Adding individual  37  from pool
## Adding individual  38  from pool
## Adding individual  39  from specified genotypes
## Adding individual  40  from specified genotypes
## [] PED file written as  /tmp/out.ped

Computing the IBD matrices

The simulator tracks the locations of all the ancestral alleles in 2 separate arrays. These can be used to compute the IBD1 and IBD2 matrices in arbitrary pedigrees.

Unfortunately, tracking the ancestral alleles makes the simulator a lot slower, so if we don’t need this functionality, we can remove it later.

# Number of individuals generated so far in the simulation
n = SIM$individuals_generated

# Pairwise IBD matrices: element [x, y] holds the first (IBD1) or second (IBD2)
# element of computePairIBD12(x, y). Row names are "1".."n".
# Each pair is evaluated once and both values are stored, instead of calling
# computePairIBD12() separately for each matrix. The matrices are preallocated
# so the result is always an n x n matrix (including n = 0 or n = 1).
IBD1matrix = matrix(0, nrow = n, ncol = n,
                    dimnames = list(as.character(seq_len(n)), NULL))
IBD2matrix = IBD1matrix

for (y in seq_len(n)) {
    for (x in seq_len(n)) {
        pairIBD = computePairIBD12(x, y)
        IBD1matrix[x, y] = pairIBD[1]
        IBD2matrix[x, y] = pairIBD[2]
    }
}

IBD1 matrix

colnames(IBD1matrix) = 1:nrow(IBD1matrix)
rownames(IBD1matrix) = 1:nrow(IBD1matrix)
colnames(IBD2matrix) = 1:nrow(IBD2matrix)
rownames(IBD2matrix) = 1:nrow(IBD2matrix)

knitr::kable(IBD1matrix[1:8,1:8] )
1 2 3 4 5 6 7 8
0 0 1.0 1.0 0 0 0.00 0.00
0 0 1.0 1.0 0 0 0.00 0.00
1 1 0.0 0.5 0 0 0.00 0.00
1 1 0.5 0.0 0 0 0.00 0.00
0 0 0.0 0.0 0 0 1.00 1.00
0 0 0.0 0.0 0 0 1.00 1.00
0 0 0.0 0.0 1 1 0.00 0.43
0 0 0.0 0.0 1 1 0.43 0.00

IBD2 matrix

colnames(IBD1matrix) = 1:nrow(IBD1matrix)
rownames(IBD1matrix) = 1:nrow(IBD1matrix)
colnames(IBD2matrix) = 1:nrow(IBD2matrix)
rownames(IBD2matrix) = 1:nrow(IBD2matrix)

knitr::kable(IBD2matrix[1:8,1:8] )
1 2 3 4 5 6 7 8
1 0 0.00 0.00 0 0 0.00 0.00
0 1 0.00 0.00 0 0 0.00 0.00
0 0 1.00 0.22 0 0 0.00 0.00
0 0 0.22 1.00 0 0 0.00 0.00
0 0 0.00 0.00 1 0 0.00 0.00
0 0 0.00 0.00 0 1 0.00 0.00
0 0 0.00 0.00 0 0 1.00 0.32
0 0 0.00 0.00 0 0 0.32 1.00

Simulating data in multi-generational pedigrees

The function used to generate family data can be extended to simulate arbitrary pedigrees. It is shown below:

# Simulate one nuclear family: two unrelated founders and their offspring.
#
# Arguments:
#   familyid    family identifier, stored in the `fid` column
#   noffspring  number of offspring to simulate (default 2); 0 returns the
#               two founders only
#
# Returns a data frame with one row per individual and the columns
# fid, id, father, mother, sex, gtindex. Founders get ids 1 and 2 with
# father = mother = 0; offspring get ids 11, 12, ... with father = 1,
# mother = 2 and sex = 1. `gtindex` holds the position of the individual's
# genotypes in the arrays SIM$gt1 and SIM$gt2.
#
# Uses the global simulator object SIM. The offspring rows are built as a
# data frame (not a c() vector), so the numeric columns stay numeric even
# when `familyid` is a character string.
newFamilyWithOffspring = function(familyid, noffspring = 2) {

    fam = data.frame(fid = familyid,
                     id = c(1, 2),
                     father = c(0, 0),
                     mother = c(0, 0),
                     sex = c(1, 2)
    )

    # Founders are drawn as unrelated individuals
    j1 = SIM$addUnrelatedIndividual()
    j2 = SIM$addUnrelatedIndividual()

    fam$gtindex = c(j1, j2) # Holds the genotype position in the arrays SIM$gt1 and SIM$gt2

    # seq_len() below would be empty anyway; return early to skip building
    # a zero-row offspring table
    if (noffspring < 1) {
        return(fam)
    }

    # One SIM$mate() call per offspring, in order, into a preallocated vector
    offspringIndex = numeric(noffspring)
    for (i in seq_len(noffspring)) {
        offspringIndex[i] = SIM$mate(j1, j2)
    }

    offspring = data.frame(fid = familyid,
                           id = seq_len(noffspring) + 10,
                           father = 1,
                           mother = 2,
                           sex = 1,
                           gtindex = offspringIndex
    )

    rbind(fam, offspring)
}

Simulating a 3 generational pedigree and computing IBD1/2 matrices

In this example, we generate a pedigree with 6 individuals, across 3 generations. After that, we compute the IBD matrices of the family.

# Reset simulation
SIM$reset()



# Set the region size in cM (0-4000 cM, for testing the correctness of the function)
# (`length.out` is spelled out instead of relying on partial argument matching)
SIM$cm = seq(0, 4000, length.out = length(SIM$cm))



A = SIM$addUnrelatedIndividual()
B = SIM$addUnrelatedIndividual()
C = SIM$mate(A,B)
D = SIM$mate(A,B)
G = SIM$addUnrelatedIndividual()
E = SIM$mate(G,C)



computePairIBD12(C,D)
computePairIBD12(E,A)


# Number of individuals generated so far in the simulation
n = SIM$individuals_generated

# Pairwise IBD matrices: element [x, y] holds the first (IBD1) or second (IBD2)
# element of computePairIBD12(x, y). Row names are "1".."n".
# Each pair is evaluated once and both values are stored, instead of calling
# computePairIBD12() separately for each matrix. The matrices are preallocated
# so the result is always an n x n matrix (including n = 0 or n = 1).
IBD1matrix = matrix(0, nrow = n, ncol = n,
                    dimnames = list(as.character(seq_len(n)), NULL))
IBD2matrix = IBD1matrix

for (y in seq_len(n)) {
    for (x in seq_len(n)) {
        pairIBD = computePairIBD12(x, y)
        IBD1matrix[x, y] = pairIBD[1]
        IBD2matrix[x, y] = pairIBD[2]
    }
}



printMatrix(IBD1matrix)
## Adding individual  1  from pool
## Adding individual  2  from pool
## Adding individual  3  from specified genotypes
## Adding individual  4  from specified genotypes
## Adding individual  5  from pool
## Adding individual  6  from specified genotypes
## IBD1 IBD2 
## 0.44 0.35 
## IBD1 IBD2 
##  0.6  0.0 
##          [   1]  [   2]  [   3]  [   4]  [   5]  [   6]  
## [   1]    0.000   0.000   1.000   1.000   0.000   0.600  
## [   2]    0.000   0.000   1.000   1.000   0.000   0.400  
## [   3]    1.000   1.000   0.000   0.440   0.000   1.000  
## [   4]    1.000   1.000   0.440   0.000   0.000   0.570  
## [   5]    0.000   0.000   0.000   0.000   0.000   1.000  
## [   6]    0.600   0.400   1.000   0.570   1.000   0.000