The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
sim1000G allows for easy simulation of unrelated individuals starting from sequencing 1000 genomes varians.
The following, somewhat complicated example, showcases the following:
The original 1000 genomes VCF files are obtained from 1000 genomes ftp site, at the location:
http://hgdownload.cse.ucsc.edu/gbdb/hg19/1000Genomes/phase3/
sink(tempfile())
ped_file_1000genomes = system.file("examples", "20130606_g1k.ped", package = "sim1000G")
ped = read.table(ped_file_1000genomes,h=T,as=T,sep="\t")
pop1 = c("CEU","TSI","GBR")
id1 = ped$Individual.ID [ ped$Population %in% pop1 ]
id2 = ped$Individual.ID [ ped$Population == "ASW" ]
pop_map = ped$Population
names(pop_map) = ped$Individual.ID
write_sample_files = 0
if(write_sample_files == 1) {
cat(c(id1,id2),file="/tmp/samples1.txt",sep="\n")
# cat(c(id2),file="/tmp/samples2.txt",sep="\n")
}
We extract the CEU,TSI,GBR and ASW samples from a region of chromosome 4 from 73MBp to 74MBp using bcftools. The following command are run in the shell:
#77356278-77703432
INPUT_VCF=ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
bcftools view -S /tmp/samples1.txt -r 4:73000000-74000000 --force-samples $INPUT_VCF > /tmp/chr4-80.vcf
bcftools filter -i 'AF>0 && EUR_AF>0 && AFR_AF>0' < /tmp/chr4-80.vcf | gzip > /tmp/chr4-80-filt.vcf.gz
library(sim1000G)
vcf_file = "/tmp/chr4-80-filt.vcf.gz"
if(1) {
examples_dir = system.file("examples", package = "sim1000G")
vcf_file = file.path(examples_dir, "region-chr4-93-TMEM156.vcf.gz" )
}
vcf = readVCF(vcf_file, maxNumberOfVariants = 200 , min_maf = 0.02, max_maf = 0.32)
table( pop_map[ vcf$individual_ids ])
C = cor(t( vcf$gt1+vcf$gt2))^2
gplots::heatmap.2(C,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101))
#gplots::heatmap.2(cor(t(vcf2$gt1))^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none")
if(0) {
ids = vcf$individual_ids
id_pop1 = which(ids %in% id1)
id_pop2 = which(ids %in% id2)
gplots::heatmap.2(cor(t( vcf$gt1[,id_pop1]+vcf$gt2[,id_pop1]))^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101))
gplots::heatmap.2(cor(t( vcf$gt1[,id_pop2]+vcf$gt2[,id_pop2]))^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101))
}
#
startSimulation(vcf, subset = id1)
saveSimulation("pop1")
N = 200
id = c()
for(i in 1:N) id[i] = SIM$addUnrelatedIndividual()
genotypes = retrieveGenotypes(id)
gplots::heatmap.2(cor( genotypes )^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101))
startSimulation(vcf, subset = id2)
saveSimulation("pop2")
N = 200
id = c()
for(i in 1:N) id[i] = SIM$addUnrelatedIndividual()
genotypes2 = retrieveGenotypes(id)
gplots::heatmap.2(cor( genotypes2 )^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101))
library(SKAT)
loadSimulation("pop1")
plot(apply(genotypes,2,mean), apply(genotypes2,2,mean))
gt = rbind(genotypes,genotypes2)
#gt = genotypes
dim(gt)
maf = apply(gt,2,mean,na.rm=T)/2
apply(gt,2,function(x) sum(is.na(x)))
flip = which(maf > 0.5) ; gt[,flip] = 2 - gt[,flip]
#gt = genotypes
dim(gt)
maf = apply(gt,2,mean,na.rm=T)/2
plot(maf)
sum(maf==0)
apply(gt,2,function(x) sum(is.na(x)))
flip = which(maf > 0.5)
gt[,flip] = 2 - gt[,flip]
dim(gt)
effect_sizes = rep(0, ncol(gt))
nvar = length(effect_sizes)
s = sample(1:nvar, 33)
effect_sizes[s] = 5
apply(gt[,s],1,sum)
predictor2 = function(b, geno) {
x = b[1]
for(i in 1:ncol(geno)) { x = x + b[i+1] * ( geno[,i] > 0) + b[i+1] * ( geno[,i] > 1) }
exp(x) / (1+exp(x) )
}
p =predictor2 ( c(-1.5,effect_sizes) , gt)
phenotype = rbinom( length(p) , 1 , p )
#phenotype = sample(phenotype)
obj<-SKAT_Null_Model(phenotype ~ 1, out_type="D")
library(SKAT)
SKATBinary((gt),obj)$p.value
Through the functions readGeneticMap and downloadGeneticMap, we provide the functionality to automatically download genetic maps for GRCH37 build of the human genome.
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.