Downloading necessary files

The 1000 Genomes Project VCF files can be downloaded from:

http://hgdownload.cse.ucsc.edu/gbdb/hg19/1000Genomes/phase3/

The genetic map can be downloaded through sim1000G.

Selecting the target population (CEU)

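We extract the European samples (CEU, TSI and GBR) from a region of chromosome 4 using bcftools. Note that the command shown below extracts the region 4:73000000-74000000, although the output file names refer to 80 Mbp: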

# Pedigree table shipped in the "examples" directory of the sim1000G package.
ped_file_1000genomes <- system.file(
  "examples", "20130606_g1k.ped",
  package = "sim1000G"
)

# Tab-separated file with a header row; as.is = TRUE keeps text columns as
# character vectors instead of converting them to factors.
ped <- read.table(
  ped_file_1000genomes,
  header = TRUE,
  as.is = TRUE,
  sep = "\t"
)

# Sample list 1: individuals whose population label is CEU, TSI or GBR.
pop1 <- c("CEU", "TSI", "GBR")
in_pop1 <- ped$Population %in% pop1
id1 <- ped$Individual.ID[in_pop1]

# One ID per line, in the layout consumed by `bcftools view -S`.
cat(id1, file = "/tmp/samples1.txt", sep = "\n")


# Sample list 2: individuals whose population label is ASW.
is_asw <- ped$Population == "ASW"
id2 <- ped$Individual.ID[is_asw]

cat(id2, file = "/tmp/samples2.txt", sep = "\n")

# Lookup vector: population label named by individual ID.
pop_map <- setNames(ped$Population, ped$Individual.ID)

#77356278-77703432

# Input genotype VCF for chromosome 4 and the files used by the pipeline.
INPUT_VCF=ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
SAMPLE_LIST=/tmp/samples1.txt
REGION=4:73000000-74000000
REGION_VCF=~/tmp/chr4-80.vcf
FILTERED_VCF=~/tmp/chr4-80-filt.vcf.gz

# Step 1: keep only the listed samples inside the region. --force-samples
# makes bcftools warn instead of abort when a listed sample is not in the VCF.
bcftools view --samples-file "$SAMPLE_LIST" --regions "$REGION" --force-samples "$INPUT_VCF" > "$REGION_VCF"

# Step 2: keep variants whose AF, EUR_AF and AFR_AF are all above zero,
# then gzip the result.
bcftools filter --include 'AF>0 && EUR_AF>0 && AFR_AF>0' < "$REGION_VCF" | gzip > "$FILTERED_VCF"

## Loading required package: hapsim
## Loading required package: MASS
## Loading required package: stringr
## Loading required package: readr
## [#.......] Reading VCF file..
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `#CHROM` = col_integer(),
##   POS = col_integer(),
##   QUAL = col_integer()
## )
## See spec(...) for full column specifications.
## [##......] Chromosome:   4  Mbp:  38.96839  Region Size:  65.587 kb  Num of individuals: 267 
## [##......] Before filtering  Num of variants: 2102 Num of individuals: 267 
## [1] 0 1
## [1] 2102
## [###.....] After filtering  Num of variants: 200 Num of individuals: 267
## 
## ASW CEU TSI 
##  61  99 107

plot of chunk unnamed-chunk-3

library(SKAT)

# Restore the simulation saved under the name "pop1".
# NOTE(review): `genotypes` and `genotypes2` are presumably made available by
# this call or by an earlier chunk not shown here — confirm.
loadSimulation("pop1")

# Scatter plot of per-variant mean genotype: first matrix vs second matrix.
plot(
  apply(genotypes, 2, mean),
  apply(genotypes2, 2, mean)
)

# Stack the two genotype matrices row-wise (individuals in rows).
gt <- rbind(genotypes, genotypes2)

#gt = genotypes

dim(gt)

# Per-variant allele frequency: mean genotype divided by two.
maf <- apply(gt, 2, mean, na.rm = TRUE) / 2
# Number of missing genotypes per variant.
apply(gt, 2, function(column) sum(is.na(column)))

# Recode variants with frequency above 0.5 so the counted allele is the
# minor one.
flip <- which(maf > 0.5)
gt[, flip] <- 2 - gt[, flip]


#gt = genotypes

dim(gt)

# Recompute the frequencies on the recoded matrix and inspect them.
maf <- apply(gt, 2, mean, na.rm = TRUE) / 2
plot(maf)

# Count of variants with frequency exactly zero.
sum(maf == 0)


apply(gt, 2, function(column) sum(is.na(column)))



# Second recoding pass, driven by the recomputed frequencies.
flip <- which(maf > 0.5)
gt[, flip] <- 2 - gt[, flip]


dim(gt)


# One effect size per variant (column of gt); all start at zero.
nvar <- ncol(gt)
effect_sizes <- rep(0, nvar)
nvar <- length(effect_sizes)

# Pick 33 variants at random and give each an effect size of 5.
s <- sample(seq_len(nvar), 33)
effect_sizes[s] <- 5


# Per-individual sum of genotypes over the selected variants.
apply(gt[, s], 1, sum)




#' Logistic risk predictor built from genotype threshold indicators
#'
#' Starts the linear predictor at `b[1]` and, for every column `i` of `geno`,
#' adds `b[i + 1]` once when the genotype is greater than 0 and once more
#' when it is greater than 1. The linear predictor is then mapped through the
#' logistic function.
#'
#' @param b Numeric vector holding the intercept followed by one coefficient
#'   per column of `geno`; must have at least `ncol(geno) + 1` elements.
#' @param geno Genotype matrix, one row per individual and one column per
#'   variant (assumed to be coded 0/1/2 — TODO confirm against caller).
#' @return Numeric vector of probabilities, one per row of `geno`.
predictor2 <- function(b, geno) {
  # A short coefficient vector would otherwise index past its end and
  # silently turn every probability into NA.
  stopifnot(length(b) >= ncol(geno) + 1)

  # Start from the intercept for every individual so that a zero-column
  # matrix still yields one probability per row.
  x <- rep(unname(b[1]), nrow(geno))

  # seq_len() iterates zero times for a zero-column matrix, where
  # 1:ncol(geno) would count down through 1 and 0.
  for (i in seq_len(ncol(geno))) {
    g <- geno[, i]
    x <- x + b[i + 1] * (g > 0) + b[i + 1] * (g > 1)
  }

  # plogis() is the logistic function; unlike exp(x) / (1 + exp(x)) it does
  # not evaluate to Inf / Inf = NaN when x is large.
  plogis(x)
}

# Case probability per individual: intercept -1.5 followed by the
# per-variant effect sizes.
coefficients_with_intercept <- c(-1.5, effect_sizes)
p <- predictor2(coefficients_with_intercept, gt)



# Draw one 0/1 phenotype per individual with success probability p.
phenotype <- rbinom(length(p), 1, p)

#phenotype = sample(phenotype)
# Intercept-only null model; out_type = "D" is SKAT's setting for a
# dichotomous outcome.
obj <- SKAT_Null_Model(phenotype ~ 1, out_type = "D")


library(SKAT)
# p-value of the SKAT test for a binary trait over all variants in gt.
SKATBinary(gt, obj)$p.value