library(ggplot2); theme_set(theme_bw())
require(glmpca)
## Loading required package: glmpca
Simulate some data. Thanks to Jake Yeung for providing the original inspiration for the simulation. We create three biological groups (clusters) of 50 cells each. There are 5,000 total genes and of these we set 10% to be differentially expressed across clusters. We also create two batches, one with a high total count and the other with a low total count. Each batch has an equal number of cells from the three biological clusters. A successful dimension reduction will recover the three true clusters and avoid separating cells by batch.
set.seed(202)
ngenes <- 5000 #must be divisible by 10
ngenes_informative<-ngenes*.1
ncells <- 50 #number of cells per cluster, must be divisible by 2
nclust<- 3
# simulate two batches with different depths
batch<-rep(1:2, each = nclust*ncells/2)
ncounts <- rpois(ncells*nclust, lambda = 1000*batch)
# generate profiles for 3 clusters
profiles_informative <- replicate(nclust, exp(rnorm(ngenes_informative)))
profiles_const<-matrix(ncol=nclust,rep(exp(rnorm(ngenes-ngenes_informative)),nclust))
profiles <- rbind(profiles_informative,profiles_const)
# generate cluster labels
clust <- sample(rep(1:3, each = ncells))
# generate single-cell transcriptomes
counts <- sapply(seq_along(clust), function(i){
rmultinom(1, ncounts[i], prob = profiles[,clust[i]])
})
rownames(counts) <- paste("gene", seq(nrow(counts)), sep = "_")
colnames(counts) <- paste("cell", seq(ncol(counts)), sep = "_")
# clean up rows
Y <- counts[rowSums(counts) > 0, ]
sz<-colSums(Y)
Ycpm<-1e6*t(t(Y)/sz)
Yl2<-log2(1+Ycpm)
z<-log10(sz)
pz<-1-colMeans(Y>0)
cm<-data.frame(total_counts=sz,zero_frac=pz,clust=factor(clust),batch=factor(batch))
Run GLM-PCA on raw counts and standard PCA on log2(1+CPM).
L<-2 #number of latent dimensions
#Poisson likelihood
system.time(res1<-glmpca(Y,L,fam="poi",verbose=TRUE)) #about 4 seconds
## Iteration: 1 | deviance=4.974e+05
## Iteration: 2 | deviance=4.974e+05
## Iteration: 3 | deviance=4.919e+05
## Iteration: 4 | deviance=4.818e+05
## Iteration: 5 | deviance=4.8e+05
## Iteration: 6 | deviance=4.791e+05
## Iteration: 7 | deviance=4.781e+05
## Iteration: 8 | deviance=4.771e+05
## Iteration: 9 | deviance=4.762e+05
## Iteration: 10 | deviance=4.755e+05
## Iteration: 11 | deviance=4.751e+05
## Iteration: 12 | deviance=4.749e+05
## Iteration: 13 | deviance=4.748e+05
## Iteration: 14 | deviance=4.747e+05
## user system elapsed
## 3.040 0.568 3.189
pd1<-cbind(cm,res1$factors,dimreduce="glmpca-poi")
#negative binomial likelihood
system.time(res2<-glmpca(Y,L,fam="nb",verbose=TRUE)) #about 6 seconds
## Iteration: 1 | deviance=4.951e+05 | nb_theta: 100
## Iteration: 2 | deviance=4.951e+05 | nb_theta: 98.9
## Iteration: 3 | deviance=4.89e+05 | nb_theta: 98.5
## Iteration: 4 | deviance=4.797e+05 | nb_theta: 98.8
## Iteration: 5 | deviance=4.771e+05 | nb_theta: 99.3
## Iteration: 6 | deviance=4.752e+05 | nb_theta: 100
## Iteration: 7 | deviance=4.74e+05 | nb_theta: 101
## Iteration: 8 | deviance=4.733e+05 | nb_theta: 102
## Iteration: 9 | deviance=4.73e+05 | nb_theta: 103
## Iteration: 10 | deviance=4.728e+05 | nb_theta: 104
## Iteration: 11 | deviance=4.727e+05 | nb_theta: 106
## user system elapsed
## 4.822 0.512 4.992
pd2<-cbind(cm,res2$factors,dimreduce="glmpca-nb")
#standard PCA
system.time(res3<-prcomp(log2(1+t(Ycpm)),center=TRUE,scale.=TRUE,rank.=L)) #<0.5 sec
## user system elapsed
## 0.464 0.077 0.319
pca_factors<-res3$x
colnames(pca_factors)<-paste0("dim",1:L)
pd3<-cbind(cm,pca_factors,dimreduce="pca-logcpm")
pd<-rbind(pd1,pd2,pd3)
#visualize results
ggplot(pd,aes(x=dim1,y=dim2,colour=clust,shape=batch))+geom_point(size=4)+facet_wrap(~dimreduce,scales="free",nrow=3)
GLM-PCA identifies the three biological clusters and removes the batch effect. The result is the same whether we use the Poisson or negative binomial likelihood (although the latter is slightly slower). Standard PCA identifies the batch effect as the primary source of variation in the data, even after normalization. Application of a clustering algorithm to the PCA dimension reduction would identify incorrect clusters.
The glmpca function returns a list with several components. We will examine more closely the result of the negative binomial GLM-PCA.
nbres<-res2
names(nbres) #glmpca returns a list
## [1] "factors" "loadings" "coefX" "coefZ" "dev" "family"
dim(Y)
## [1] 4989 150
dim(nbres$factors)
## [1] 150 2
dim(nbres$loadings)
## [1] 4989 2
dim(nbres$coefX)
## [1] 4989 1
hist(nbres$coefX[,1],breaks=100,main="feature-specific intercepts")
plot(nbres$dev,type="b",main="trace plot of glmpca optimization",xlab="iteration")
nbres$family
##
## Family: Negative Binomial(107.058)
## Link function: log