The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

Smoking data

Rob Dunne

Saturday, August 8, 2020

A small example using the smoking data set, containing normalized transcript measurements for 51 subjects (23 “never-smoked” and 34 “smokers”) and 22283 transcripts of lung tissue. See Spira et al. (2004)

I have removed that data set temporarily until I solve a problem with upload the package to CRAN

load the data

library(RFlocalfdr.data)
data(smoking)
?smoking 
y<-smoking$y
smoking_data<-smoking$rma
y.numeric <-ifelse((y=="never-smoked"),0,1)

fit a ranger model

library(ranger)
rf1 <-ranger(y=y.numeric ,x=smoking_data,importance="impurity",seed=123, num.trees = 10000,
             classification=TRUE)
t2 <-count_variables(rf1)
imp<-log(rf1$variable.importance)
#png("./supp_figures/smoking_log_importances.png")
plot(density(imp),xlab="log importances",main="")
#dev.off()
A small simulated data set. Each band contains blocks of size {1, 2, 4, 8, 16, 32, 64}, and each block consists of correlated (identical variables).

A small simulated data set. Each band contains blocks of size {1, 2, 4, 8, 16, 32, 64}, and each block consists of correlated (identical variables).

Determine a cutoff to get a unimodal density.

See @ref(fig:log_importances) for the log importances. They are clearly multimodal and we try to determine a cutoff so that we are left with a unimodal distribution.

cutoffs <- c(2,3,4,5)
#png("./supp_figures/smoking_data_determine_cutoff.png")
res.con<- determine_cutoff(imp,t2,cutoff=cutoffs,plot=c(2,3,4,5))
#dev.off()

#png("./supp_figures/smoking_data_determine_cutoffs_2.png")
plot(cutoffs,res.con[,3],pch=15,col="red",cex=1.5,ylab="max(abs(y - t1))")
#dev.off()
cutoffs[which.min(res.con[,3])]

fit RFlocalfdr

We select a cutoff of 3 and fit the RFlocalfdr model

temp<-imp[t2 > 3]
temp <- temp - min(temp) + .Machine$double.eps
qq <- plotQ(temp,debug.flag = 1)
ppp<-run.it.importances(qq,temp,debug.flag = 0)


png("./supp_figures/smoking_significant_genes.png")
#aa<-significant.genes(ppp,temp,cutoff=0.05,do.plot=1)
aa<-significant.genes(ppp,temp,cutoff=0.05,debug.flag=0,do.plot=TRUE,use_95_q=TRUE)
dev.off()
length(aa$probabilities) # 17

aa<-significant.genes(ppp,temp,cutoff=0.05,debug.flag=0,do.plot=TRUE,use_95_q=FALSE)
length(aa$probabilities) # 19

The option do.plot=1 returns a plot containing the

The option do.plot=2 returns the same plot with the addition of

sessionInfo()
devtools::install_github("parsifal9/RFlocalfdr", build_vignettes = TRUE, force = TRUE)

library(RFlocalfdr)
data(smoking)
?smoking 
y<-smoking$y
smoking_data<-smoking$rma
y.numeric <-ifelse((y=="never-smoked"),0,1)

library(ranger)
rf1 <-ranger(y=y.numeric ,x=smoking_data,importance="impurity",seed=123, num.trees = 10000,
             classification=TRUE)
t2 <-count_variables(rf1)
imp<-log(rf1$variable.importance)
#png("./supp_figures/smoking_log_importances.png")
plot(density(imp),xlab="log importances",main="")
#dev.off()



cutoffs <- c(2,3,4,5)
#png("./supp_figures/smoking_data_determine_cutoff.png")
res.con<- determine_cutoff(imp,t2,cutoff=cutoffs,plot=c(2,3,4,5))
#dev.off()

#png("./supp_figures/smoking_data_determine_cutoffs_2.png")
plot(cutoffs,res.con[,3],pch=15,col="red",cex=1.5,ylab="max(abs(y - t1))")
#dev.off()
cutoffs[which.min(res.con[,3])]


temp<-imp[t2 > 3]
temp <- temp - min(temp) + .Machine$double.eps
qq <- plotQ(temp,debug.flag = 1)
ppp<-run.it.importances(qq,temp,debug.flag = 0)


#png("./supp_figures/smoking_significant_genes.png")
#aa<-significant.genes(ppp,temp,cutoff=0.05,do.plot=1)
aa<-significant.genes(ppp,temp,cutoff=0.05,debug.flag=0,do.plot=TRUE,use_95_q=TRUE)
#dev.off()
length(aa$probabilities) # 17  -- Roc gets 30

aa<-significant.genes(ppp,temp,cutoff=0.05,debug.flag=0,do.plot=TRUE,use_95_q=FALSE)
length(aa$probabilities) # 19-- Roc gets 30
Spira, A. E., J. Beane, V. Pinto-Plata, A. Kadar, G. Liu, V. Shah, B. Celli, and J. S. Brody. 2004. “Gene Expression Profiling of Human Lung Tissue from Smokers with Severe Emphysema.” Am J Respir Cell Mol Biol.

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.