factorMerger: TCGA use case

Przemyslaw Biecek

2017-03-22

Data preparation

library(RTCGA.PANCAN12)
library(factorMerger) 

# Load the two expression tables and the clinical table as datasets.
data("expression.cb1")
data("expression.cb2")
data("clinical.cb")

# Stack both expression tables, promote the first column to row names, then
# transpose so the former row names become column names and the former
# columns become rows.
expression.cb <- rbind(expression.cb1, expression.cb2)
rownames(expression.cb) <- expression.cb[, 1]
expression.cb <- data.frame(t(expression.cb[, -1]))

# Build the join key from the row names: every literal "." becomes "-"
# (fixed = TRUE, so "." is not treated as a regular expression).
expression.cb$sampleID <- gsub(".", "-", rownames(expression.cb), fixed = TRUE)

# Keep every column whose name contains "HSP", plus the join key.
selectedCols <- c(
  grep("HSP", colnames(expression.cb), value = TRUE),
  "sampleID"
)

# Attach the X_cohort column of the clinical table to each sample.
selected <- merge(
  expression.cb[, selectedCols],
  clinical.cb[, c("sampleID", "X_cohort")],
  by = "sampleID"
)

# Cancer is X_cohort (as character) with its first five characters dropped;
# characters 6 through 100 are kept.
selected$Cancer <- substr(as.character(selected$X_cohort), start = 6, stop = 100)

library(ggplot2)
# Violin plot of HSPA12B for each Cancer group; coord_flip() swaps the axes so
# the groups are listed along the vertical axis.
ggplot(data = selected, mapping = aes(x = Cancer, y = HSPA12B)) +
  geom_violin() +
  coord_flip()

Time for ANOVA

# ANOVA table for the linear model of HSPA12B on the Cancer grouping.
anova(
  lm(formula = HSPA12B ~ Cancer, data = selected)
)
#> Analysis of Variance Table
#> 
#> Response: HSPA12B
#>             Df Sum Sq Mean Sq F value    Pr(>F)    
#> Cancer      12 2336.3 194.692  177.69 < 2.2e-16 ***
#> Residuals 3585 3928.0   1.096                      
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficient table and fit statistics for the linear model of HSPA12B on the
# Cancer grouping.
summary(
  lm(formula = HSPA12B ~ Cancer, data = selected)
)
#> 
#> Call:
#> lm(formula = HSPA12B ~ Cancer, data = selected)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -4.3866 -0.6535  0.0236  0.6922  4.1969 
#> 
#> Coefficients:
#>                                                       Estimate Std. Error
#> (Intercept)                                           -1.96185    0.07958
#> CancerBladder Cancer                                   1.65407    0.12375
#> CancerBreast Cancer                                    2.46959    0.08739
#> CancerColon Cancer                                     1.40436    0.11000
#> CancerEndometrioid Cancer                              1.38728    0.09641
#> CancerFormalin Fixed Paraffin-Embedded Pilot Phase II  2.71519    0.31247
#> CancerGlioblastoma                                     2.28696    0.11373
#> CancerHead and Neck Cancer                             1.68020    0.09975
#> CancerKidney Clear Cell Carcinoma                      3.42967    0.09293
#> CancerLung Adenocarcinoma                              1.85849    0.09724
#> CancerLung Squamous Cell Carcinoma                     1.44430    0.10286
#> CancerOvarian Cancer                                   1.35578    0.10231
#> CancerRectal Cancer                                    1.46895    0.14680
#>                                                       t value Pr(>|t|)    
#> (Intercept)                                           -24.652   <2e-16 ***
#> CancerBladder Cancer                                   13.366   <2e-16 ***
#> CancerBreast Cancer                                    28.258   <2e-16 ***
#> CancerColon Cancer                                     12.767   <2e-16 ***
#> CancerEndometrioid Cancer                              14.390   <2e-16 ***
#> CancerFormalin Fixed Paraffin-Embedded Pilot Phase II   8.689   <2e-16 ***
#> CancerGlioblastoma                                     20.109   <2e-16 ***
#> CancerHead and Neck Cancer                             16.845   <2e-16 ***
#> CancerKidney Clear Cell Carcinoma                      36.908   <2e-16 ***
#> CancerLung Adenocarcinoma                              19.113   <2e-16 ***
#> CancerLung Squamous Cell Carcinoma                     14.041   <2e-16 ***
#> CancerOvarian Cancer                                   13.251   <2e-16 ***
#> CancerRectal Cancer                                    10.006   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1.047 on 3585 degrees of freedom
#>   (1 observation deleted due to missingness)
#> Multiple R-squared:  0.373,  Adjusted R-squared:  0.3709 
#> F-statistic: 177.7 on 12 and 3585 DF,  p-value: < 2.2e-16

library(agricolae)
# LSD test on the one-way model of HSPA12B by Cancer, with Bonferroni-adjusted
# p-values (p.adj = "bonferroni").
lsdResult <- LSD.test(
  y = aov(HSPA12B ~ Cancer, data = selected),
  trt = "Cancer",
  p.adj = "bonferroni"
)

# Show the "groups" element of the result.
lsdResult[["groups"]]
#>                                                trt      means    M
#> 1                      Kidney Clear Cell Carcinoma  1.4678151    a
#> 2  Formalin Fixed Paraffin-Embedded Pilot Phase II  0.7533333   ab
#> 3                                    Breast Cancer  0.5077381    b
#> 4                                     Glioblastoma  0.3251024    b
#> 5                              Lung Adenocarcinoma -0.1033618   bc
#> 6                             Head and Neck Cancer -0.2816502  bcd
#> 7                                   Bladder Cancer -0.3077869 bcde
#> 8                                    Rectal Cancer -0.4929067  cde
#> 9                     Lung Squamous Cell Carcinoma -0.5175581   de
#> 10                                    Colon Cancer -0.5574900   de
#> 11                             Endometrioid Cancer -0.5745711    e
#> 12                                  Ovarian Cancer -0.6060749    e
#> 13                          Acute Myeloid Leukemia -1.9618542    f

Time for factorMerger

Single-dimensional response

# Merge the Cancer levels with respect to a single numeric response.
# NOTE(review): the response here is HSPA12A, whereas the plot, ANOVA and LSD
# test elsewhere in this document use HSPA12B -- confirm which gene is intended.
merging <- mergeFactors(
  selected$HSPA12A,
  factor(selected$Cancer),
  subsequent = TRUE
)

# Print the merging result.
merging
#> Factor levels were recoded as below:
#> 
#> recoded    original                                        
#> ---------  ------------------------------------------------
#> (AcML)     Acute Myeloid Leukemia                          
#> (BldC)     Bladder Cancer                                  
#> (BrsC)     Breast Cancer                                   
#> (ClnC)     Colon Cancer                                    
#> (EndC)     Endometrioid Cancer                             
#> (FFPPPI)   Formalin Fixed Paraffin-Embedded Pilot Phase II 
#> (Glbl)     Glioblastoma                                    
#> (HaNC)     Head and Neck Cancer                            
#> (KCCC)     Kidney Clear Cell Carcinoma                     
#> (LngA)     Lung Adenocarcinoma                             
#> (LSCC)     Lung Squamous Cell Carcinoma                    
#> (OvrC)     Ovarian Cancer                                  
#> (RctC)     Rectal Cancer                                   
#> 
#> 
#> 
#> groupA                                   groupB                                              model     pval
#> ---------------------------------------  ---------------------------------------------  ----------  -------
#>                                                                                          -5560.944   1.0000
#> (RctC)                                   (ClnC)                                          -5560.944   0.9953
#> (BrsC)                                   (FFPPPI)                                        -5560.988   0.7680
#> (EndC)                                   (BrsC)(FFPPPI)                                  -5561.724   0.2257
#> (RctC)(ClnC)                             (LSCC)                                          -5563.653   0.0499
#> (BldC)                                   (LngA)                                          -5565.844   0.0366
#> (Glbl)                                   (OvrC)                                          -5568.162   0.0315
#> (HaNC)                                   (EndC)(BrsC)(FFPPPI)                            -5574.389   0.0004
#> (BldC)(LngA)                             (RctC)(ClnC)(LSCC)                              -5589.883   0.0000
#> (HaNC)(EndC)(BrsC)(FFPPPI)               (Glbl)(OvrC)                                    -5670.261   0.0000
#> (AcML)                                   (BldC)(LngA)(RctC)(ClnC)(LSCC)                  -5775.098   0.0000
#> (HaNC)(EndC)(BrsC)(FFPPPI)(Glbl)(OvrC)   (KCCC)                                          -6172.380   0.0000
#> (AcML)(BldC)(LngA)(RctC)(ClnC)(LSCC)     (HaNC)(EndC)(BrsC)(FFPPPI)(Glbl)(OvrC)(KCCC)    -6758.771   0.0000
# Plot the merging tree for the `merging` object with simplify = FALSE.
plotTree(merging, simplify = FALSE)

# Plot the tree again, passing "pval" as the second argument.
# NOTE(review): presumably this selects the p-value as the statistic shown on
# the tree -- confirm against the plotTree() documentation.
plotTree(merging, "pval")

Multidimensional response

# Response table: every column of `selected` except the sample identifier and
# the two cohort/grouping columns.
df <- selected[
  ,
  !(names(selected) %in% c("sampleID", "X_cohort", "Cancer")),
  drop = FALSE
]

# Grouping factor, then merging of its levels against the whole response table.
cancer <- factor(selected$Cancer)
merging <- mergeFactors(df, cancer, subsequent = TRUE)

# Both appended panels share the same x-axis text theme (angle = 90, hjust = 1).
rotatedAxisText <- theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Append the profile plot to the merging tree.
appendToTree(merging, plotProfile(merging) + rotatedAxisText)

# Append the heatmap to the merging tree.
appendToTree(merging, plotHeatmap(merging) + rotatedAxisText)