Get the released version from CRAN:
install.packages("GeoTcgaData")
Or the development version from github:
# Install devtools first if it is missing, then install the development
# version of the package from GitHub.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
devtools::install_github("huerqiang/GeoTcgaData")
GEO and TCGA provide us with a wealth of data, such as RNA-seq, DNA methylation, single nucleotide variation and copy number variation data. It’s easy to download data from TCGA using the gdc tool or `TCGAbiolinks`, but processing these data into a format suitable for bioinformatics analysis requires more work. This R package was developed to handle these data.
library(GeoTcgaData)
#> Hello, friend! welcome to use!
This is a basic example which shows you how to solve a common problem:
It is convenient to use `TCGAbiolinks` or `GDCRNATools` to download and analyze gene expression data. `TCGAbiolinks` uses the `edgeR` package to do differential expression analysis, while `GDCRNATools` implements the three most commonly used methods: limma, edgeR, and DESeq2 to identify differentially expressed genes (DEGs).
However, unlike chip data, RNA-seq data have one bias: the longer the gene, the more likely it is to be identified as a differential gene, while there is no such trend in chip data. This is because in RNA-seq data a long gene has more reads mapping to it than a short gene of similar expression, and most of the statistical methods used to detect differential expression have stronger detection ability for genes with more reads. Therefore, we need to correct the gene length bias in downstream analysis such as enrichment analysis.
GOseq, based on Wallenius’ noncentral hypergeometric distribution, can effectively correct the gene length bias in enrichment analysis. However, its algorithm cannot directly correct the bias of the expression profile, and its results cannot be used for GSEA enrichment analysis. CQN presents a normalization algorithm to correct systematic biases (gene length bias and GC-content bias), whose results can be passed seamlessly to downstream differential analysis software such as DESeq2 and edgeR.
Here we use `TCGAbiolinks` to download RNA-seq data, use `CQN` to correct gene length bias and GC content bias, and then use `DESeq2` for differential analysis.
Use `TCGAbiolinks` to download TCGA data
# Download RNA-seq data.
# Your_Path must be set to the download directory before running this.
library(TCGAbiolinks)
query <- GDCquery(project = "TCGA-ACC",
                  data.category = "Transcriptome Profiling",
                  data.type = "Gene Expression Quantification",
                  workflow.type = "HTSeq - Counts")
GDCdownload(query, method = "api", files.per.chunk = 3,
            directory = Your_Path)
# Read the downloaded files and also save the result to dataRNA.RData.
dataRNA <- GDCprepare(query = query, directory = Your_Path,
                      save = TRUE, save.filename = "dataRNA.RData")
## get raw count matrix
dataPrep <- TCGAanalyze_Preprocessing(object = dataRNA,
                                      cor.cut = 0.6,
                                      datatype = "HTSeq - Counts")
Use `diff_RNA` to do differential analysis
## Random value is used as gene length and GC content.
## Both vectors hold one value per row of dataPrep, so they are named with
## the row names (the original used colnames, whose length does not match).
geneLength <- sample(1000:2000, nrow(dataPrep), replace = TRUE)
names(geneLength) <- rownames(dataPrep)
gccontent <- runif(nrow(dataPrep))
names(gccontent) <- rownames(dataPrep)
## Random value is used as sample group (one label per column of dataPrep).
group <- sample(c("grp1", "grp2"), ncol(dataPrep), replace = TRUE)
library(cqn) # To avoid reporting errors: there is no function "rq"
DEGAll <- diff_RNA(counts = dataPrep, group = group,
                   geneLength = geneLength, gccontent = gccontent)
Use `clusterProfiler` to do enrichment analysis:
# Build a named vector of log fold changes, sorted in decreasing order,
# and pass it to gseGO.
diffGenes <- DEGAll$logFC
names(diffGenes) <- rownames(DEGAll)
diffGenes <- sort(diffGenes, decreasing = TRUE)
library(clusterProfiler)
library(enrichplot)
library(org.Hs.eg.db)
# The argument is spelled out as geneList instead of relying on partial
# matching of `gene`.
gsego <- gseGO(geneList = diffGenes, OrgDb = org.Hs.eg.db,
               keyType = "ENSEMBL")
dotplot(gsego)
Use `TCGAbiolinks` to download TCGA data
# Download DNA methylation beta values.
# Your_Path must be set to the download directory before running this.
library(TCGAbiolinks)
query <- GDCquery(project = "TCGA-ACC",
                  data.category = "DNA Methylation",
                  data.type = "Methylation Beta Value",
                  platform = "Illumina Human Methylation 450")
GDCdownload(query, method = "api", files.per.chunk = 5, directory = Your_Path)
The function `Merge_methy_tcga` merges methylation data downloaded from the TCGA official website or with TCGAbiolinks. This makes it easier to extract differentially methylated genes in the downstream analysis. For example:
# Your_Path_to_DNA_Methylation_data is the directory of downloaded files.
merge_result <- Merge_methy_tcga(Your_Path_to_DNA_Methylation_data)
Then use the `ChAMP` package to do differential analysis.
library(ChAMP)
# Random "C"/"T" labels, one per column of the first element of merge_result,
# are used as the sample groups.
sample_group <- sample(c("C", "T"), ncol(merge_result[[1]]), replace = TRUE)
diff_gene <- methyDiff(cpgData = merge_result, sampleGroup = sample_group)
Use `clusterProfiler` to do enrichment analysis:
# Adjust the p-values and keep the genes with adjusted p-value below 0.05.
diff_gene$p.adj <- p.adjust(diff_gene$pvalue)
genes <- diff_gene[diff_gene$p.adj < 0.05, "gene"]
library(clusterProfiler)
library(enrichplot)
library(org.Hs.eg.db)
ego <- enrichGO(gene = genes, OrgDb = org.Hs.eg.db, keyType = "SYMBOL")
dotplot(ego)
Use TCGAbiolinks to download TCGA data (Gene Level Copy Number Scores)
# Download gene-level copy number scores.
# Your_Path must be set to the download directory before running this.
library(TCGAbiolinks)
query <- GDCquery(project = "TCGA-LGG",
                  data.category = "Copy Number Variation",
                  data.type = "Gene Level Copy Number Scores")
GDCdownload(query, method = "api", files.per.chunk = 5, directory = Your_Path)
# Read from the same directory variable that GDCdownload wrote to (the
# original passed the literal string "Your_Path" here).
data <- GDCprepare(query = query,
                   directory = Your_Path)
Do difference analysis of gene level copy number variation data using diff_CNV
# Convert to a plain data.frame, drop the first three columns, and use the
# first column as row names.
data <- as.data.frame(data)
cnvData <- data[, -c(1, 2, 3)]
rownames(cnvData) <- data[, 1]
# Random "A"/"B" labels, one per column of cnvData, are used as sample groups.
sampleGroup <- sample(c("A", "B"), ncol(cnvData), replace = TRUE)
diffCnv <- diff_CNV(cnvData, sampleGroup)
Use `clusterProfiler` to do enrichment analysis:
# p-values carrying the sign of the odds column (not used by the call below).
pvalues <- diffCnv$pvalue * sign(diffCnv$odds)
# Genes whose p-value is below 0.05 are sent to the enrichment analysis.
genes <- rownames(diffCnv)[diffCnv$pvalue < 0.05]
library(clusterProfiler)
library(enrichplot)
library(org.Hs.eg.db)
ego <- enrichGO(gene = genes, OrgDb = org.Hs.eg.db, keyType = "ENSEMBL")
dotplot(ego)
Use TCGAbiolinks to download TCGA data
# Download masked somatic mutation data.
# Your_Path must be set to the download directory before running this.
library(TCGAbiolinks)
query <- GDCquery(project = "TCGA-ACC",
                  data.category = "Simple Nucleotide Variation",
                  data.type = "Masked Somatic Mutation",
                  workflow.type = "MuSE Variant Aggregation and Masking")
GDCdownload(query, method = "api", files.per.chunk = 5, directory = Your_Path)
# Read from the same directory variable that GDCdownload wrote to (the
# original passed the literal string "Your_Path" here).
data_snp <- GDCprepare(query = query,
                       directory = Your_Path)
Use `diff_SNP_tcga` to do differential analysis
# Assign a random "A"/"B" label to every distinct tumor sample barcode.
samples <- unique(data_snp$Tumor_Sample_Barcode)
sampleType <- sample(c("A", "B"), length(samples), replace = TRUE)
names(sampleType) <- samples
pvalue <- diff_SNP_tcga(snpData = data_snp, sampleType = sampleType)
Use `clusterProfiler` to do enrichment analysis:
# Sort the values in decreasing order before passing them to gseGO.
pvalue2 <- sort(pvalue, decreasing = TRUE)
library(clusterProfiler)
library(enrichplot)
library(org.Hs.eg.db)
gsego <- gseGO(pvalue2, OrgDb = org.Hs.eg.db, keyType = "SYMBOL")
dotplot(gsego)
The function `gene_ave` averages the expression data of different ids for the same gene in GEO chip data. For example:
# Toy input: a gene column followed by two sample columns.
aa <- c("MARCH1","MARC1","MARCH1","MARCH1","MARCH1")
bb <- c(2.969058399,4.722410064,8.165514853,8.24243893,8.60815086)
cc <- c(3.969058399,5.722410064,7.165514853,6.24243893,7.60815086)
file_gene_ave <- data.frame(aa=aa,bb=bb,cc=cc)
colnames(file_gene_ave) <- c("Gene", "GSM1629982", "GSM1629983")
result <- gene_ave(file_gene_ave, 1)
Multiple gene symbols may correspond to the same chip id. The function `rep1` assigns the expression of this id to each gene, and the function `rep2` deletes the expression. For example:
# Toy input: the first column holds gene symbols, some joined by " /// ".
aa <- c("MARCH1 /// MMA","MARC1","MARCH2 /// MARCH3",
        "MARCH3 /// MARCH4","MARCH1")
bb <- c("2.969058399","4.722410064","8.165514853","8.24243893","8.60815086")
cc <- c("3.969058399","5.722410064","7.165514853","6.24243893","7.60815086")
input_file <- data.frame(aa=aa,bb=bb,cc=cc)
rep1_result <- rep1(input_file," /// ")
rep2_result <- rep2(input_file," /// ")
`id_conversion_vector` converts gene ids from one of `symbol`, `RefSeq_ID`, `Ensembl_ID`, `NCBI_Gene_ID`, `UCSC_ID`, `UniProt_ID`, etc. to another. Use `id_ava()` to get all the convertible ids. For example:
id_conversion_vector("symbol", "ensembl_gene_id", c("A2ML1", "A2ML1-AS1", "A4GALT", "A12M1", "AAAS"))
#> 80% were successfully converted.
#> from to
#> 1 A2ML1 ENSG00000166535
#> 2 A2ML1-AS1 ENSG00000256661
#> 3 A4GALT ENSG00000128274
#> 4 A12M1 <NA>
#> 5 AAAS ENSG00000094914
When the user converts the Ensembl ID to other ids, the version number needs to be removed. For example, “ENSG00000186092.4” doesn’t work, you need to change it to “ENSG00000186092”.
In particular, the function `id_conversion` converts ENSEMBL gene ids to gene symbols in TCGA data. For example:
# Use the `profile` object shipped with GeoTcgaData as example input.
profile <- GeoTcgaData::profile
result <- id_conversion(profile)
The parameter `profile` is a data.frame or matrix of gene expression data in TCGA.
Note: In previous versions (< 1.0.0), `id_conversion` and `id_conversion_vector` used HGNC data to convert human gene ids. In future versions, we will use `clusterProfiler::bitr` for ID conversion.
library(clusterProfiler)
#>
#> clusterProfiler v4.0.0 For help: https://guangchuangyu.github.io/software/clusterProfiler
#>
#> If you use clusterProfiler in published research, please cite:
#> Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
#>
#> 载入程辑包:'clusterProfiler'
#> The following object is masked from 'package:stats':
#>
#> filter
library(org.Hs.eg.db)
#> 载入需要的程辑包:AnnotationDbi
#> 载入需要的程辑包:stats4
#> 载入需要的程辑包:BiocGenerics
#> 载入需要的程辑包:parallel
#>
#> 载入程辑包:'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#>
#> clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#> clusterExport, clusterMap, parApply, parCapply, parLapply,
#> parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#>
#> Filter, Find, Map, Position, Reduce, anyDuplicated, append,
#> as.data.frame, basename, cbind, colnames, dirname, do.call,
#> duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
#> lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
#> pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
#> tapply, union, unique, unsplit, which.max, which.min
#> 载入需要的程辑包:Biobase
#> Welcome to Bioconductor
#>
#> Vignettes contain introductory material; view with
#> 'browseVignettes()'. To cite Bioconductor, see
#> 'citation("Biobase")', and for packages 'citation("pkgname")'.
#> 载入需要的程辑包:IRanges
#> 载入需要的程辑包:S4Vectors
#>
#> 载入程辑包:'S4Vectors'
#> The following object is masked from 'package:clusterProfiler':
#>
#> rename
#> The following objects are masked from 'package:base':
#>
#> I, expand.grid, unname
#>
#> 载入程辑包:'IRanges'
#> The following object is masked from 'package:clusterProfiler':
#>
#> slice
#> The following object is masked from 'package:grDevices':
#>
#> windows
#>
#> 载入程辑包:'AnnotationDbi'
#> The following object is masked from 'package:clusterProfiler':
#>
#> select
#>
bitr(c("A2ML1", "A2ML1-AS1", "A4GALT", "A12M1", "AAAS"), fromType = "SYMBOL",
toType = "ENSEMBL", OrgDb = org.Hs.eg.db, drop = FALSE)
#> 'select()' returned 1:1 mapping between keys and columns
#> Warning in bitr(c("A2ML1", "A2ML1-AS1", "A4GALT", "A12M1", "AAAS"), fromType =
#> "SYMBOL", : 40% of input gene IDs are fail to map...
#> SYMBOL ENSEMBL
#> 1 A2ML1 ENSG00000166535
#> 2 A2ML1-AS1 <NA>
#> 3 A4GALT ENSG00000128274
#> 4 A12M1 <NA>
#> 5 AAAS ENSG00000094914
`countToFpkm_matrix` and `countToTpm_matrix` convert count data to FPKM or TPM data.
lung_squ_count2 <- matrix(c(1,2,3,4,5,6,7,8,9),ncol=3)
rownames(lung_squ_count2) <- c("DISC1","TCOF1","SPPL3")
colnames(lung_squ_count2) <- c("sample1","sample2","sample3")
jieguo <- countToFpkm_matrix(lung_squ_count2)
lung_squ_count2 <- matrix(c(0.11,0.22,0.43,0.14,0.875,0.66,0.77,0.18,0.29),ncol=3)
rownames(lung_squ_count2) <- c("DISC1","TCOF1","SPPL3")
colnames(lung_squ_count2) <- c("sample1","sample2","sample3")
jieguo <- countToTpm_matrix(lung_squ_count2)
`tcga_cli_deal` combines clinical information obtained from TCGA and extracts survival data. For example:
tcga_cli <- tcga_cli_deal(system.file(file.path("extdata","tcga_cli"),package="GeoTcgaData"))