The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
Load libraries
##
##
##
Let us test the normalization methods here pseudocount vs log1p
#' Plot Bland Altman and density plots of the difference between the two zero handling
#' methods
#'
#' @param aa the gene expression matrix
#' @param pc the pseudocount
#'
#' @return two plots, the Bland Altman plot and the density plot
plot.norm.diffs <- function(X, pc){
mat_logpc <- log(X + pc)
mat_log1p <- log1p(X)
avg_vals <- (mat_logpc + mat_log1p) / 2
diff_vals <- mat_logpc - mat_log1p
df_ba <- data.frame(
Avg = as.vector(avg_vals),
Diff = as.vector(diff_vals)
)
p_ba <- ggplot(df_ba, aes(x = Avg, y = Diff)) +
geom_point(size = 0.8, color = "darkred") +
geom_hline(yintercept = 0, linetype = 2, color = "navy") +
labs(
title = "Bland–Altman Plot",
subtitle = "log(x+1e-6) vs log1p(x)",
x = "Average of two transforms",
y = "Difference (log(x+1e-6) - log1p(x))"
) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
plot.title = element_text(hjust = 0.5, size = 20),
axis.text = element_text(size = 15),
text = element_text(size = 18)
)
df_diff <- data.frame(Diff = as.vector(diff_vals))
p_density <- ggplot(df_diff, aes(x = Diff)) +
geom_density(aes(y = after_stat(scaled)), fill = "steelblue", alpha = 0.5) +
geom_vline(xintercept = 0, linetype = 2, color = "navy") +
labs(
title = "Distribution of Differences",
x = "Difference (log(x+1e-6) - log1p(x))",
y = "Density"
) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
plot.title = element_text(hjust = 0.5, size = 20),
axis.text = element_text(size = 15),
text = element_text(size = 18)
)
return(p_ba + p_density)
}
#pseudocount
pc <- 1e-6
df <- read.table(system.file("extdata", "GSE60450_Lactation-GenewiseCounts.txt", package = "SurprisalAnalysis"),header = TRUE, sep = "\t", check.names = FALSE)
df[,3:ncol(df)]->testing.case.1
df$EntrezGeneID->rownames(testing.case.1)
as.matrix(testing.case.1)->testing.case.1
stopifnot(all(c("EntrezGeneID","Length") %in% names(df)))
gene_ids <- df$EntrezGeneID
len_bp <- df$Length
counts <- as.matrix(df[ , -(1:2)])
keep <- is.finite(len_bp) & (len_bp > 0)
counts <- counts[keep, , drop = FALSE]
len_bp <- len_bp[keep]
gene_ids <- gene_ids[keep]
rownames(counts) <- gene_ids
len_kb <- len_bp / 1000
#FPKM
#FPKM_ij = counts_ij / (len_kb_i * libsize_j_in_millions)
libsize <- colSums(counts, na.rm = TRUE)
libsize_mill <- libsize / 1e6
fpkm <- sweep(counts, 1, len_kb, "/")
fpkm <- sweep(fpkm, 2, libsize_mill, "/")
#TPM
# TPM_ij = (counts_ij / len_kb_i) / sum_i(counts_ij / len_kb_i) * 1e6
rpk <- sweep(counts, 1, len_kb, "/")
scale <- colSums(rpk, na.rm = TRUE)
tpm <- sweep(rpk, 2, scale, "/") * 1e6
testing.case.2 <- fpkm
testing.case.3 <- tpm
plot.norm.diffs(testing.case.1, pc)
plot.norm.diffs(testing.case.2, pc)
plot.norm.diffs(testing.case.3, pc)
baseline <- data.frame(raw = surprisal_analysis(testing.case.1)[[1]][,1],
fpkm = surprisal_analysis(testing.case.2)[[1]][,1],
tpm = surprisal_analysis(testing.case.3)[[1]][,1])
baseline.pseudo <- ggplot(baseline)+
geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+
geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+
geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+
geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+
geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+
geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
plot.title = element_text(hjust = 0.5, size = 20),
axis.text = element_text(size = 15),
text = element_text(size = 18)
)+
scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+
labs(x="Sample", title=TeX("$\\lambda_0$"), y="Value",colour="Normalization")
baseline.pseudo
lambda_1 <- data.frame(raw = surprisal_analysis(testing.case.1)[[1]][,2],
fpkm = surprisal_analysis(testing.case.2)[[1]][,2],
tpm = surprisal_analysis(testing.case.3)[[1]][,2])
lambda_1.pseudo <- ggplot(lambda_1)+
geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+
geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+
geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+
geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+
geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+
geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
plot.title = element_text(hjust = 0.5, size = 20),
axis.text = element_text(size = 15),
text = element_text(size = 18)
)+
scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+
labs(x="Sample", title=TeX("$\\lambda_1$"), y="Value",colour="Normalization")
lambda_1.pseudo
lambda_2 <- data.frame(raw = surprisal_analysis(testing.case.1)[[1]][,3],
fpkm = surprisal_analysis(testing.case.2)[[1]][,3],
tpm = surprisal_analysis(testing.case.3)[[1]][,3])
lambda_2.pseudo <- ggplot(lambda_2)+
geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+
geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+
geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+
geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+
geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+
geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
plot.title = element_text(hjust = 0.5, size = 20),
axis.text = element_text(size = 15),
text = element_text(size = 18)
)+
scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+
labs(x="Sample", title=TeX("$\\lambda_2$"), y="Value",colour="Normalization")
lambda_2.pseudo
baseline <- data.frame(raw = surprisal_analysis(testing.case.1, "log1p")[[1]][,1],
fpkm = surprisal_analysis(testing.case.2, "log1p")[[1]][,1],
tpm = surprisal_analysis(testing.case.3, "log1p")[[1]][,1])
baseline.log1p <- ggplot(baseline)+
geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+
geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+
geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+
geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+
geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+
geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
plot.title = element_text(hjust = 0.5, size = 20),
axis.text = element_text(size = 15),
text = element_text(size = 18)
)+
scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+
labs(x="Sample", title=TeX("$\\lambda_0$"), y="Value",colour="Normalization")
baseline.log1p
lambda_1 <- data.frame(raw = surprisal_analysis(testing.case.1, "log1p")[[1]][,2],
fpkm = surprisal_analysis(testing.case.2, "log1p")[[1]][,2],
tpm = surprisal_analysis(testing.case.3, "log1p")[[1]][,2])
lambda_1.log1p <- ggplot(lambda_1)+
geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+
geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+
geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+
geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+
geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+
geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
plot.title = element_text(hjust = 0.5, size = 20),
axis.text = element_text(size = 15),
text = element_text(size = 18)
)+
scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+
labs(x="Sample", title=TeX("$\\lambda_1$"), y="Value",colour="Normalization")
lambda_1.log1p
lambda_2 <- data.frame(raw = surprisal_analysis(testing.case.1, "log1p")[[1]][,3],
fpkm = surprisal_analysis(testing.case.2, "log1p")[[1]][,3],
tpm = surprisal_analysis(testing.case.3, "log1p")[[1]][,3])
lambda_2.log1p <- ggplot(lambda_2)+
geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+
geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+
geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+
geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+
geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+
geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
plot.title = element_text(hjust = 0.5, size = 20),
axis.text = element_text(size = 15),
text = element_text(size = 18)
)+
scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+
labs(x="Sample", title=TeX("$\\lambda_2$"), y="Value",colour="Normalization")
lambda_2.log1p
#(baseline.pseudo+baseline.log1p)/(lambda.1.pseudo+lambda.1.log1p)/(lambda_2.pseudo+lambda_2.log1p)
running.test.case.1<-data.frame(Sample = as.numeric(rownames(testing.case.1)), testing.case.1)
running.test.case.2<-data.frame(Sample = as.numeric(rownames(testing.case.2)), testing.case.2)
running.test.case.3<-data.frame(Sample = as.numeric(rownames(testing.case.3)), testing.case.3)
testing.case.2.log1p <- GO_analysis_surprisal_analysis(surprisal_analysis(running.test.case.2, "log1p")[[2]], 0.85, 2, key_type = "ENTREZID", flip = FALSE, species.db.str = "org.Mm.eg.db", top_GO_terms=15)
testing.case.2.pseudo <- GO_analysis_surprisal_analysis(surprisal_analysis(running.test.case.2, "pseudocount")[[2]], 0.85, 2, key_type = "ENTREZID", flip = FALSE, species.db.str = "org.Mm.eg.db", top_GO_terms=15)
#ggplot(testing.case.2.pseudo, aes(x = reorder(Description, -Count), y = Count, fill = p.adjust)) +
#geom_col() +
#coord_flip() +
#scale_fill_gradient(low = "steelblue", high = "red") +
#labs(
# title = "Top 15 Enriched GO Terms",
# x = "GO Term",
# y = "Gene Count",
# fill = "Adj. p-value"
#) +
#theme_minimal(base_size = 14)
#Jaccard similarity
extreme_pct_lambda <- function(sa_mat, lambda_col = "lambda_1", p = 0.05) {
stopifnot(lambda_col %in% colnames(sa_mat))
v <- sa_mat[, lambda_col]
n <- length(v)
k <- max(1, ceiling(p * n))
top_ids <- names(sort(v, decreasing = TRUE))[seq_len(k)]
bot_ids <- names(sort(v, decreasing = FALSE))[seq_len(k)]
return(sort(unique(c(top_ids, bot_ids))))
}
sa1_log1p <- surprisal_analysis(running.test.case.1, "log1p")[[2]]
sa1_pc <- surprisal_analysis(running.test.case.1, "pseudocount")[[2]]
sa2_log1p <- surprisal_analysis(running.test.case.2, "log1p")[[2]]
sa2_pc <- surprisal_analysis(running.test.case.2, "pseudocount")[[2]]
sa3_log1p <- surprisal_analysis(running.test.case.3, "log1p")[[2]]
sa3_pc <- surprisal_analysis(running.test.case.3, "pseudocount")[[2]]
set1_log1p <- extreme_pct_lambda(sa1_log1p, "lambda_1", 0.05)
set1_pc <- extreme_pct_lambda(sa1_pc, "lambda_1", 0.05)
set2_log1p <- extreme_pct_lambda(sa2_log1p, "lambda_1", 0.05)
set2_pc <- extreme_pct_lambda(sa2_pc, "lambda_1", 0.05)
set3_log1p <- extreme_pct_lambda(sa3_log1p, "lambda_1", 0.05)
set3_pc <- extreme_pct_lambda(sa3_pc, "lambda_1", 0.05)
sets <- list(
`Raw-log1p` = set1_log1p,
`Raw-pseudocount` = set1_pc,
`FPKM-log1p` = set2_log1p,
`FPKM-pseudocount` = set2_pc,
`TPM-log1p` = set3_log1p,
`TPM-pseudocount` = set3_pc
)
pairwise_matrix <- function(sets, fun) {
n <- length(sets)
M <- matrix(0, n, n, dimnames = list(names(sets), names(sets)))
for (i in seq_len(n)) for (j in seq_len(n)) M[i, j] <- fun(sets[[i]], sets[[j]])
M
}
overlap_fun <- function(a, b) length(intersect(a, b))
jaccard_fun <- function(a, b) {
inter <- length(intersect(a, b)); uni <- length(union(a, b))
if (uni == 0) return(NA_real_) else inter / uni
}
overlap_mat <- pairwise_matrix(sets, overlap_fun)
diag(overlap_mat) <- vapply(sets, length, integer(1))
jaccard_mat <- pairwise_matrix(sets, jaccard_fun)
pheatmap::pheatmap(
jaccard_mat,
main = TeX("Jaccard similarity (Top & Bottom 5% of $\\lambda_0$)"),
display_numbers = TRUE, number_format = "%.2f",
fontsize_row = 10, fontsize_col = 10, border_color = NA,
color = colorRampPalette(c("#1B3C53", "#456882", "#91C8E4"))(100)
)
set1_log1p <- extreme_pct_lambda(sa1_log1p, "lambda_2", 0.05)
set1_pc <- extreme_pct_lambda(sa1_pc, "lambda_2", 0.05)
set2_log1p <- extreme_pct_lambda(sa2_log1p, "lambda_2", 0.05)
set2_pc <- extreme_pct_lambda(sa2_pc, "lambda_2", 0.05)
set3_log1p <- extreme_pct_lambda(sa3_log1p, "lambda_2", 0.05)
set3_pc <- extreme_pct_lambda(sa3_pc, "lambda_2", 0.05)
sets <- list(
`Raw-log1p` = set1_log1p,
`Raw-pseudocount` = set1_pc,
`FPKM-log1p` = set2_log1p,
`FPKM-pseudocount` = set2_pc,
`TPM-log1p` = set3_log1p,
`TPM-pseudocount` = set3_pc
)
overlap_mat <- pairwise_matrix(sets, overlap_fun)
diag(overlap_mat) <- vapply(sets, length, integer(1))
jaccard_mat <- pairwise_matrix(sets, jaccard_fun)
pheatmap::pheatmap(
jaccard_mat,
main = TeX("Jaccard similarity (Top & Bottom 5% of $\\lambda_1$)"),
display_numbers = TRUE, number_format = "%.2f",
fontsize_row = 10, fontsize_col = 10, border_color = NA,
color = colorRampPalette(c("#1B3C53", "#456882", "#91C8E4"))(100)
)
set1_log1p <- extreme_pct_lambda(sa1_log1p, "lambda_3", 0.05)
set1_pc <- extreme_pct_lambda(sa1_pc, "lambda_3", 0.05)
set2_log1p <- extreme_pct_lambda(sa2_log1p, "lambda_3", 0.05)
set2_pc <- extreme_pct_lambda(sa2_pc, "lambda_3", 0.05)
set3_log1p <- extreme_pct_lambda(sa3_log1p, "lambda_3", 0.05)
set3_pc <- extreme_pct_lambda(sa3_pc, "lambda_3", 0.05)
sets <- list(
`Raw-log1p` = set1_log1p,
`Raw-pseudocount` = set1_pc,
`FPKM-log1p` = set2_log1p,
`FPKM-pseudocount` = set2_pc,
`TPM-log1p` = set3_log1p,
`TPM-pseudocount` = set3_pc
)
overlap_mat <- pairwise_matrix(sets, overlap_fun)
diag(overlap_mat) <- vapply(sets, length, integer(1))
jaccard_mat <- pairwise_matrix(sets, jaccard_fun)
pheatmap::pheatmap(
jaccard_mat,
main = TeX("Jaccard similarity (Top & Bottom 5% of $\\lambda_2$)"),
display_numbers = TRUE, number_format = "%.2f",
fontsize_row = 10, fontsize_col = 10, border_color = NA,
color = colorRampPalette(c("#1B3C53", "#456882", "#91C8E4"))(100)
)
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.