The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

Normalizations Comparisons

Load libraries

library(ggplot2)
library(SurprisalAnalysis)
## 

## 

## 
library(data.table)
library(pheatmap)
library(latex2exp)

Let us test the normalization methods here pseudocount vs log1p

#' Plot Bland Altman and density plots of the difference between the two zero handling
#' methods
#'
#' @param aa the gene expression matrix
#' @param pc the pseudocount
#'
#' @return two plots, the Bland Altman plot and the density plot
plot.norm.diffs <- function(X, pc){


  mat_logpc <- log(X + pc)
  mat_log1p <- log1p(X)

  avg_vals  <- (mat_logpc + mat_log1p) / 2
  diff_vals <- mat_logpc - mat_log1p

  df_ba <- data.frame(
    Avg  = as.vector(avg_vals),
    Diff = as.vector(diff_vals)
  )


  p_ba <- ggplot(df_ba, aes(x = Avg, y = Diff)) +
    geom_point(size = 0.8, color = "darkred") +
    geom_hline(yintercept = 0, linetype = 2, color = "navy") +
    labs(
      title = "Bland–Altman Plot",
      subtitle = "log(x+1e-6) vs log1p(x)",
      x = "Average of two transforms",
      y = "Difference (log(x+1e-6) - log1p(x))"
    ) +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.background = element_blank(),
      axis.line = element_line(colour = "black"),
      plot.title = element_text(hjust = 0.5, size = 20),
      axis.text = element_text(size = 15),
      text = element_text(size = 18)
    )


  df_diff <- data.frame(Diff = as.vector(diff_vals))

  p_density <- ggplot(df_diff, aes(x = Diff)) +
    geom_density(aes(y = after_stat(scaled)), fill = "steelblue", alpha = 0.5) +
    geom_vline(xintercept = 0, linetype = 2, color = "navy") +
    labs(
      title = "Distribution of Differences",
      x = "Difference (log(x+1e-6) - log1p(x))",
      y = "Density"
    ) +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.background = element_blank(),
      axis.line = element_line(colour = "black"),
      plot.title = element_text(hjust = 0.5, size = 20),
      axis.text = element_text(size = 15),
      text = element_text(size = 18)
    )

  return(p_ba + p_density)
}
#pseudocount
pc <- 1e-6



df <- read.table(system.file("extdata", "GSE60450_Lactation-GenewiseCounts.txt", package = "SurprisalAnalysis"),header = TRUE, sep = "\t", check.names = FALSE)
  

df[,3:ncol(df)]->testing.case.1

df$EntrezGeneID->rownames(testing.case.1)


as.matrix(testing.case.1)->testing.case.1

stopifnot(all(c("EntrezGeneID","Length") %in% names(df)))


gene_ids <- df$EntrezGeneID
len_bp   <- df$Length
counts   <- as.matrix(df[ , -(1:2)])


keep <- is.finite(len_bp) & (len_bp > 0)
counts <- counts[keep, , drop = FALSE]
len_bp <- len_bp[keep]
gene_ids <- gene_ids[keep]
rownames(counts) <- gene_ids


len_kb <- len_bp / 1000

#FPKM
#FPKM_ij = counts_ij / (len_kb_i * libsize_j_in_millions)
libsize        <- colSums(counts, na.rm = TRUE)
libsize_mill   <- libsize / 1e6

fpkm <- sweep(counts, 1, len_kb, "/")
fpkm <- sweep(fpkm, 2, libsize_mill, "/")

#TPM
# TPM_ij = (counts_ij / len_kb_i) / sum_i(counts_ij / len_kb_i) * 1e6
rpk  <- sweep(counts, 1, len_kb, "/")
scale <- colSums(rpk, na.rm = TRUE)
tpm <- sweep(rpk, 2, scale, "/") * 1e6

testing.case.2 <- fpkm
testing.case.3 <- tpm

plot.norm.diffs(testing.case.1, pc)

plot.norm.diffs(testing.case.2, pc)

plot.norm.diffs(testing.case.3, pc)



baseline <- data.frame(raw = surprisal_analysis(testing.case.1)[[1]][,1],
           fpkm = surprisal_analysis(testing.case.2)[[1]][,1],
           tpm = surprisal_analysis(testing.case.3)[[1]][,1])


baseline.pseudo <- ggplot(baseline)+
  geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+
  geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+
  geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+
  geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+
  geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+
  geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+
  scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+
  labs(x="Sample", title=TeX("$\\lambda_0$"), y="Value",colour="Normalization")


baseline.pseudo

lambda_1 <- data.frame(raw = surprisal_analysis(testing.case.1)[[1]][,2],
                       fpkm = surprisal_analysis(testing.case.2)[[1]][,2],
                       tpm = surprisal_analysis(testing.case.3)[[1]][,2])


lambda_1.pseudo <- ggplot(lambda_1)+
  geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+
  geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+
  geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+
  geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+
  geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+
  geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+
  scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+
  labs(x="Sample", title=TeX("$\\lambda_1$"), y="Value",colour="Normalization")


lambda_1.pseudo

lambda_2 <- data.frame(raw = surprisal_analysis(testing.case.1)[[1]][,3],
                       fpkm = surprisal_analysis(testing.case.2)[[1]][,3],
                       tpm = surprisal_analysis(testing.case.3)[[1]][,3])


lambda_2.pseudo <-   ggplot(lambda_2)+
  geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+
  geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+
  geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+
  geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+
  geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+
  geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+
  scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+
  labs(x="Sample", title=TeX("$\\lambda_2$"), y="Value",colour="Normalization")


lambda_2.pseudo


baseline <- data.frame(raw = surprisal_analysis(testing.case.1, "log1p")[[1]][,1],
                       fpkm = surprisal_analysis(testing.case.2, "log1p")[[1]][,1],
                       tpm = surprisal_analysis(testing.case.3, "log1p")[[1]][,1])


baseline.log1p <-  ggplot(baseline)+
  geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+
  geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+
  geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+
  geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+
  geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+
  geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+
  scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+
  labs(x="Sample", title=TeX("$\\lambda_0$"), y="Value",colour="Normalization")


baseline.log1p

lambda_1 <- data.frame(raw = surprisal_analysis(testing.case.1, "log1p")[[1]][,2],
                       fpkm = surprisal_analysis(testing.case.2, "log1p")[[1]][,2],
                       tpm = surprisal_analysis(testing.case.3, "log1p")[[1]][,2])


lambda_1.log1p <- ggplot(lambda_1)+
  geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+
  geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+
  geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+
  geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+
  geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+
  geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+
  scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+
  labs(x="Sample", title=TeX("$\\lambda_1$"), y="Value",colour="Normalization")


lambda_1.log1p


lambda_2 <- data.frame(raw = surprisal_analysis(testing.case.1, "log1p")[[1]][,3],
                       fpkm = surprisal_analysis(testing.case.2, "log1p")[[1]][,3],
                       tpm = surprisal_analysis(testing.case.3, "log1p")[[1]][,3])


lambda_2.log1p <- ggplot(lambda_2)+
  geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+
  geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+
  geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+
  geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+
  geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+
  geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 15),
    text = element_text(size = 18)
  )+
  scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+
  labs(x="Sample", title=TeX("$\\lambda_2$"), y="Value",colour="Normalization")


lambda_2.log1p

#(baseline.pseudo+baseline.log1p)/(lambda.1.pseudo+lambda.1.log1p)/(lambda_2.pseudo+lambda_2.log1p)






running.test.case.1<-data.frame(Sample = as.numeric(rownames(testing.case.1)), testing.case.1)

running.test.case.2<-data.frame(Sample = as.numeric(rownames(testing.case.2)), testing.case.2)

running.test.case.3<-data.frame(Sample = as.numeric(rownames(testing.case.3)), testing.case.3)

testing.case.2.log1p <- GO_analysis_surprisal_analysis(surprisal_analysis(running.test.case.2, "log1p")[[2]], 0.85, 2, key_type = "ENTREZID", flip = FALSE, species.db.str =  "org.Mm.eg.db", top_GO_terms=15)

testing.case.2.pseudo <- GO_analysis_surprisal_analysis(surprisal_analysis(running.test.case.2, "pseudocount")[[2]], 0.85, 2, key_type = "ENTREZID", flip = FALSE, species.db.str =  "org.Mm.eg.db", top_GO_terms=15)





#ggplot(testing.case.2.pseudo, aes(x = reorder(Description, -Count), y = Count, fill = p.adjust)) +
  #geom_col() +
  #coord_flip() +
  #scale_fill_gradient(low = "steelblue", high = "red") +
  #labs(
  #  title = "Top 15 Enriched GO Terms",
  #  x = "GO Term",
  #  y = "Gene Count",
  #  fill = "Adj. p-value"
  #) +
  #theme_minimal(base_size = 14)





#Jaccard similarity


extreme_pct_lambda <- function(sa_mat, lambda_col = "lambda_1", p = 0.05) {
  stopifnot(lambda_col %in% colnames(sa_mat))
  v <- sa_mat[, lambda_col]
  n <- length(v)
  k <- max(1, ceiling(p * n))


  top_ids <- names(sort(v, decreasing = TRUE))[seq_len(k)]
  bot_ids <- names(sort(v, decreasing = FALSE))[seq_len(k)]

  return(sort(unique(c(top_ids, bot_ids))))
}



sa1_log1p <- surprisal_analysis(running.test.case.1, "log1p")[[2]]
sa1_pc    <- surprisal_analysis(running.test.case.1, "pseudocount")[[2]]


sa2_log1p <- surprisal_analysis(running.test.case.2, "log1p")[[2]]
sa2_pc    <- surprisal_analysis(running.test.case.2, "pseudocount")[[2]]

sa3_log1p <- surprisal_analysis(running.test.case.3, "log1p")[[2]]
sa3_pc    <- surprisal_analysis(running.test.case.3, "pseudocount")[[2]]


set1_log1p <- extreme_pct_lambda(sa1_log1p, "lambda_1", 0.05)
set1_pc    <- extreme_pct_lambda(sa1_pc,    "lambda_1", 0.05)

set2_log1p <- extreme_pct_lambda(sa2_log1p, "lambda_1", 0.05)
set2_pc    <- extreme_pct_lambda(sa2_pc,    "lambda_1", 0.05)

set3_log1p <- extreme_pct_lambda(sa3_log1p, "lambda_1", 0.05)
set3_pc    <- extreme_pct_lambda(sa3_pc,    "lambda_1", 0.05)

sets <- list(
  `Raw-log1p`        = set1_log1p,
  `Raw-pseudocount`  = set1_pc,
  `FPKM-log1p`       = set2_log1p,
  `FPKM-pseudocount` = set2_pc,
  `TPM-log1p`        = set3_log1p,
  `TPM-pseudocount`  = set3_pc
)


pairwise_matrix <- function(sets, fun) {
  n <- length(sets)
  M <- matrix(0, n, n, dimnames = list(names(sets), names(sets)))
  for (i in seq_len(n)) for (j in seq_len(n)) M[i, j] <- fun(sets[[i]], sets[[j]])
  M
}

overlap_fun <- function(a, b) length(intersect(a, b))
jaccard_fun <- function(a, b) {
  inter <- length(intersect(a, b)); uni <- length(union(a, b))
  if (uni == 0) return(NA_real_) else inter / uni
}

overlap_mat <- pairwise_matrix(sets, overlap_fun)
diag(overlap_mat) <- vapply(sets, length, integer(1))
jaccard_mat <- pairwise_matrix(sets, jaccard_fun)


pheatmap::pheatmap(
  jaccard_mat,
  main = TeX("Jaccard similarity (Top & Bottom 5% of $\\lambda_0$)"),
  display_numbers = TRUE, number_format = "%.2f",
  fontsize_row = 10, fontsize_col = 10, border_color = NA,
  color = colorRampPalette(c("#1B3C53", "#456882", "#91C8E4"))(100)
)


set1_log1p <- extreme_pct_lambda(sa1_log1p, "lambda_2", 0.05)
set1_pc    <- extreme_pct_lambda(sa1_pc,    "lambda_2", 0.05)

set2_log1p <- extreme_pct_lambda(sa2_log1p, "lambda_2", 0.05)
set2_pc    <- extreme_pct_lambda(sa2_pc,    "lambda_2", 0.05)

set3_log1p <- extreme_pct_lambda(sa3_log1p, "lambda_2", 0.05)
set3_pc    <- extreme_pct_lambda(sa3_pc,    "lambda_2", 0.05)

sets <- list(
  `Raw-log1p`        = set1_log1p,
  `Raw-pseudocount`  = set1_pc,
  `FPKM-log1p`       = set2_log1p,
  `FPKM-pseudocount` = set2_pc,
  `TPM-log1p`        = set3_log1p,
  `TPM-pseudocount`  = set3_pc
)


overlap_mat <- pairwise_matrix(sets, overlap_fun)
diag(overlap_mat) <- vapply(sets, length, integer(1))
jaccard_mat <- pairwise_matrix(sets, jaccard_fun)


pheatmap::pheatmap(
  jaccard_mat,
  main = TeX("Jaccard similarity (Top & Bottom 5% of $\\lambda_1$)"),
  display_numbers = TRUE, number_format = "%.2f",
  fontsize_row = 10, fontsize_col = 10, border_color = NA,
  color = colorRampPalette(c("#1B3C53", "#456882", "#91C8E4"))(100)
)



set1_log1p <- extreme_pct_lambda(sa1_log1p, "lambda_3", 0.05)
set1_pc    <- extreme_pct_lambda(sa1_pc,    "lambda_3", 0.05)

set2_log1p <- extreme_pct_lambda(sa2_log1p, "lambda_3", 0.05)
set2_pc    <- extreme_pct_lambda(sa2_pc,    "lambda_3", 0.05)

set3_log1p <- extreme_pct_lambda(sa3_log1p, "lambda_3", 0.05)
set3_pc    <- extreme_pct_lambda(sa3_pc,    "lambda_3", 0.05)

sets <- list(
  `Raw-log1p`        = set1_log1p,
  `Raw-pseudocount`  = set1_pc,
  `FPKM-log1p`       = set2_log1p,
  `FPKM-pseudocount` = set2_pc,
  `TPM-log1p`        = set3_log1p,
  `TPM-pseudocount`  = set3_pc
)


overlap_mat <- pairwise_matrix(sets, overlap_fun)
diag(overlap_mat) <- vapply(sets, length, integer(1))
jaccard_mat <- pairwise_matrix(sets, jaccard_fun)


pheatmap::pheatmap(
  jaccard_mat,
  main = TeX("Jaccard similarity (Top & Bottom 5% of $\\lambda_2$)"),
  display_numbers = TRUE, number_format = "%.2f",
  fontsize_row = 10, fontsize_col = 10, border_color = NA,
  color = colorRampPalette(c("#1B3C53", "#456882", "#91C8E4"))(100)
)

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.