Identify Copy Number Variations (CNVs) in cancer cells

The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

Analyze snATAC-seq data of basal cell carcinoma sample SU008_Tumor_Pre in GEO (GSE129785).

Download from GEO

Download GSE129785_scATAC-TME-All.cell_barcodes.txt.gz from below and gunzip https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE129785&format=file&file=GSE129785%5FscATAC%2DTME%2DAll%2Ecell%5Fbarcodes%2Etxt%2Egz

Download GSM3722064_SU008_Tumor_Pre_fragments.tsv.gz from https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM3722064&format=file&file=GSM3722064%5FSU008%5FTumor%5FPre%5Ffragments%2Etsv%2Egz

Prepare windows and peaks

The input window file window.hg37.20MB.bed and resultant peak file multi_tissue_peaks.hg37.20MB.bed can be downloaded from https://doi.org/10.6084/m9.figshare.23574066

To reproduce by yourself, download chromatin accessibility DHS_Index_and_Vocabulary_hg19_WM20190703.txt.gz from https://doi.org/10.5281/zenodo.3838751

Generate peaks for 20MB windows using peak_sum.R by yardimcilab in https://github.com/yardimcilab/RIDDLER/blob/main/util/peak_sum.R

SU008_Tumor_Pre_windowcovariates =
  read.table(
    "multi_tissue_peaks.hg37.20MB.bed",
    header = FALSE)
colnames(SU008_Tumor_Pre_windowcovariates) =
  c("chr", "start", "end", "window", "peaks")

Compute fragmentoverlap from fragments

See vignette of R package scPloidy. Load setting for hg19 genome.

simpleRepeat = readr::read_tsv(
  "~/human/publichuman/hg37_ucsc/simpleRepeat.chrom_chromStart_chromEnd.txt.gz",
  col_names = c("chrom", "chromStart", "chromEnd"))
rmsk = readr::read_tsv(
  "~/human/publichuman/hg37_ucsc/rmsk.Simple_repeat.genoName_genoStart_genoEnd.txt.gz",
  col_names = c("chrom", "chromStart", "chromEnd"))
simpleRepeat = rbind(simpleRepeat, rmsk)
rm(rmsk)
# convert from 0-based position to 1-based
simpleRepeat[, 2] = simpleRepeat[, 2] + 1
simpleRepeat = GenomicRanges::makeGRangesFromDataFrame(
  as.data.frame(simpleRepeat),
  seqnames.field = "chrom",
  start.field    = "chromStart",
  end.field      = "chromEnd")
# remove duplicates
simpleRepeat = GenomicRanges::union(simpleRepeat, GenomicRanges::GRanges())

window = read.table("window.hg37.20MB.bed", header = FALSE)
colnames(window) = c("chr", "start", "end", "window")
at = GenomicRanges::makeGRangesFromDataFrame(window[, 1:3])
barcodesuffix = paste0(".", window$window)

sc = read.csv(
  "GSE129785_scATAC-TME-All.cell_barcodes.txt",
  header = TRUE,
  sep = "\t")

Compute and save fragmentoverlap.

sample = "GSM3722064"
tissue = "SU008_Tumor_Pre"

bc = sc$Barcodes[sc$Group == tissue]
SU008_Tumor_Pre_fragmentoverlap =
  fragmentoverlapcount(
    paste0("SRX5679934/", sample, "_", tissue, "_fragments.tsv.gz"),
    at,
    excluderegions = simpleRepeat,
    targetbarcodes = bc,
    Tn5offset = c(1, 0),
    barcodesuffix = barcodesuffix
  )

CNV

You can skip above and load preprocessed data attached to the package. The data file GSE129785_SU008_Tumor_Pre.RData is also available from https://doi.org/10.6084/m9.figshare.23574066

data(GSE129785_SU008_Tumor_Pre)

Infer CNVs.

levels = c(2, 4)
result = cnv(SU008_Tumor_Pre_fragmentoverlap,
             SU008_Tumor_Pre_windowcovariates,
             levels = levels,
             deltaBICthreshold = -600)
#> [1] "Computing span = 2"
#> [1] "Computing span = 3"
#> [1] "Computing span = 4"
#> [1] "Computing span = 5"
#> [1] "Computing span = 6"
#> [1] "Computing span = 7"
#> [1] "Computing span = 8"
#> [1] "Computing span = 9"
#> [1] "Computing span = 10"
#> [1] "Computing span = 2"
#> [1] "Computing span = 3"
#> [1] "Computing span = 4"
#> [1] "Computing span = 5"
#> [1] "Computing span = 6"
#> [1] "Computing span = 7"
#> [1] "Computing span = 8"
#> [1] "Computing span = 9"
#> [1] "Computing span = 10"

Attach the result to fragmentoverlap.

windowcovariates = SU008_Tumor_Pre_windowcovariates
windowcovariates$w =
  as.numeric(sub("window_", "", windowcovariates$window))

fragmentoverlap = SU008_Tumor_Pre_fragmentoverlap
fragmentoverlap$cell =
  sub(".window.*", "", fragmentoverlap$barcode)
fragmentoverlap$window =
  sub(".*window", "window", fragmentoverlap$barcode)
fragmentoverlap$w =
  as.numeric(sub("window_", "", fragmentoverlap$window))

x = match(fragmentoverlap$barcode,
        result$cellwindowCN$barcode)
fragmentoverlap$CN = result$cellwindowCN$CN[x]
fragmentoverlap$ploidy.moment.cell = result$cellwindowCN$ploidy.moment.cell[x]
fragmentoverlap = fragmentoverlap[!is.na(fragmentoverlap$CN), ]

# For better hierarchical clustering
fragmentoverlap$pwindownormalizedcleanedceiled =
  pmin(fragmentoverlap$CN, min(levels) * 2)

Make dataframe for plotting.

dataplot =
  fragmentoverlap %>%
  dplyr::select("w", "cell", "pwindownormalizedcleanedceiled") %>%
  tidyr::pivot_wider(names_from = "w", values_from = "pwindownormalizedcleanedceiled")
dataplot = as.data.frame(dataplot)
rownames(dataplot) = dataplot$cell
dataplot = dataplot[, colnames(dataplot) != "cell"]
dataplot = as.matrix(dataplot)
n = max(as.numeric(colnames(dataplot)))
dataplot = dataplot[, match(as.character(1:n), colnames(dataplot))]
colnames(dataplot) = as.character(1:n)

Plot.

breaks = c(0, min(levels) - 1, min(levels) + 1, min(levels) * 2)

x = windowcovariates
x$chr[duplicated(windowcovariates$chr)] = NA
x = x$chr[match(colnames(dataplot), x$w)]

RowSideColors =
  unlist(
    lapply(
      fragmentoverlap$ploidy.moment.cell[
        match(rownames(dataplot), fragmentoverlap$cell)],
      function (x) { which(sort(levels) == x)}))
RowSideColors = topo.colors(length(levels))[RowSideColors]

gplots::heatmap.2(
  dataplot,
  Colv = FALSE,
  dendrogram = "none",
  breaks = breaks,
  col = c("blue", "gray80", "red"),
  trace = "none", labRow = FALSE, na.color = "white",
  labCol = x,
  RowSideColors= RowSideColors)

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.