R Code

This appendix contains all of the R code to reproduce the work presented. Note, I have omitted the lines of code defining captions for figures/tables for formatting purposes.
options(knitr.graphics.auto_pdf = TRUE)
knitr::opts_chunk$set(
  echo      = FALSE,
  warning   = FALSE,
  message   = FALSE,
  error     = FALSE,
  cache     = TRUE,
  fig.align = "center",
  fig.width = 5,
  fig.height = 4,
  dev.args = list(pdf = list(pointsize = 10),
                  png = list(pointsize = 10)),
  dpi = 300
)
tex  <- TRUE
docx <- FALSE
html <- FALSE
if (!knitr:::is_latex_output()) {
    tex <- FALSE
    if (knitr:::is_html_output()) {
        html <- TRUE
    } else {
        docx <- TRUE
    }
}
library(filer2020A)
library(filer2020B)
library(mcCNV)
library(xtable)
library(eulerr)
library(dlfUtils)
library(parallel)
library(grid)
library(kableExtra)
library(MASS)
library(tufte)
defOutWid <- if (html) '70%' else NULL
dblOutWid <- if (html) '40%' else '49%'
data(subjectMeta)
poolTbl <- subjectMeta[ ,
                       .(N = .N,
                         medExon = round(median(medIntMolCount), 0),
                         medTotal = round(median(totalMolCount), 0),
                         minTotal = min(totalMolCount),
                         maxTotal = max(totalMolCount),
                         rsdTotal = sd(totalMolCount)/mean(totalMolCount)*100),
                       by = .(pool, capture, multiplexCapture)]
poolTbl[ , rsdTotal := round(rsdTotal, 1)]
poolTbl[(!multiplexCapture), pool := paste0(pool, "$^\\dagger$")]
kable(poolTbl[ , .(pool, capture, N, medExon, medTotal, 
                   minTotal, maxTotal, rsdTotal)], 
      row.names = FALSE, 
      label = "poolSummary",
      format.args = list(big.mark = ",", scientific = FALSE),
      booktabs = TRUE,
      caption.short = "Summary of whole-exome sequencing for CNV project.",
      escape = FALSE) %>%
  kable_classic()
## Create IC pools
smplSubject <- function(poolName, n) {
  data(subjectMeta, envir = environment())
  subjectMeta[pool == poolName, sample(subject, n, replace = FALSE)]
}
set.seed(1234)
pools <- c(replicate(5, smplSubject("NCGENES", 16), simplify = FALSE))
names(pools) <- c(sprintf("randNCG_%d", 1:5))
pools <- c(pools,
           with(subjectMeta[pool != "NCGENES"], split(subject, pool)))
## Calculate mean-variance by pool
mnvr <- mclapply(pools, subsetCounts, mc.cores = length(pools))
mnvr <- mclapply(mnvr, calcIntStats, mc.cores = length(mnvr))
for (i in seq_along(mnvr)) {
  mnvr[[i]][ , pool := names(mnvr)[i]]
}
mnvr <- rbindlist(mnvr)
setkey(mnvr, pool); setcolorder(mnvr)

## Estimate alpha0
alpha0 <- mclapply(pools, estAlpha0, mc.cores = length(pools))
a0tbl <- data.table(pool = names(alpha0),
                    a0 = sapply(alpha0, "[[", "a0"),
                    N = sapply(alpha0, "[[", "N"))
a0tbl[ , aMn := a0/N]
calcRange <- function(x) {
  subjectMeta[subject %in% x,
              .(mnCount = min(totalMolCount),
                mdCount = median(totalMolCount),
                mxCount = max(totalMolCount),
                rsCount = sd(totalMolCount)/mean(totalMolCount)*100)]
}
poolCts <- lapply(pools, calcRange)
poolCts <- lapply(names(poolCts), function(x) poolCts[[x]][ , pool := x])
poolCts <- rbindlist(poolCts)
a0tbl <- merge(a0tbl, poolCts)
a0tbl[ , mc := !grepl("IDT-IC|rand", pool)]
a0tbl[ , idt := grepl("IDT", pool)]
pltAlpha0(a0tbl)
aglPools <- c(sprintf("randNCG_%d", 1:5), "Pool1", "Pool2", "WGS", "SMA1", "SMA2")
with(mnvr[pool %in% aglPools], {
  pltMnVrCont(dat = as.data.table(as.list(environment())),
              grpVec = factor(pool, levels = aglPools),
              colVec = c(rep('darkblue', 5), rep('darkorange', 5)),
              lgnd = FALSE)
})
idtPools <- c("IDT-MC", "IDT-IC", "IDT-RR")
with(mnvr[pool %in% idtPools], {
  pltMnVrCont(dat = as.data.table(as.list(environment())),
              grpVec = factor(pool, levels = idtPools),
              colVec = c("darkorange", "darkblue", "darkorange"),
              lgnd = FALSE)
})
with(mnvr[pool %in% c("WGS", "IDT-RR")], {
  pltMnVrCont(dat = as.data.table(as.list(environment())),
              grpVec = factor(pool, levels = c("WGS", "IDT-RR")),
              colVec = c("darkblue", "darkorange"))
})
pltSubjectStatByPool("medIntMolCount", ylab = "Median count per exon")
pltSubjectStatByPool("nSelected", ylab = "Number of controls selected")
pltSubjectStatByPool("propSelected", ylab = "Proportion of controls selected")
pltSubjectStatByPool("overallPhi", ylab = "Overdispersion (phi)")
setkey(subjectMeta, subject)
data(subjectCorr)
setkey(subjectCorr, subject)
subjectCorr <- subjectMeta[subjectCorr]
data(simRes)
procSimRes <- lapply(simRes, function(x) procRes(x$clpRes))

simResTbl <- lapply(procSimRes,
                    function(x) x$mnDat[ , .(mcc, tpr, fdr), keyby = dep])
simResTbl <- Reduce(merge, simResTbl)
setnames(simResTbl,
         c("dep",
           "mcMCC", "mcTPR", "mcFDR",  ## mcCNV
           "edMCC", "edTPR", "edFDR",  ## ExomeDepthDefault
           "ebMCC", "ebTPR", "ebFDR")) ## ExomeDepthBest
setcolorder(simResTbl,
            c("dep", "mcMCC", "edMCC", "ebMCC", "mcTPR",
              "edTPR", "ebTPR", "mcFDR", "edFDR", "ebFDR"))
simResTbl <- simResTbl[ , lapply(.SD, signif, 3), by = dep]
pltStatCompare(xRes = procSimRes$ExomeDepthDefault, yRes = procSimRes$mcCNV,
               stat = "mcc", xlab = "ExomeDepth (default)", ylab = "mcCNV")
addfiglab("A")
pltStatCompare(xRes = procSimRes$ExomeDepthDefault, yRes = procSimRes$mcCNV,
               stat = "tpr", xlab = "ExomeDepth (default)", ylab = "mcCNV")
addfiglab("B")
pltStatCompare(xRes = procSimRes$ExomeDepthDefault, yRes = procSimRes$mcCNV,
               stat = "fdr", xlab = "ExomeDepth (default)", ylab = "mcCNV")
addfiglab("C")
pltStatCompare(xRes = procSimRes$ExomeDepthBest, yRes = procSimRes$mcCNV,
               stat = "mcc", xlab = "ExomeDepth (correct)", ylab = "mcCNV")
addfiglab("D")
pltStatCompare(xRes = procSimRes$ExomeDepthBest, yRes = procSimRes$mcCNV,
               stat = "tpr", xlab = "ExomeDepth (correct)", ylab = "mcCNV")
addfiglab("E")
pltStatCompare(xRes = procSimRes$ExomeDepthBest, yRes = procSimRes$mcCNV,
               stat = "fdr", xlab = "ExomeDepth (correct)", ylab = "mcCNV")
addfiglab("F")
kable(simResTbl,
      col.names = c("dep", rep(c("mcCNV", "ED-def", "ED-sim"), 3)),
      row.names = FALSE, 
      label = "simResTbl",
      format.args = list(big.mark = ",", scientific = FALSE),
      booktabs = TRUE) %>%
  kable_classic() %>%
  add_header_above(c(" " = 1, "MCC" = 3, "TPR" = 3, "FDR" = 3))
data(wgsPoolCalls)
mergeAll <- function(x, y) merge(x, y, all = TRUE)
wgs <- Reduce(mergeAll, wgsPoolCalls)
data(intAgl)
xpandInt <- function(int, sbjVec) {
  lst <- vector(mode = "list", length = length(sbjVec))
  names(lst) <- sbjVec
  for (s in sbjVec) {
    lst[[s]] <- copy(int)
    lst[[s]][ , subject := s]
  }
  rbindlist(lst)
}
wgsAgl <- xpandInt(intAgl, wgs[ , unique(subject)])
setkeyv(wgsAgl, key(wgs))
wgs <- wgs[wgsAgl]
rm(wgsAgl)
wgs <- wgs[!(rlcr),
           .(mcDup = !is.na(passFilter) & CN > 1,
             edDup = !is.na(type) & type == "duplication",
             wgDup = !is.na(erds) & !is.na(cnvpytor) & erds == "dup",
             mcDel = !is.na(passFilter) & CN < 1,
             edDel = !is.na(type) & type == "deletion",
             wgDel = !is.na(erds) & !is.na(cnvpytor) & erds == "del"),
           by = .(subject, seqnames, start, end)]
wgs[ , mc := mcDup | mcDel]
wgs[ , ed := edDup | edDel]
wgs[ , wg := wgDup | wgDel]
setcolorder(wgs, c(key(wgs), 'mc', 'ed', 'wg'))
wgsCallBySbj <- wgs[ , lapply(.SD, sum), .SDcols = is.logical, by = subject]
kable(wgsCallBySbj,
      col.names = c("subject", rep(c("MC", "ED", "WG"), 3)),
      row.names = FALSE, 
      label = "wgsCallSbj",
      format.args = list(big.mark = ",", scientific = FALSE),
      booktabs = TRUE) %>%
  kable_classic() %>%
  add_header_above(c(" " = 1, "Total" = 3, "Duplications" = 3, "Deletions" = 3))
pmLst <- list()
pmLst$mc <- with(wgs, evalPred(mc, wg))
pmLst$ed <- with(wgs, evalPred(ed, wg))
pmLst$mcSub <- with(wgs[!grepl("790|851", subject)], evalPred(mc, wg))
pmLst$edSub <- with(wgs[!grepl("790|851", subject)], evalPred(ed, wg))
pmLst$mcDup <- with(wgs, evalPred(mcDup, wgDup))
pmLst$edDup <- with(wgs, evalPred(edDup, wgDup))
pmLst$mcSubDup <- with(wgs[!grepl("790|851", subject)],
                       evalPred(mcDup, wgDup))
pmLst$edSubDup <- with(wgs[!grepl("790|851", subject)],
                       evalPred(edDup, wgDup))
pmLst$mcDel <- with(wgs, evalPred(mcDel, wgDel))
pmLst$edDel <- with(wgs, evalPred(edDel, wgDel))
pmLst$mcSubDel <- with(wgs[!grepl("790|851", subject)],
                       evalPred(mcDel, wgDel))
pmLst$edSubDel <- with(wgs[!grepl("790|851", subject)],
                       evalPred(edDel, wgDel))
predMetrics <- as.data.table(do.call(rbind, pmLst), keep.rownames = "PredSet")
predMetrics[ , typ := rep(c("ALL", "DUP", "DEL"), each = 4)]
predMetrics[ , set := ifelse(grepl("Sub", PredSet), "Sub", "Total")]
predMetrics[ , alg := ifelse(grepl("mc", PredSet), "MC", "ED")]
setcolorder(predMetrics, c("typ", "set", "alg"))
kable(predMetrics[ , .(typ, set, alg, MCC, TPR, FDR, PPV)],
      col.names = c(rep("", 3), "MCC", "TPR", "FDR", "PPV"),
      row.names = FALSE, 
      label = "predMet",
      format.args = list(big.mark = ",", scientific = FALSE),
      booktabs = TRUE) %>%
  kable_classic() %>%
  collapse_rows(columns = 1:3, valign = "middle")
ctsAll <- euler(wgs[ , .(mc, ed, wg)])
ctsSub <- euler(wgs[!grepl("790|851", subject), .(mc, ed, wg)])
ctsAllDup <- euler(wgs[ , .(mcDup, edDup, wgDup)])
ctsSubDup <- euler(wgs[!grepl("790|851", subject), .(mcDup, edDup, wgDup)])
ctsAllDel <- euler(wgs[ , .(mcDel, edDel, wgDel)])
ctsSubDel <- euler(wgs[!grepl("790|851", subject), .(mcDel, edDel, wgDel)])
eulerr_options(fills = list(fill = c("#E9E9E9", "#7F7FC4", "#FFC57F")),
               quantities = list(cex = 0.5))
gridFigLab <- function(lab) {
  grid.text(lab, x = 0, y = 1, hjust = 0, vjust = 1, gp = gpar(font = 2))
}
eulerr_options(fills = list(fill = c("#E9E9E9", "#7F7FC4", "#FFC57F")),
               quantities = list(cex = 0.5))
plot(ctsAllDup, quantities = TRUE, labels = FALSE, main = "")
gridFigLab("A")
grid.text("DUPLICATIONS", x = 0.5, y = 0.9)
plot(ctsAllDel, quantities = TRUE, labels = FALSE, main = "")
gridFigLab("B")
grid.text("DELETIONS", x = 0.5, y = 0.9)

eulerr_options(fills = list(fill = c("#E9E9E9", "#7F7FC4", "#FFC57F")),
               quantities = list(cex = 0.5))
plot(ctsSubDup, quantities = TRUE, labels = FALSE, main = "")
gridFigLab("A")
grid.text("DUPLICATIONS", x = 0.5, y = 0.9)
plot(ctsSubDel, quantities = TRUE, labels = FALSE, main = "")
gridFigLab("B")
grid.text("DELETIONS", x = 0.5, y = 0.9)
## Estimate maternal-fetal genotypes from cell-free exome sequencing
data(gt)
gt[ , udep := ref + alt]
gt[ , use := udep > 80 & ref > 5 & alt > 5]
for (s in c("S1", "S2", "FES-0034-4")) {
  gt[smp == s & use,
     c("ff", "gtCall", "gtLike") := callSmpl(alt, udep, .N, median(sdep/ldep))]
}
## Calculate median read depth & fetal fraction estimates by sample
smry <- gt[(use), .(md = median(udep), ff = ff[1]), by = smp]
setkey(smry, smp)
## Allele depth histograms
# par(oma = c(3, 0, 0, 0), mfrow = c(3, 1))
layout(matrix(1:4, ncol = 1), heights = c(rep(10, 3), 1))
with(gt[use & smp == "S1"], cfPltFreqHist(alt, udep, gtCall, ff = ff[1]))
title(ylab = "Case 1", line = 1)
addfiglab("A")

data(rs140468248)
data(GenoMeta)
with(gt[use & smp == "S2"], cfPltFreqHist(alt, udep, gtCall, ff = ff[1]))
oiCall <- gt[varid == rs140468248, gtCall]
oiPmar <- gt[varid == rs140468248, alt/(ref + alt)]
abline(v = oiPmar, col = GenoMeta$color[GenoMeta$name == oiCall])
text(x = oiPmar,
     y = grconvertY(0.75, "nfc"),
     "rs140468248",
     srt = 90,
     adj = c(0.5, 1.5),
     col = GenoMeta$color[GenoMeta$name == oiCall])
title(ylab = "Case 2", line = 1)
addfiglab("B", units = 'nfc')

with(gt[use & smp == "FES-0034-4"], cfPltFreqHist(alt, udep, gtCall, ff = ff[1]))
title(ylab = "Case 3", line = 1)
addfiglab("C", units = 'nfc')

par(mar = rep(0, 4))
plot.new()
legend(x = grconvertX(0.5, from = "nfc"),
       y = grconvertY(0.5, from = "nfc"),
       legend = GenoMeta$name,
       horiz = TRUE,
       lwd = 4,
       col = GenoMeta$color,
       xjust = 0.5,
       yjust = 0.5,
       xpd = NA,
       bty = "n")
## Collect read depth and filtering information into a table
data(readSmry)
rs <- readSmry$summary
rs[ , pctDup := round(pctDup, 2)]
rs[ , pctFlt := round(pctFlt, 2)]
setkey(rs, smp)
rs <- smry[rs]
rs[ , ff := round(ff, 3)]
rs[smp == "S1", case := "Case 1"]
rs[smp == "S2", case := "Case 2"]
rs[grepl("FES", smp), case := "Case 3"]
caseSmry <- rs[!is.na(ff)][order(case)][ , {
  .(GA = sprintf('(ref:case%dga)', 1:3),
    `Clinical findings` = sprintf('(ref:case%dcc)', 1:3),
    `Genetic diagnosis` = sprintf('(ref:case%dgd)', 1:3),
    FF = ff, 
    Dep = md, 
    `%Dup` = pctDup, 
    `%Filt` = pctFlt)
}]
kable(caseSmry, 
      row.names = TRUE,
      label = "caseSmry",
      format.args = list(big.mark = ",", scientific = FALSE),
      booktabs = TRUE,
      caption.short = "Cell-free exome sequencing case summaries") %>%
  kable_classic() %>%
  column_spec(3:4, width = "10em")
## Load c3MatFetReads
data(c3MatFetReads)
## Calculate density and ECDF for fragment read lengths
matDen <- density(c3MatFetReads[source == "maternal", isize])
fetDen <- density(c3MatFetReads[source == "fetal",    isize])
matCdf <- ecdf(c3MatFetReads[source == "maternal", isize])
fetCdf <- ecdf(c3MatFetReads[source == "fetal",    isize])

## Generate plots
par(mfrow = c(1, 2), oma = c(3, 0, 0, 0))
par(mar = c(4, 4, 1, 1))
plot.new()
plot.window(xlim = range(c(matDen$x, fetDen$x)), range(c(matDen$y, fetDen$y)))
lines(matDen, col = "darkblue", lwd = 2)
lines(fetDen, col = "darkorange", lwd =2)
axis(side = 1)
title(xlab = "Fragment length (insert size)", ylab = "Density")
addfiglab("A")
plot.new()
xv <- seq(50, 300, 1)
plot.window(xlim = range(xv), ylim = 0:1)
points(x = xv, y = matCdf(xv), col = "darkblue", lwd = 2, type = "l")
points(x = xv, y = fetCdf(xv), col = "darkorange", lwd = 2, type = "l")
addfiglab("B", units = 'nfc')
axis(side = 1)
axis(side = 2)
title(xlab = "Fragment length (insert size)",
      ylab = "Cumulative distribution")
legend(x = grconvertX(0.5, from = "ndc"),
       y = line2user(2, side = 1, outer = TRUE),
       legend = c("Maternal", "Fetal"),
       horiz = TRUE,
       lwd = 2,
       col = c("darkblue", "darkorange"),
       xjust = 0.5,
       yjust = 0.5,
       xpd = NA,
       bty = "n")
gt[ , pmar := alt/(alt + ref)]
gt[ , sratio := sdep/adep]
pltSratioByPmar <- function(smpNm) {
  par(mar = c(4, 4, 1, 1))
  with(gt[gtCall == "AAab" & smp == smpNm], {
    plot(pmar ~ sratio,
         xlab = "Proportion of fragments < 140 bp",
         ylab = "PMAR",
         pch = 16,
         cex = 0.5,
         col = col2alpha('darkgray'),
         bty = "n")
  })
  with(gt[gtCall == "AAab" & smp == smpNm], {
    contour(kde2d(x = sratio, y = pmar, n = 500),
            nlevels = 25,
            add = TRUE,
            drawlabels = FALSE,
            col = "darkblue")
  })
}
pltSratioByPmar("S1")
addfiglab("A")
pltSratioByPmar("S2")
addfiglab("B")
pltSratioByPmar("FES-0034-4")
addfiglab("C")
## Calculate call concordance between cell-free and direct for Case 3
c3 <- gt[(smp == "FES-0034-0" & udep > 30 & gt %in% c("0/0", "0/1", "1/1")) |
           (smp == "FES-0034-1" & udep > 30 & gt %in% c("0/0", "0/1", "1/1")) |
           (use & smp == "FES-0034-4")]
c3[ , mrg := substr(gtCall, 3, 4)]
c3[ , mrg := c("0/0", "0/1", "1/1")[match(mrg, c("aa", "ab", "bb"))]]
c3[is.na(gtCall), mrg := gt]
c3 <- dcast(c3, varid ~ smp, value.var = "mrg")
setnames(c3, c("varid", "fet", "mat", "cff"))
c3 <- c3[ , .N, by = .(mat, fet, cff)]
## call by call matrix, cell-free calls in columns, fetal in rows
c3FetByCf <- dcast(c3[!is.na(fet) & !is.na(cff)], 
                   fet ~ cff,
                   fun.aggregate = sum, 
                   value.var = "N")
setkey(c3FetByCf, fet)
## all c3 calls
c3Calls <- c3[!is.na(mat) & !is.na(fet) & !is.na(cff), ][order(mat, fet, cff)]
kable(c3FetByCf[ , .(" " = "Fetal", " " = fet, `0/0`, `0/1`, `1/1`)], 
      row.names = FALSE,
      label = "c3FetByCf",
      format.args = list(big.mark = ",", scientific = FALSE),
      booktabs = TRUE,
      caption.short = 'Case 3 fetal versus cell-free genotype calls.') %>%
  kable_classic() %>%
  collapse_rows(columns = 1, valign = "middle") %>%
  add_header_above(c(" " = 2, "Cell-free" = 3))
kable(c3Calls[ , .(Maternal = mat, Fetal = fet, `Cell-free` = cff, N)], 
      row.names = FALSE,
      label = "c3Calls",
      format.args = list(big.mark = ",", scientific = FALSE),
      booktabs = TRUE,
      caption.short = 'Maternal, fetal, and cell-free genotype calls.') %>%
  kable_classic() %>%
  collapse_rows(columns = 1:2, valign = "middle", latex_hline = "full") 
pltExpMAF(500)
pltMissclass()