R Code
This appendix contains all of the R code to reproduce the work presented. Note, I have omitted the lines of code defining captions for figures/tables for formatting purposes.
options(knitr.graphics.auto_pdf = TRUE)
::opts_chunk$set(
knitrecho = FALSE,
warning = FALSE,
message = FALSE,
error = FALSE,
cache = TRUE,
fig.align = "center",
fig.width = 5,
fig.height = 4,
dev.args = list(pdf = list(pointsize = 10),
png = list(pointsize = 10)),
dpi = 300
)<- TRUE
tex <- FALSE
docx <- FALSE
html if (!knitr:::is_latex_output()) {
<- FALSE
tex if (knitr:::is_html_output()) {
<- TRUE
html else {
} <- TRUE
docx
}
}library(filer2020A)
library(filer2020B)
library(mcCNV)
library(xtable)
library(eulerr)
library(dlfUtils)
library(parallel)
library(grid)
library(kableExtra)
library(MASS)
library(tufte)
<- if (html) '70%' else NULL
defOutWid <- if (html) '40%' else '49%'
dblOutWid data(subjectMeta)
<- subjectMeta[ ,
poolTbl N = .N,
.(medExon = round(median(medIntMolCount), 0),
medTotal = round(median(totalMolCount), 0),
minTotal = min(totalMolCount),
maxTotal = max(totalMolCount),
rsdTotal = sd(totalMolCount)/mean(totalMolCount)*100),
= .(pool, capture, multiplexCapture)]
by := round(rsdTotal, 1)]
poolTbl[ , rsdTotal !multiplexCapture), pool := paste0(pool, "$^\\dagger$")]
poolTbl[(kable(poolTbl[ , .(pool, capture, N, medExon, medTotal,
minTotal, maxTotal, rsdTotal)], row.names = FALSE,
label = "poolSummary",
format.args = list(big.mark = ",", scientific = FALSE),
booktabs = TRUE,
caption.short = "Summary of whole-exome sequencing for CNV project.",
escape = FALSE) %>%
kable_classic()
## Create IC pools
<- function(poolName, n) {
smplSubject data(subjectMeta, envir = environment())
== poolName, sample(subject, n, replace = FALSE)]
subjectMeta[pool
}set.seed(1234)
<- c(replicate(5, smplSubject("NCGENES", 16), simplify = FALSE))
pools names(pools) <- c(sprintf("randNCG_%d", 1:5))
<- c(pools,
pools with(subjectMeta[pool != "NCGENES"], split(subject, pool)))
## Calculate mean-variance by pool
<- mclapply(pools, subsetCounts, mc.cores = length(pools))
mnvr <- mclapply(mnvr, calcIntStats, mc.cores = length(mnvr))
mnvr for (i in seq_along(mnvr)) {
:= names(mnvr)[i]]
mnvr[[i]][ , pool
}<- rbindlist(mnvr)
mnvr setkey(mnvr, pool); setcolorder(mnvr)
## Estimate alpha0
<- mclapply(pools, estAlpha0, mc.cores = length(pools))
alpha0 <- data.table(pool = names(alpha0),
a0tbl a0 = sapply(alpha0, "[[", "a0"),
N = sapply(alpha0, "[[", "N"))
:= a0/N]
a0tbl[ , aMn <- function(x) {
calcRange %in% x,
subjectMeta[subject mnCount = min(totalMolCount),
.(mdCount = median(totalMolCount),
mxCount = max(totalMolCount),
rsCount = sd(totalMolCount)/mean(totalMolCount)*100)]
}<- lapply(pools, calcRange)
poolCts <- lapply(names(poolCts), function(x) poolCts[[x]][ , pool := x])
poolCts <- rbindlist(poolCts)
poolCts <- merge(a0tbl, poolCts)
a0tbl := !grepl("IDT-IC|rand", pool)]
a0tbl[ , mc := grepl("IDT", pool)]
a0tbl[ , idt pltAlpha0(a0tbl)
<- c(sprintf("randNCG_%d", 1:5), "Pool1", "Pool2", "WGS", "SMA1", "SMA2")
aglPools with(mnvr[pool %in% aglPools], {
pltMnVrCont(dat = as.data.table(as.list(environment())),
grpVec = factor(pool, levels = aglPools),
colVec = c(rep('darkblue', 5), rep('darkorange', 5)),
lgnd = FALSE)
})<- c("IDT-MC", "IDT-IC", "IDT-RR")
idtPools with(mnvr[pool %in% idtPools], {
pltMnVrCont(dat = as.data.table(as.list(environment())),
grpVec = factor(pool, levels = idtPools),
colVec = c("darkorange", "darkblue", "darkorange"),
lgnd = FALSE)
})with(mnvr[pool %in% c("WGS", "IDT-RR")], {
pltMnVrCont(dat = as.data.table(as.list(environment())),
grpVec = factor(pool, levels = c("WGS", "IDT-RR")),
colVec = c("darkblue", "darkorange"))
})pltSubjectStatByPool("medIntMolCount", ylab = "Median count per exon")
pltSubjectStatByPool("nSelected", ylab = "Number of controls selected")
pltSubjectStatByPool("propSelected", ylab = "Proportion of controls selected")
pltSubjectStatByPool("overallPhi", ylab = "Overdispersion (phi)")
setkey(subjectMeta, subject)
data(subjectCorr)
setkey(subjectCorr, subject)
<- subjectMeta[subjectCorr]
subjectCorr data(simRes)
<- lapply(simRes, function(x) procRes(x$clpRes))
procSimRes
<- lapply(procSimRes,
simResTbl function(x) x$mnDat[ , .(mcc, tpr, fdr), keyby = dep])
<- Reduce(merge, simResTbl)
simResTbl setnames(simResTbl,
c("dep",
"mcMCC", "mcTPR", "mcFDR", ## mcCNV
"edMCC", "edTPR", "edFDR", ## ExomeDepthDefault
"ebMCC", "ebTPR", "ebFDR")) ## ExomeDepthBest
setcolorder(simResTbl,
c("dep", "mcMCC", "edMCC", "ebMCC", "mcTPR",
"edTPR", "ebTPR", "mcFDR", "edFDR", "ebFDR"))
<- simResTbl[ , lapply(.SD, signif, 3), by = dep]
simResTbl pltStatCompare(xRes = procSimRes$ExomeDepthDefault, yRes = procSimRes$mcCNV,
stat = "mcc", xlab = "ExomeDepth (default)", ylab = "mcCNV")
addfiglab("A")
pltStatCompare(xRes = procSimRes$ExomeDepthDefault, yRes = procSimRes$mcCNV,
stat = "tpr", xlab = "ExomeDepth (default)", ylab = "mcCNV")
addfiglab("B")
pltStatCompare(xRes = procSimRes$ExomeDepthDefault, yRes = procSimRes$mcCNV,
stat = "fdr", xlab = "ExomeDepth (default)", ylab = "mcCNV")
addfiglab("C")
pltStatCompare(xRes = procSimRes$ExomeDepthBest, yRes = procSimRes$mcCNV,
stat = "mcc", xlab = "ExomeDepth (correct)", ylab = "mcCNV")
addfiglab("D")
pltStatCompare(xRes = procSimRes$ExomeDepthBest, yRes = procSimRes$mcCNV,
stat = "tpr", xlab = "ExomeDepth (correct)", ylab = "mcCNV")
addfiglab("E")
pltStatCompare(xRes = procSimRes$ExomeDepthBest, yRes = procSimRes$mcCNV,
stat = "fdr", xlab = "ExomeDepth (correct)", ylab = "mcCNV")
addfiglab("F")
kable(simResTbl,
col.names = c("dep", rep(c("mcCNV", "ED-def", "ED-sim"), 3)),
row.names = FALSE,
label = "simResTbl",
format.args = list(big.mark = ",", scientific = FALSE),
booktabs = TRUE) %>%
kable_classic() %>%
add_header_above(c(" " = 1, "MCC" = 3, "TPR" = 3, "FDR" = 3))
data(wgsPoolCalls)
<- function(x, y) merge(x, y, all = TRUE)
mergeAll <- Reduce(mergeAll, wgsPoolCalls)
wgs data(intAgl)
<- function(int, sbjVec) {
xpandInt <- vector(mode = "list", length = length(sbjVec))
lst names(lst) <- sbjVec
for (s in sbjVec) {
<- copy(int)
lst[[s]] := s]
lst[[s]][ , subject
}rbindlist(lst)
}<- xpandInt(intAgl, wgs[ , unique(subject)])
wgsAgl setkeyv(wgsAgl, key(wgs))
<- wgs[wgsAgl]
wgs rm(wgsAgl)
<- wgs[!(rlcr),
wgs mcDup = !is.na(passFilter) & CN > 1,
.(edDup = !is.na(type) & type == "duplication",
wgDup = !is.na(erds) & !is.na(cnvpytor) & erds == "dup",
mcDel = !is.na(passFilter) & CN < 1,
edDel = !is.na(type) & type == "deletion",
wgDel = !is.na(erds) & !is.na(cnvpytor) & erds == "del"),
= .(subject, seqnames, start, end)]
by := mcDup | mcDel]
wgs[ , mc := edDup | edDel]
wgs[ , ed := wgDup | wgDel]
wgs[ , wg setcolorder(wgs, c(key(wgs), 'mc', 'ed', 'wg'))
<- wgs[ , lapply(.SD, sum), .SDcols = is.logical, by = subject]
wgsCallBySbj kable(wgsCallBySbj,
col.names = c("subject", rep(c("MC", "ED", "WG"), 3)),
row.names = FALSE,
label = "wgsCallSbj",
format.args = list(big.mark = ",", scientific = FALSE),
booktabs = TRUE) %>%
kable_classic() %>%
add_header_above(c(" " = 1, "Total" = 3, "Duplications" = 3, "Deletions" = 3))
<- list()
pmLst $mc <- with(wgs, evalPred(mc, wg))
pmLst$ed <- with(wgs, evalPred(ed, wg))
pmLst$mcSub <- with(wgs[!grepl("790|851", subject)], evalPred(mc, wg))
pmLst$edSub <- with(wgs[!grepl("790|851", subject)], evalPred(ed, wg))
pmLst$mcDup <- with(wgs, evalPred(mcDup, wgDup))
pmLst$edDup <- with(wgs, evalPred(edDup, wgDup))
pmLst$mcSubDup <- with(wgs[!grepl("790|851", subject)],
pmLstevalPred(mcDup, wgDup))
$edSubDup <- with(wgs[!grepl("790|851", subject)],
pmLstevalPred(edDup, wgDup))
$mcDel <- with(wgs, evalPred(mcDel, wgDel))
pmLst$edDel <- with(wgs, evalPred(edDel, wgDel))
pmLst$mcSubDel <- with(wgs[!grepl("790|851", subject)],
pmLstevalPred(mcDel, wgDel))
$edSubDel <- with(wgs[!grepl("790|851", subject)],
pmLstevalPred(edDel, wgDel))
<- as.data.table(do.call(rbind, pmLst), keep.rownames = "PredSet")
predMetrics := rep(c("ALL", "DUP", "DEL"), each = 4)]
predMetrics[ , typ := ifelse(grepl("Sub", PredSet), "Sub", "Total")]
predMetrics[ , set := ifelse(grepl("mc", PredSet), "MC", "ED")]
predMetrics[ , alg setcolorder(predMetrics, c("typ", "set", "alg"))
kable(predMetrics[ , .(typ, set, alg, MCC, TPR, FDR, PPV)],
col.names = c(rep("", 3), "MCC", "TPR", "FDR", "PPV"),
row.names = FALSE,
label = "predMet",
format.args = list(big.mark = ",", scientific = FALSE),
booktabs = TRUE) %>%
kable_classic() %>%
collapse_rows(columns = 1:3, valign = "middle")
<- euler(wgs[ , .(mc, ed, wg)])
ctsAll <- euler(wgs[!grepl("790|851", subject), .(mc, ed, wg)])
ctsSub <- euler(wgs[ , .(mcDup, edDup, wgDup)])
ctsAllDup <- euler(wgs[!grepl("790|851", subject), .(mcDup, edDup, wgDup)])
ctsSubDup <- euler(wgs[ , .(mcDel, edDel, wgDel)])
ctsAllDel <- euler(wgs[!grepl("790|851", subject), .(mcDel, edDel, wgDel)])
ctsSubDel eulerr_options(fills = list(fill = c("#E9E9E9", "#7F7FC4", "#FFC57F")),
quantities = list(cex = 0.5))
<- function(lab) {
gridFigLab grid.text(lab, x = 0, y = 1, hjust = 0, vjust = 1, gp = gpar(font = 2))
}eulerr_options(fills = list(fill = c("#E9E9E9", "#7F7FC4", "#FFC57F")),
quantities = list(cex = 0.5))
plot(ctsAllDup, quantities = TRUE, labels = FALSE, main = "")
gridFigLab("A")
grid.text("DUPLICATIONS", x = 0.5, y = 0.9)
plot(ctsAllDel, quantities = TRUE, labels = FALSE, main = "")
gridFigLab("B")
grid.text("DELETIONS", x = 0.5, y = 0.9)
eulerr_options(fills = list(fill = c("#E9E9E9", "#7F7FC4", "#FFC57F")),
quantities = list(cex = 0.5))
plot(ctsSubDup, quantities = TRUE, labels = FALSE, main = "")
gridFigLab("A")
grid.text("DUPLICATIONS", x = 0.5, y = 0.9)
plot(ctsSubDel, quantities = TRUE, labels = FALSE, main = "")
gridFigLab("B")
grid.text("DELETIONS", x = 0.5, y = 0.9)
## Estimate maternal-fetal genotypes from cell-free exome sequencing
data(gt)
:= ref + alt]
gt[ , udep := udep > 80 & ref > 5 & alt > 5]
gt[ , use for (s in c("S1", "S2", "FES-0034-4")) {
== s & use,
gt[smp c("ff", "gtCall", "gtLike") := callSmpl(alt, udep, .N, median(sdep/ldep))]
}## Calculate median read depth & fetal fraction estimates by sample
<- gt[(use), .(md = median(udep), ff = ff[1]), by = smp]
smry setkey(smry, smp)
## Allele depth histograms
# par(oma = c(3, 0, 0, 0), mfrow = c(3, 1))
layout(matrix(1:4, ncol = 1), heights = c(rep(10, 3), 1))
with(gt[use & smp == "S1"], cfPltFreqHist(alt, udep, gtCall, ff = ff[1]))
title(ylab = "Case 1", line = 1)
addfiglab("A")
data(rs140468248)
data(GenoMeta)
with(gt[use & smp == "S2"], cfPltFreqHist(alt, udep, gtCall, ff = ff[1]))
<- gt[varid == rs140468248, gtCall]
oiCall <- gt[varid == rs140468248, alt/(ref + alt)]
oiPmar abline(v = oiPmar, col = GenoMeta$color[GenoMeta$name == oiCall])
text(x = oiPmar,
y = grconvertY(0.75, "nfc"),
"rs140468248",
srt = 90,
adj = c(0.5, 1.5),
col = GenoMeta$color[GenoMeta$name == oiCall])
title(ylab = "Case 2", line = 1)
addfiglab("B", units = 'nfc')
with(gt[use & smp == "FES-0034-4"], cfPltFreqHist(alt, udep, gtCall, ff = ff[1]))
title(ylab = "Case 3", line = 1)
addfiglab("C", units = 'nfc')
par(mar = rep(0, 4))
plot.new()
legend(x = grconvertX(0.5, from = "nfc"),
y = grconvertY(0.5, from = "nfc"),
legend = GenoMeta$name,
horiz = TRUE,
lwd = 4,
col = GenoMeta$color,
xjust = 0.5,
yjust = 0.5,
xpd = NA,
bty = "n")
## Collect read depth and filtering information into a table
data(readSmry)
<- readSmry$summary
rs := round(pctDup, 2)]
rs[ , pctDup := round(pctFlt, 2)]
rs[ , pctFlt setkey(rs, smp)
<- smry[rs]
rs := round(ff, 3)]
rs[ , ff == "S1", case := "Case 1"]
rs[smp == "S2", case := "Case 2"]
rs[smp grepl("FES", smp), case := "Case 3"]
rs[<- rs[!is.na(ff)][order(case)][ , {
caseSmry GA = sprintf('(ref:case%dga)', 1:3),
.(`Clinical findings` = sprintf('(ref:case%dcc)', 1:3),
`Genetic diagnosis` = sprintf('(ref:case%dgd)', 1:3),
FF = ff,
Dep = md,
`%Dup` = pctDup,
`%Filt` = pctFlt)
}]kable(caseSmry,
row.names = TRUE,
label = "caseSmry",
format.args = list(big.mark = ",", scientific = FALSE),
booktabs = TRUE,
caption.short = "Cell-free exome sequencing case summaries") %>%
kable_classic() %>%
column_spec(3:4, width = "10em")
## Load c3MatFetReads
data(c3MatFetReads)
## Calculate density and ECDF for fragment read lengths
<- density(c3MatFetReads[source == "maternal", isize])
matDen <- density(c3MatFetReads[source == "fetal", isize])
fetDen <- ecdf(c3MatFetReads[source == "maternal", isize])
matCdf <- ecdf(c3MatFetReads[source == "fetal", isize])
fetCdf
## Generate plots
par(mfrow = c(1, 2), oma = c(3, 0, 0, 0))
par(mar = c(4, 4, 1, 1))
plot.new()
plot.window(xlim = range(c(matDen$x, fetDen$x)), range(c(matDen$y, fetDen$y)))
lines(matDen, col = "darkblue", lwd = 2)
lines(fetDen, col = "darkorange", lwd =2)
axis(side = 1)
title(xlab = "Fragment length (insert size)", ylab = "Density")
addfiglab("A")
plot.new()
<- seq(50, 300, 1)
xv plot.window(xlim = range(xv), ylim = 0:1)
points(x = xv, y = matCdf(xv), col = "darkblue", lwd = 2, type = "l")
points(x = xv, y = fetCdf(xv), col = "darkorange", lwd = 2, type = "l")
addfiglab("B", units = 'nfc')
axis(side = 1)
axis(side = 2)
title(xlab = "Fragment length (insert size)",
ylab = "Cumulative distribution")
legend(x = grconvertX(0.5, from = "ndc"),
y = line2user(2, side = 1, outer = TRUE),
legend = c("Maternal", "Fetal"),
horiz = TRUE,
lwd = 2,
col = c("darkblue", "darkorange"),
xjust = 0.5,
yjust = 0.5,
xpd = NA,
bty = "n")
:= alt/(alt + ref)]
gt[ , pmar := sdep/adep]
gt[ , sratio <- function(smpNm) {
pltSratioByPmar par(mar = c(4, 4, 1, 1))
with(gt[gtCall == "AAab" & smp == smpNm], {
plot(pmar ~ sratio,
xlab = "Proportion of fragments < 140 bp",
ylab = "PMAR",
pch = 16,
cex = 0.5,
col = col2alpha('darkgray'),
bty = "n")
})with(gt[gtCall == "AAab" & smp == smpNm], {
contour(kde2d(x = sratio, y = pmar, n = 500),
nlevels = 25,
add = TRUE,
drawlabels = FALSE,
col = "darkblue")
})
}pltSratioByPmar("S1")
addfiglab("A")
pltSratioByPmar("S2")
addfiglab("B")
pltSratioByPmar("FES-0034-4")
addfiglab("C")
## Calculate call concordance between cell-free and direct for Case 3
<- gt[(smp == "FES-0034-0" & udep > 30 & gt %in% c("0/0", "0/1", "1/1")) |
c3 == "FES-0034-1" & udep > 30 & gt %in% c("0/0", "0/1", "1/1")) |
(smp & smp == "FES-0034-4")]
(use := substr(gtCall, 3, 4)]
c3[ , mrg := c("0/0", "0/1", "1/1")[match(mrg, c("aa", "ab", "bb"))]]
c3[ , mrg is.na(gtCall), mrg := gt]
c3[<- dcast(c3, varid ~ smp, value.var = "mrg")
c3 setnames(c3, c("varid", "fet", "mat", "cff"))
<- c3[ , .N, by = .(mat, fet, cff)]
c3 ## call by call matrix, cell-free calls in columns, fetal in rows
<- dcast(c3[!is.na(fet) & !is.na(cff)],
c3FetByCf ~ cff,
fet fun.aggregate = sum,
value.var = "N")
setkey(c3FetByCf, fet)
## all c3 calls
<- c3[!is.na(mat) & !is.na(fet) & !is.na(cff), ][order(mat, fet, cff)]
c3Calls kable(c3FetByCf[ , .(" " = "Fetal", " " = fet, `0/0`, `0/1`, `1/1`)],
row.names = FALSE,
label = "c3FetByCf",
format.args = list(big.mark = ",", scientific = FALSE),
booktabs = TRUE,
caption.short = 'Case 3 fetal versus cell-free genotype calls.') %>%
kable_classic() %>%
collapse_rows(columns = 1, valign = "middle") %>%
add_header_above(c(" " = 2, "Cell-free" = 3))
kable(c3Calls[ , .(Maternal = mat, Fetal = fet, `Cell-free` = cff, N)],
row.names = FALSE,
label = "c3Calls",
format.args = list(big.mark = ",", scientific = FALSE),
booktabs = TRUE,
caption.short = 'Maternal, fetal, and cell-free genotype calls.') %>%
kable_classic() %>%
collapse_rows(columns = 1:2, valign = "middle", latex_hline = "full")
pltExpMAF(500)
pltMissclass()