## ----style, echo = FALSE, results = "asis"------------------------------------ BiocStyle::markdown() ## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 5, fig.height = 4.5, dpi = 200 ) ## ----InstallGeomxTools, echo = TRUE, eval = FALSE----------------------------- # install.packages("devtools") # devtools::install_github("Nanostring-Biostats/NanoStringNCTools") # devtools::install_github("Nanostring-Biostats/GeomxTools", ref = "dev") # devtools::install_github("Nanostring-Biostats/GeoMxWorkflows", ref = "main") ## ----libs, message = FALSE, warning = FALSE, eval = TRUE---------------------- library(NanoStringNCTools) library(GeomxTools) library(GeoMxWorkflows) ## ----quickstart, message = FALSE, warning = FALSE----------------------------- # Reference the main folder 'file.path' containing the sub-folders with each # data file type: datadir <- system.file("extdata", "WTA_NGS_Example", package="GeoMxWorkflows") # to locate a specific file path replace the above line with # datadir <- file.path("~/Folder/SubFolder/DataLocation") # replace the Folder, SubFolder, DataLocation as needed # the DataLocation folder should contain a dccs, pkcs, and annotation folder # with each set of files present as needed ## ----locateFiles, message = FALSE, warning = FALSE---------------------------- # automatically list files in each directory for use DCCFiles <- dir(file.path(datadir, "dccs"), pattern = ".dcc$", full.names = TRUE, recursive = TRUE) PKCFiles <- unzip(zipfile = dir(file.path(datadir, "pkcs"), pattern = ".zip$", full.names = TRUE, recursive = TRUE)) SampleAnnotationFile <- dir(file.path(datadir, "annotation"), pattern = ".xlsx$", full.names = TRUE, recursive = TRUE) ## ----loadData, message = FALSE, warning = FALSE------------------------------- # load data demoData <- readNanoStringGeoMxSet(dccFiles = DCCFiles, pkcFiles = PKCFiles, phenoDataFile = SampleAnnotationFile, phenoDataSheet = "Template", phenoDataDccColName = "Sample_ID", protocolDataColNames = c("aoi", "roi"), experimentDataColNames = c("panel")) ## ----modules------------------------------------------------------------------ library(knitr) pkcs <- annotation(demoData) modules <- gsub(".pkc", "", pkcs) kable(data.frame(PKCs = pkcs, modules = modules)) ## ----sampleFlow, fig.width = 10, fig.height = 8, fig.wide = TRUE, message = FALSE, warning = FALSE---- library(dplyr) library(ggforce) # select the annotations we want to show, use `` to surround column names with # spaces or special symbols count_mat <- count(pData(demoData), `slide name`, class, region, segment) # simplify the slide names count_mat$`slide name` <- gsub("disease", "d", gsub("normal", "n", count_mat$`slide name`)) # gather the data and plot in order: class, slide name, region, segment test_gr <- gather_set_data(count_mat, 1:4) test_gr$x <- factor(test_gr$x, levels = c("class", "slide name", "region", "segment")) # plot Sankey ggplot(test_gr, aes(x, id = id, split = y, value = n)) + geom_parallel_sets(aes(fill = region), alpha = 0.5, axis.width = 0.1) + geom_parallel_sets_axes(axis.width = 0.2) + geom_parallel_sets_labels(color = "white", size = 5) + theme_classic(base_size = 17) + theme(legend.position = "bottom", axis.ticks.y = element_blank(), axis.line = element_blank(), axis.text.y = element_blank()) + scale_y_continuous(expand = expansion(0)) + scale_x_discrete(expand = expansion(0)) + labs(x = "", y = "") + annotate(geom = "segment", x = 4.25, xend = 4.25, y = 20, yend = 120, lwd = 2) + annotate(geom = "text", x = 4.19, y = 70, angle = 90, size = 5, hjust = 0.5, label = "100 segments") ## ----shiftCounts, eval = TRUE------------------------------------------------- # Shift counts to one demoData <- shiftCountsOne(demoData, useDALogic = TRUE) ## ----setqcflagupdated, eval = TRUE------------------------------------------- # Default QC cutoffs are commented in () adjacent to the respective parameters # study-specific values were selected after visualizing the QC results in more # detail below QC_params <- list(minSegmentReads = 1000, # Minimum number of reads (1000) percentTrimmed = 80, # Minimum % of reads trimmed (80%) percentStitched = 80, # Minimum % of reads stitched (80%) percentAligned = 75, # Minimum % of reads aligned (80%) percentSaturation = 50, # Minimum sequencing saturation (50%) minNegativeCount = 1, # Minimum negative control counts (10) maxNTCCount = 9000, # Maximum counts observed in NTC well (1000) minNuclei = 20, # Minimum # of nuclei estimated (100) minArea = 1000) # Minimum segment area (5000) demoData <- setSegmentQCFlags(demoData, qcCutoffs = QC_params) # Collate QC Results QCResults <- protocolData(demoData)[["QCFlags"]] flag_columns <- colnames(QCResults) QC_Summary <- data.frame(Pass = colSums(!QCResults[, flag_columns]), Warning = colSums(QCResults[, flag_columns])) QCResults$QCStatus <- apply(QCResults, 1L, function(x) { ifelse(sum(x) == 0L, "PASS", "WARNING") }) QC_Summary["TOTAL FLAGS", ] <- c(sum(QCResults[, "QCStatus"] == "PASS"), sum(QCResults[, "QCStatus"] == "WARNING")) ## ----qcflagHistogramsCode, eval = TRUE, warning = FALSE, message = FALSE------ library(ggplot2) col_by <- "segment" # Graphical summaries of QC statistics plot function QC_histogram <- function(assay_data = NULL, annotation = NULL, fill_by = NULL, thr = NULL, scale_trans = NULL) { plt <- ggplot(assay_data, aes_string(x = paste0("unlist(`", annotation, "`)"), fill = fill_by)) + geom_histogram(bins = 50) + geom_vline(xintercept = thr, lty = "dashed", color = "black") + theme_bw() + guides(fill = "none") + facet_wrap(as.formula(paste("~", fill_by)), nrow = 4) + labs(x = annotation, y = "Segments, #", title = annotation) if(!is.null(scale_trans)) { plt <- plt + scale_x_continuous(trans = scale_trans) } plt } ## ----plotQCHist, warning = FALSE, message = FALSE----------------------------- QC_histogram(sData(demoData), "Trimmed (%)", col_by, 80) QC_histogram(sData(demoData), "Stitched (%)", col_by, 80) QC_histogram(sData(demoData), "Aligned (%)", col_by, 75) QC_histogram(sData(demoData), "Saturated (%)", col_by, 50) + labs(title = "Sequencing Saturation (%)", x = "Sequencing Saturation (%)") QC_histogram(sData(demoData), "area", col_by, 1000, scale_trans = "log10") QC_histogram(sData(demoData), "nuclei", col_by, 20) # calculate the negative geometric means for each module negativeGeoMeans <- esBy(negativeControlSubset(demoData), GROUP = "Module", FUN = function(x) { assayDataApply(x, MARGIN = 2, FUN = ngeoMean, elt = "exprs") }) protocolData(demoData)[["NegGeoMean"]] <- negativeGeoMeans # explicitly copy the Negative geoMeans from sData to pData negCols <- paste0("NegGeoMean_", modules) pData(demoData)[, negCols] <- sData(demoData)[["NegGeoMean"]] for(ann in negCols) { plt <- QC_histogram(pData(demoData), ann, col_by, 2, scale_trans = "log10") print(plt) } # detatch neg_geomean columns ahead of aggregateCounts call pData(demoData) <- pData(demoData)[, !colnames(pData(demoData)) %in% negCols] # show all NTC values, Freq = # of Segments with a given NTC count: kable(table(NTC_Count = sData(demoData)$NTC), col.names = c("NTC Count", "# of Segments")) ## ----QCSummaryTable, results = "asis"----------------------------------------- kable(QC_Summary, caption = "QC Summary Table for each Segment") ## ----removeQCSampleProbe, eval = TRUE----------------------------------------- demoData <- demoData[, QCResults$QCStatus == "PASS"] # Subsetting our dataset has removed samples which did not pass QC dim(demoData) ## ----setbioprobeqcflag, eval = TRUE------------------------------------------ # Generally keep the qcCutoffs parameters unchanged. Set removeLocalOutliers to # FALSE if you do not want to remove local outliers demoData <- setBioProbeQCFlags(demoData, qcCutoffs = list(minProbeRatio = 0.1, percentFailGrubbs = 20), removeLocalOutliers = TRUE) ProbeQCResults <- fData(demoData)[["QCFlags"]] # Define QC table for Probe QC qc_df <- data.frame(Passed = sum(rowSums(ProbeQCResults[, -1]) == 0), Global = sum(ProbeQCResults$GlobalGrubbsOutlier), Local = sum(rowSums(ProbeQCResults[, -2:-1]) > 0 & !ProbeQCResults$GlobalGrubbsOutlier)) ## ----bioprobeQCTable, echo = FALSE, results = "asis"-------------------------- kable(qc_df, caption = "Probes flagged or passed as outliers") ## ----excludeOutlierProbes----------------------------------------------------- #Subset object to exclude all that did not pass Ratio & Global testing ProbeQCPassed <- subset(demoData, fData(demoData)[["QCFlags"]][,c("LowProbeRatio")] == FALSE & fData(demoData)[["QCFlags"]][,c("GlobalGrubbsOutlier")] == FALSE) dim(ProbeQCPassed) demoData <- ProbeQCPassed ## ----aggregateCounts, eval = TRUE--------------------------------------------- # Check how many unique targets the object has length(unique(featureData(demoData)[["TargetName"]])) # collapse to targets target_demoData <- aggregateCounts(demoData) dim(target_demoData) exprs(target_demoData)[1:5, 1:2] ## ----calculateLOQ, eval = TRUE------------------------------------------------ # Define LOQ SD threshold and minimum value cutoff <- 2 minLOQ <- 2 # Calculate LOQ per module tested LOQ <- data.frame(row.names = colnames(target_demoData)) for(module in modules) { vars <- paste0(c("NegGeoMean_", "NegGeoSD_"), module) if(all(vars[1:2] %in% colnames(pData(target_demoData)))) { LOQ[, module] <- pmax(minLOQ, pData(target_demoData)[, vars[1]] * pData(target_demoData)[, vars[2]] ^ cutoff) } } pData(target_demoData)$LOQ <- LOQ ## ----LOQMat, eval = TRUE------------------------------------------------------ LOQ_Mat <- c() for(module in modules) { ind <- fData(target_demoData)$Module == module Mat_i <- t(esApply(target_demoData[ind, ], MARGIN = 1, FUN = function(x) { x > LOQ[, module] })) LOQ_Mat <- rbind(LOQ_Mat, Mat_i) } # ensure ordering since this is stored outside of the geomxSet LOQ_Mat <- LOQ_Mat[fData(target_demoData)$TargetName, ] ## ----segDetectionBarplot------------------------------------------------------ # Save detection rate information to pheno data pData(target_demoData)$GenesDetected <- colSums(LOQ_Mat, na.rm = TRUE) pData(target_demoData)$GeneDetectionRate <- pData(target_demoData)$GenesDetected / nrow(target_demoData) # Determine detection thresholds: 1%, 5%, 10%, 15%, >15% pData(target_demoData)$DetectionThreshold <- cut(pData(target_demoData)$GeneDetectionRate, breaks = c(0, 0.01, 0.05, 0.1, 0.15, 1), labels = c("<1%", "1-5%", "5-10%", "10-15%", ">15%")) # stacked bar plot of different cut points (1%, 5%, 10%, 15%) ggplot(pData(target_demoData), aes(x = DetectionThreshold)) + geom_bar(aes(fill = region)) + geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) + theme_bw() + scale_y_continuous(expand = expansion(mult = c(0, 0.1))) + labs(x = "Gene Detection Rate", y = "Segments, #", fill = "Segment Type") ## ----segTable----------------------------------------------------------------- # cut percent genes detected at 1, 5, 10, 15 kable(table(pData(target_demoData)$DetectionThreshold, pData(target_demoData)$class)) ## ----filterSegments----------------------------------------------------------- target_demoData <- target_demoData[, pData(target_demoData)$GeneDetectionRate >= .1] dim(target_demoData) ## ----replotSankey, fig.width = 10, fig.height = 8, fig.wide = TRUE, message = FALSE, warning = FALSE---- # select the annotations we want to show, use `` to surround column names with # spaces or special symbols count_mat <- count(pData(demoData), `slide name`, class, region, segment) # simplify the slide names count_mat$`slide name` <- gsub("disease", "d", gsub("normal", "n", count_mat$`slide name`)) # gather the data and plot in order: class, slide name, region, segment test_gr <- gather_set_data(count_mat, 1:4) test_gr$x <- factor(test_gr$x, levels = c("class", "slide name", "region", "segment")) # plot Sankey ggplot(test_gr, aes(x, id = id, split = y, value = n)) + geom_parallel_sets(aes(fill = region), alpha = 0.5, axis.width = 0.1) + geom_parallel_sets_axes(axis.width = 0.2) + geom_parallel_sets_labels(color = "white", size = 5) + theme_classic(base_size = 17) + theme(legend.position = "bottom", axis.ticks.y = element_blank(), axis.line = element_blank(), axis.text.y = element_blank()) + scale_y_continuous(expand = expansion(0)) + scale_x_discrete(expand = expansion(0)) + labs(x = "", y = "") + annotate(geom = "segment", x = 4.25, xend = 4.25, y = 20, yend = 120, lwd = 2) + annotate(geom = "text", x = 4.19, y = 70, angle = 90, size = 5, hjust = 0.5, label = "100 segments") ## ----goi detection------------------------------------------------------------ library(scales) # for percent # Calculate detection rate: LOQ_Mat <- LOQ_Mat[, colnames(target_demoData)] fData(target_demoData)$DetectedSegments <- rowSums(LOQ_Mat, na.rm = TRUE) fData(target_demoData)$DetectionRate <- fData(target_demoData)$DetectedSegments / nrow(pData(target_demoData)) # Gene of interest detection table goi <- c("PDCD1", "CD274", "IFNG", "CD8A", "CD68", "EPCAM", "KRT18", "NPHS1", "NPHS2", "CALB1", "CLDN8") goi_df <- data.frame( Gene = goi, Number = fData(target_demoData)[goi, "DetectedSegments"], DetectionRate = percent(fData(target_demoData)[goi, "DetectionRate"])) ## ----tableGOI, echo = FALSE, results = "asis"--------------------------------- kable(goi_df, caption = "Detection rate for Genes of Interest", align = "c", col.names = c("Gene", "Detection, # Segments", "Detection Rate, % of Segments")) ## ----plotDetectionRate, eval = TRUE------------------------------------------- # Plot detection rate: plot_detect <- data.frame(Freq = c(1, 5, 10, 20, 30, 50)) plot_detect$Number <- unlist(lapply(c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5), function(x) {sum(fData(target_demoData)$DetectionRate >= x)})) plot_detect$Rate <- plot_detect$Number / nrow(fData(target_demoData)) rownames(plot_detect) <- plot_detect$Freq ggplot(plot_detect, aes(x = as.factor(Freq), y = Rate, fill = Rate)) + geom_bar(stat = "identity") + geom_text(aes(label = formatC(Number, format = "d", big.mark = ",")), vjust = 1.6, color = "black", size = 4) + scale_fill_gradient2(low = "orange2", mid = "lightblue", high = "dodgerblue3", midpoint = 0.65, limits = c(0,1), labels = scales::percent) + theme_bw() + scale_y_continuous(labels = scales::percent, limits = c(0,1), expand = expansion(mult = c(0, 0))) + labs(x = "% of Segments", y = "Genes Detected, % of Panel > LOQ") ## ----subsetGenes, eval = TRUE------------------------------------------------- # Subset to target genes detected in at least 10% of the samples. # Also manually include the negative control probe, for downstream use negativeProbefData <- subset(fData(target_demoData), CodeClass == "Negative") neg_probes <- unique(negativeProbefData$TargetName) target_demoData <- target_demoData[fData(target_demoData)$DetectionRate >= 0.1 | fData(target_demoData)$TargetName %in% neg_probes, ] dim(target_demoData) # retain only detected genes of interest goi <- goi[goi %in% rownames(target_demoData)] ## ---- previewNF, fig.width = 8, fig.height = 8, fig.wide = TRUE, eval = TRUE, warning = FALSE, message = FALSE---- library(reshape2) # for melt library(cowplot) # for plot_grid # Graph Q3 value vs negGeoMean of Negatives ann_of_interest <- "region" Stat_data <- data.frame(row.names = colnames(exprs(target_demoData)), Segment = colnames(exprs(target_demoData)), Annotation = pData(target_demoData)[, ann_of_interest], Q3 = unlist(apply(exprs(target_demoData), 2, quantile, 0.75, na.rm = TRUE)), NegProbe = exprs(target_demoData)[neg_probes, ]) Stat_data_m <- melt(Stat_data, measure.vars = c("Q3", "NegProbe"), variable.name = "Statistic", value.name = "Value") plt1 <- ggplot(Stat_data_m, aes(x = Value, fill = Statistic)) + geom_histogram(bins = 40) + theme_bw() + scale_x_continuous(trans = "log2") + facet_wrap(~Annotation, nrow = 1) + scale_fill_brewer(palette = 3, type = "qual") + labs(x = "Counts", y = "Segments, #") plt2 <- ggplot(Stat_data, aes(x = NegProbe, y = Q3, color = Annotation)) + geom_abline(intercept = 0, slope = 1, lty = "dashed", color = "darkgray") + geom_point() + guides(color = "none") + theme_bw() + scale_x_continuous(trans = "log2") + scale_y_continuous(trans = "log2") + theme(aspect.ratio = 1) + labs(x = "Negative Probe GeoMean, Counts", y = "Q3 Value, Counts") plt3 <- ggplot(Stat_data, aes(x = NegProbe, y = Q3 / NegProbe, color = Annotation)) + geom_hline(yintercept = 1, lty = "dashed", color = "darkgray") + geom_point() + theme_bw() + scale_x_continuous(trans = "log2") + scale_y_continuous(trans = "log2") + theme(aspect.ratio = 1) + labs(x = "Negative Probe GeoMean, Counts", y = "Q3/NegProbe Value, Counts") btm_row <- plot_grid(plt2, plt3, nrow = 1, labels = c("B", ""), rel_widths = c(0.43,0.57)) plot_grid(plt1, btm_row, ncol = 1, labels = c("A", "")) ## ----normalizeObject, eval = TRUE--------------------------------------------- # Q3 norm (75th percentile) for WTA/CTA with or without custom spike-ins target_demoData <- normalize(target_demoData , data_type = "RNA", norm_method = "quant", desiredQuantile = .75, toElt = "q_norm") # Background normalization for WTA/CTA without custom spike-in target_demoData <- normalize(target_demoData , data_type = "RNA", norm_method = "neg", fromElt = "exprs", toElt = "neg_norm") ## ----normplot, fig.small = TRUE----------------------------------------------- # visualize the first 10 segments with each normalization method boxplot(exprs(target_demoData)[,1:10], col = "#9EDAE5", main = "Raw Counts", log = "y", names = 1:10, xlab = "Segment", ylab = "Counts, Raw") boxplot(assayDataElement(target_demoData[,1:10], elt = "q_norm"), col = "#2CA02C", main = "Q3 Norm Counts", log = "y", names = 1:10, xlab = "Segment", ylab = "Counts, Q3 Normalized") boxplot(assayDataElement(target_demoData[,1:10], elt = "neg_norm"), col = "#FF7F0E", main = "Neg Norm Counts", log = "y", names = 1:10, xlab = "Segment", ylab = "Counts, Neg. Normalized") ## ----dimReduction, eval = TRUE------------------------------------------------ library(umap) library(Rtsne) # update defaults for umap to contain a stable random_state (seed) custom_umap <- umap::umap.defaults custom_umap$random_state <- 42 # run UMAP umap_out <- umap(t(log2(assayDataElement(target_demoData , elt = "q_norm"))), config = custom_umap) pData(target_demoData)[, c("UMAP1", "UMAP2")] <- umap_out$layout[, c(1,2)] ggplot(pData(target_demoData), aes(x = UMAP1, y = UMAP2, color = region, shape = class)) + geom_point(size = 3) + theme_bw() # run tSNE set.seed(42) # set the seed for tSNE as well tsne_out <- Rtsne(t(log2(assayDataElement(target_demoData , elt = "q_norm"))), perplexity = ncol(target_demoData)*.15) pData(target_demoData)[, c("tSNE1", "tSNE2")] <- tsne_out$Y[, c(1,2)] ggplot(pData(target_demoData), aes(x = tSNE1, y = tSNE2, color = region, shape = class)) + geom_point(size = 3) + theme_bw() ## ----CVheatmap, eval = TRUE, echo = TRUE, fig.width = 8, fig.height = 6.5, fig.wide = TRUE---- library(pheatmap) # for pheatmap # create a log2 transform of the data for analysis assayDataElement(object = target_demoData, elt = "log_q") <- assayDataApply(target_demoData, 2, FUN = log, base = 2, elt = "q_norm") # create CV function calc_CV <- function(x) {sd(x) / mean(x)} CV_dat <- assayDataApply(target_demoData, elt = "log_q", MARGIN = 1, calc_CV) # show the highest CD genes and their CV values sort(CV_dat, decreasing = TRUE)[1:5] # Identify genes in the top 3rd of the CV values GOI <- names(CV_dat)[CV_dat > quantile(CV_dat, 0.8)] pheatmap(assayDataElement(target_demoData[GOI, ], elt = "log_q"), scale = "row", show_rownames = FALSE, show_colnames = FALSE, border_color = NA, clustering_method = "average", clustering_distance_rows = "correlation", clustering_distance_cols = "correlation", breaks = seq(-3, 3, 0.05), color = colorRampPalette(c("purple3", "black", "yellow2"))(120), annotation_col = pData(target_demoData)[, c("class", "segment", "region")]) ## ----deNativeComplex, eval = TRUE, message = FALSE, warning = FALSE----------- # convert test variables to factors pData(target_demoData)$testRegion <- factor(pData(target_demoData)$region, c("glomerulus", "tubule")) pData(target_demoData)[["slide"]] <- factor(pData(target_demoData)[["slide name"]]) assayDataElement(object = target_demoData, elt = "log_q") <- assayDataApply(target_demoData, 2, FUN = log, base = 2, elt = "q_norm") # run LMM: # formula follows conventions defined by the lme4 package results <- c() for(status in c("DKD", "normal")) { ind <- pData(target_demoData)$class == status mixedOutmc <- mixedModelDE(target_demoData[, ind], elt = "log_q", modelFormula = ~ testRegion + (1 + testRegion | slide), groupVar = "testRegion", nCores = parallel::detectCores(), multiCore = FALSE) # format results as data.frame r_test <- do.call(rbind, mixedOutmc["lsmeans", ]) tests <- rownames(r_test) r_test <- as.data.frame(r_test) r_test$Contrast <- tests # use lapply in case you have multiple levels of your test factor to # correctly associate gene name with it's row in the results table r_test$Gene <- unlist(lapply(colnames(mixedOutmc), rep, nrow(mixedOutmc["lsmeans", ][[1]]))) r_test$Subset <- status r_test$FDR <- p.adjust(r_test$`Pr(>|t|)`, method = "fdr") r_test <- r_test[, c("Gene", "Subset", "Contrast", "Estimate", "Pr(>|t|)", "FDR")] results <- rbind(results, r_test) } ## ----DEtable, echo = TRUE, results = "asis"----------------------------------- kable(subset(results, Gene %in% goi & Subset == "normal"), digits = 3, caption = "DE results for Genes of Interest", align = "lc", row.names = FALSE) ## ----DEsimple, eval = TRUE, echo = TRUE, message = FALSE, warning = FALSE----- # convert test variables to factors pData(target_demoData)$testClass <- factor(pData(target_demoData)$class, c("normal", "DKD")) # run LMM: # formula follows conventions defined by the lme4 package results2 <- c() for(region in c("glomerulus", "tubule")) { ind <- pData(target_demoData)$region == region mixedOutmc <- mixedModelDE(target_demoData[, ind], elt = "log_q", modelFormula = ~ testClass + (1 | slide), groupVar = "testClass", nCores = parallel::detectCores(), multiCore = FALSE) # format results as data.frame r_test <- do.call(rbind, mixedOutmc["lsmeans", ]) tests <- rownames(r_test) r_test <- as.data.frame(r_test) r_test$Contrast <- tests # use lapply in case you have multiple levels of your test factor to # correctly associate gene name with it's row in the results table r_test$Gene <- unlist(lapply(colnames(mixedOutmc), rep, nrow(mixedOutmc["lsmeans", ][[1]]))) r_test$Subset <- region r_test$FDR <- p.adjust(r_test$`Pr(>|t|)`, method = "fdr") r_test <- r_test[, c("Gene", "Subset", "Contrast", "Estimate", "Pr(>|t|)", "FDR")] results2 <- rbind(results2, r_test) } ## ----DEtable2, echo = TRUE, results = "asis"---------------------------------- kable(subset(results2, Gene %in% goi & Subset == "tubule"), digits = 3, caption = "DE results for Genes of Interest", align = "lc", row.names = FALSE) ## ----glomindex, eval = TRUE, echo = FALSE------------------------------------- # since we didn't execute above, ID the glomeruli for later use ## ----volcanoPlot, fig.width = 11, fig.height = 7, fig.wide = TRUE, warning = FALSE, message = FALSE---- library(ggrepel) # Categorize Results based on P-value & FDR for plotting results$Color <- "NS or FC < 0.5" results$Color[results$`Pr(>|t|)` < 0.05] <- "P < 0.05" results$Color[results$FDR < 0.05] <- "FDR < 0.05" results$Color[results$FDR < 0.001] <- "FDR < 0.001" results$Color[abs(results$Estimate) < 0.5] <- "NS or FC < 0.5" results$Color <- factor(results$Color, levels = c("NS or FC < 0.5", "P < 0.05", "FDR < 0.05", "FDR < 0.001")) # pick top genes for either side of volcano to label # order genes for convenience: results$invert_P <- (-log10(results$`Pr(>|t|)`)) * sign(results$Estimate) top_g <- c() for(cond in c("DKD", "normal")) { ind <- results$Subset == cond top_g <- c(top_g, results[ind, 'Gene'][ order(results[ind, 'invert_P'], decreasing = TRUE)[1:15]], results[ind, 'Gene'][ order(results[ind, 'invert_P'], decreasing = FALSE)[1:15]]) } top_g <- unique(top_g) results <- results[, -1*ncol(results)] # remove invert_P from matrix # Graph results ggplot(results, aes(x = Estimate, y = -log10(`Pr(>|t|)`), color = Color, label = Gene)) + geom_vline(xintercept = c(0.5, -0.5), lty = "dashed") + geom_hline(yintercept = -log10(0.05), lty = "dashed") + geom_point() + labs(x = "Enriched in Tubules <- log2(FC) -> Enriched in Glomeruli", y = "Significance, -log10(P)", color = "Significance") + scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue", `FDR < 0.05` = "lightblue", `P < 0.05` = "orange2", `NS or FC < 0.5` = "gray"), guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + geom_text_repel(data = subset(results, Gene %in% top_g & FDR < 0.001), size = 4, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2, max.overlaps = 50) + theme_bw(base_size = 16) + theme(legend.position = "bottom") + facet_wrap(~Subset, scales = "free_y") ## ----targetTable, eval = TRUE, as.is = TRUE----------------------------------- kable(subset(results, Gene %in% c('PDHA1','ITGB1')), row.names = FALSE) ## ----targetExprs, eval = TRUE------------------------------------------------- # show expression for a single target: PDHA1 ggplot(pData(target_demoData), aes(x = region, fill = region, y = assayDataElement(target_demoData["PDHA1", ], elt = "q_norm"))) + geom_violin() + geom_jitter(width = .2) + labs(y = "PDHA1 Expression") + scale_y_continuous(trans = "log2") + facet_wrap(~class) + theme_bw() ## ----targetExprs2, fig.width = 8, fig.wide = TRUE, eval = TRUE---------------- glom <- pData(target_demoData)$region == "glomerulus" # show expression of PDHA1 vs ITGB1 ggplot(pData(target_demoData), aes(x = assayDataElement(target_demoData["PDHA1", ], elt = "q_norm"), y = assayDataElement(target_demoData["ITGB1", ], elt = "q_norm"), color = region)) + geom_vline(xintercept = max(assayDataElement(target_demoData["PDHA1", glom], elt = "q_norm")), lty = "dashed", col = "darkgray") + geom_hline(yintercept = max(assayDataElement(target_demoData["ITGB1", !glom], elt = "q_norm")), lty = "dashed", col = "darkgray") + geom_point(size = 3) + theme_bw() + scale_x_continuous(trans = "log2") + scale_y_continuous(trans = "log2") + labs(x = "PDHA1 Expression", y = "ITGB1 Expression") + facet_wrap(~class) ## ----heatmap, eval = TRUE, fig.width = 8, fig.height = 6.5, fig.wide = TRUE---- # select top significant genes based on significance, plot with pheatmap GOI <- unique(subset(results, `FDR` < 0.001)$Gene) pheatmap(log2(assayDataElement(target_demoData[GOI, ], elt = "q_norm")), scale = "row", show_rownames = FALSE, show_colnames = FALSE, border_color = NA, clustering_method = "average", clustering_distance_rows = "correlation", clustering_distance_cols = "correlation", cutree_cols = 2, cutree_rows = 2, breaks = seq(-3, 3, 0.05), color = colorRampPalette(c("purple3", "black", "yellow2"))(120), annotation_col = pData(target_demoData)[, c("region", "class")]) ## ----maPlot, fig.width = 8, fig.height = 12, fig.wide = TRUE, warning = FALSE, message = FALSE---- results$MeanExp <- rowMeans(assayDataElement(target_demoData, elt = "q_norm")) top_g2 <- results$Gene[results$Gene %in% top_g & results$FDR < 0.001 & abs(results$Estimate) > .5 & results$MeanExp > quantile(results$MeanExp, 0.9)] ggplot(subset(results, !Gene %in% neg_probes), aes(x = MeanExp, y = Estimate, size = -log10(`Pr(>|t|)`), color = Color, label = Gene)) + geom_hline(yintercept = c(0.5, -0.5), lty = "dashed") + scale_x_continuous(trans = "log2") + geom_point(alpha = 0.5) + labs(y = "Enriched in Glomeruli <- log2(FC) -> Enriched in Tubules", x = "Mean Expression", color = "Significance") + scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue", `FDR < 0.05` = "lightblue", `P < 0.05` = "orange2", `NS or FC < 0.5` = "gray")) + geom_text_repel(data = subset(results, Gene %in% top_g2), size = 4, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) + theme_bw(base_size = 16) + facet_wrap(~Subset, nrow = 2, ncol = 1) ## ----sessInfo----------------------------------------------------------------- sessionInfo()