Quality Control

Author

Harvard Chan Bioinformatics Core

Published

July 3, 2025

Template developed with materials from https://hbctraining.github.io/main/.

Code

# This set up the working directory to this file so all files can be found
library(rstudioapi)
setwd(fs::path_dir(getSourceEditorContext()$path))
# NOTE: This code will check version, this is our recommendation, it may work
# .      other versions
stopifnot(R.version$major >= 4) # requires R4
if (compareVersion(R.version$minor, "3.1") < 0) warning("We recommend >= R4.3.1")
stopifnot(compareVersion(as.character(BiocManager::version()), "3.18") >= 0)

This code is in this revision.

Code

# 1. set up factor_of_interest parameter from parameter above or manually
#    this is used to color plots, it needs to be part of the metadata
factor_of_interest <- params$factor_of_interest
genome <- params$genome
single_end <- params$single_end
# 2. Set input files in this file
source(params$params_file)
# 3. If you set up this file, project information will be printed below and
# .   it can be reused for other Rmd files.
source(params$project_file)
# 4. Load custom functions to load data from coldata/metrics/counts
source(params$functions_file)

1 Overview

Project: name_XXXXX
PI: person name
Analyst: person in the core
Experiment: short description

Code

library(tidyverse)
library(janitor)
library(knitr)
library(rtracklayer)
library(DESeq2)
library(DEGreport)
library(ggrepel)
# library(RColorBrewer)
library(DT)
library(pheatmap)
library(RColorBrewer)
library(ggprism)
library(grafify)
ggplot2::theme_set(theme_prism(base_size = 12))
# https://grafify-vignettes.netlify.app/colour_palettes.html
# NOTE change colors here if you wish
scale_colour_discrete <- function(...) {
  scale_colour_manual(...,
    values = as.vector(grafify:::graf_palettes[["kelly"]])
  )
}
scale_fill_discrete <- function(...) {
  scale_fill_manual(...,
    values = as.vector(grafify:::graf_palettes[["kelly"]])
  )
}

opts_chunk[["set"]](
  cache = FALSE,
  cache.lazy = FALSE,
  dev = c("png", "pdf"),
  error = TRUE,
  highlight = TRUE,
  message = FALSE,
  prompt = FALSE,
  tidy = FALSE,
  warning = FALSE,
  fig.height = 4)

Code

sanitize_datatable <- function(df, ...) {
  # remove dashes which cause wrapping
  DT::datatable(df, ...,
    rownames = gsub("-", "_", rownames(df)),
    colnames = gsub("-", "_", colnames(df))
  )
}

2 Samples and metadata

Code

# This code will load from bcbio or nf-core folder
# TODO:  make sure to set numerator and denominator
coldata <- load_coldata(coldata_fn)
# Change this line to change the levels to the desired order.
# It will affect downstream colors in plots.
coldata[[factor_of_interest]] <- as.factor(coldata[[factor_of_interest]])
coldata$sample <- row.names(coldata)

counts <- load_counts(counts_fn)
counts <- counts[, colnames(counts) %in% coldata$sample]

metrics <- load_metrics(
  se_object, multiqc_data_dir,
  gtf_fn, counts, single_end
) %>%
  left_join(coldata, by = c("sample")) %>%
  as.data.frame()
metrics <- subset(metrics, metrics$sample %in% coldata$sample)
# TODO: change order as needed
order <- unique(metrics[["sample"]])
rownames(metrics) <- metrics$sample
# if the names don't match in order or string check files names and coldata information
counts <- counts[, rownames(metrics)]
coldata <- coldata[rownames(metrics), ]
stopifnot(all(names(counts) == rownames(metrics)))

Code

meta_df <- coldata
ggplot(meta_df, aes(.data[[factor_of_interest]],
  fill = .data[[factor_of_interest]]
)) +
  geom_bar() +
  ylab("") +
  xlab("") +
  ylab("# of samples") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5),
    legend.position = "none"
  )

Code

meta_sm <- meta_df %>%
  as.data.frame()

meta_sm %>% sanitize_datatable()

3 Read metrics

Here, we want to see consistency and a minimum of 20 million reads (the grey line).

Code

metrics %>%
  ggplot(aes(
    x = factor(sample, level = order),
    y = total_reads, # if total reads are not already in millions, divide by 10000000 here
    fill = .data[[factor_of_interest]]
  )) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_y_continuous(name = "Millions of reads") +
  scale_x_discrete(limits = rev) +
  xlab("") +
  ggtitle("Total reads") +
  geom_hline(yintercept = 20, color = "grey", linewidth = 2)

Code

metrics %>%
  ggplot(aes(
    x = .data[[factor_of_interest]],
    y = total_reads,
    color = .data[[factor_of_interest]]
  )) +
  geom_point(alpha = 0.5, size = 4) +
  coord_flip() +
  scale_y_continuous(name = "million reads") +
  ggtitle("Total reads")

Code

# get min percent mapped reads for reference
min_pct_mapped <- round(min(metrics$mapped_reads / metrics$total_reads) * 100, 1)
max_pct_mapped <- round(max(metrics$mapped_reads / metrics$total_reads) * 100, 1)

The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping (the grey line). These samples have mapping rates: 97.3% - 98.1 %.

Code

metrics$mapped_reads_pct <- round(metrics$mapped_reads / metrics$total_reads * 100, 1)
metrics %>%
  ggplot(aes(
    x = factor(sample, level = order),
    y = mapped_reads_pct,
    color = .data[[factor_of_interest]]
  )) +
  geom_point(alpha = 0.5, size = 4) +
  coord_flip() +
  ylab("Mapped Reads %") +
  scale_x_discrete(limits = rev) +
  ylim(0, 100) +
  ggtitle("Mapping rate") +
  xlab("") +
  geom_hline(yintercept = 70, color = "grey", linewidth = 2)

The number of genes represented in every sample is expected to be consistent and over 20K (grey line).

Code

genes_detected <- colSums(counts > 0) %>% enframe()
sample_names <- metrics[, c("sample"), drop = F]
genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample"))
genes_detected <- genes_detected %>% group_by(name)
genes_detected <- summarise(genes_detected,
  n_genes = max(value)
)

metrics <- metrics %>%
  left_join(genes_detected, by = c("sample" = "name"))

Code

ggplot(metrics, aes(
  x = factor(sample, level = order),
  y = n_genes, fill = .data[[factor_of_interest]]
)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  ggtitle("Number of genes") +
  ylab("Number of genes") +
  xlab("") +
  geom_hline(yintercept = 20000, color = "grey", linewidth = 2)

Code

metrics %>%
  ggplot(aes(
    x = .data[[factor_of_interest]],
    y = n_genes,
    color = .data[[factor_of_interest]]
  )) +
  geom_point(alpha = 0.5, size = 4) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(name = "million reads") +
  xlab("") +
  ggtitle("Number of Genes")

This plot shows how complex the samples are. We expect samples with more reads to detect more genes.

Code

metrics %>%
  ggplot(aes(
    x = total_reads / 1000000,
    y = n_genes,
    color = .data[[factor_of_interest]]
  )) +
  geom_point(alpha = 0.5, size = 4) +
  scale_x_log10() +
  ggtitle("Gene saturation") +
  ylab("Number of genes") +
  xlab("Millions of reads")

Here we are looking for consistency, and exonic mapping rates around or above 70% (grey line).

Code

metrics %>%
  ggplot(aes(
    x = factor(sample, level = order),
    y = exonic_rate * 100,
    color = .data[[factor_of_interest]]
  )) +
  geom_point(alpha = 0.5, size = 4) +
  ylab("Exonic rate %") +
  ggtitle("Exonic mapping rate") +
  scale_x_discrete(limits = rev) +
  coord_flip() +
  xlab("") +
  ylim(c(0, 100)) +
  geom_hline(yintercept = 70, color = "grey", linewidth = 2)

Here, we expect a low intronic mapping rate (≤ 15% - 20%). The grey line indicates 20%.

Code

metrics %>%
  ggplot(aes(
    x = factor(sample, level = order),
    y = intronic_rate * 100,
    color = .data[[factor_of_interest]]
  )) +
  geom_point(alpha = 0.5, size = 4) +
  ylab("Intronic rate %") +
  ggtitle("Intronic mapping rate") +
  scale_x_discrete(limits = rev) +
  coord_flip() +
  xlab("") +
  ylim(c(0, 100)) +
  geom_hline(yintercept = 20, color = "grey", linewidth = 2)

Here, we expect a low intergenic mapping rate, which is true for all samples. The grey line indicates 15%

Code

metrics %>%
  ggplot(aes(
    x = factor(sample, level = order),
    y = intergenic_rate * 100,
    color = .data[[factor_of_interest]]
  )) +
  geom_point(alpha = 0.5, size = 4) +
  ylab("Intergenic rate %") +
  ggtitle("Intergenic mapping rate") +
  coord_flip() +
  xlab("") +
  scale_x_discrete(limits = rev) +
  ylim(c(0, 100)) +
  geom_hline(yintercept = 15, color = "grey", linewidth = 2)

Samples should have a ribosomal RNA (rRNA) “contamination” rate below 10% (the grey line).

Code

rrna_ylim <- max(round(metrics$r_and_t_rna_rate * 100, 2)) + 10
metrics %>%
  ggplot(aes(
    x = factor(sample, level = order),
    y = r_and_t_rna_rate * 100,
    color = .data[[factor_of_interest]]
  )) +
  geom_point(alpha = 0.5, size = 4) +
  ylab("tRNA/rRNA rate, %") +
  ylim(0, rrna_ylim) +
  ggtitle("tRNA/rRNA mapping rate") +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  ylim(c(0, 100)) +
  xlab("") +
  geom_hline(yintercept = 10, color = "grey", linewidth = 2)

There should be little bias, i.e. the values should be close to 1, or at least consistent among samples

Code

metrics %>%
  ggplot(aes(
    x = factor(sample, level = order),
    y = x5_3_bias,
    color = .data[[factor_of_interest]]
  )) +
  geom_point(alpha = 0.5, size = 4) +
  ggtitle("5'-3' bias") +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  ylim(c(0.5, 1.5)) +
  xlab("") +
  ylab("5'-3' bias") +
  geom_hline(yintercept = 1, color = "grey", linewidth = 2)

We expect consistency in the box plots here between the samples, i.e. the distribution of counts across the genes is similar

Code

metrics_small <- metrics %>% dplyr::select(sample, .data[[factor_of_interest]])
metrics_small <- left_join(sample_names, metrics_small)

counts_lng <-
  counts %>%
  as_tibble() %>%
  filter(rowSums(.) != 0) %>%
  gather(name, counts)

counts_lng <- left_join(counts_lng, metrics_small, by = c("name" = "sample"))

ggplot(counts_lng, aes(factor(name, level = order),
  log2(counts + 1),
  fill = .data[[factor_of_interest]]
)) +
  geom_boxplot() +
  scale_x_discrete(limits = rev) +
  coord_flip() +
  xlab("") +
  ggtitle("Counts per gene, all non-zero genes")

4 Sample similarity analysis

In this section, we look at how well the different groups in the dataset cluster with each other. Samples from the same group should ideally be clustering together. We use Principal Component Analysis (PCA).

4.1 Principal component analysis (PCA)

Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells).

Code

vst <- vst(counts)

coldat_for_pca <- as.data.frame(metrics)
rownames(coldat_for_pca) <- coldat_for_pca$sample
coldat_for_pca <- coldat_for_pca[colnames(counts), ]
pca1 <- degPCA(vst, coldat_for_pca,
  condition = factor_of_interest, data = T
)[["plot"]]
pca2 <- degPCA(vst, coldat_for_pca,
  condition = factor_of_interest, data = T, pc1 = "PC3", pc2 = "PC4"
)[["plot"]]



pca1 + scale_color_grafify(palette = "kelly")

Code

pca2 + scale_color_grafify(palette = "kelly")

4.2 Hierarchical clustering

Inter-correlation analysis (ICA) is another way to look at how well samples cluster by plotting the correlation between the expression profiles of the samples.

Code

vst_cor <- cor(vst)

colma <- coldata %>% as.data.frame()
rownames(colma) <- colma$sample
colma <- colma[rownames(vst_cor), ]
colma <- colma %>% dplyr::select(.data[[factor_of_interest]])
anno_colors <- lapply(colnames(colma), function(c) {
  l.col <- grafify:::graf_palettes[["kelly"]][1:length(unique(colma[[c]]))]
  names(l.col) <- levels(colma[[c]])
  l.col
})
names(anno_colors) <- colnames(colma)

p <- pheatmap(vst_cor,
  annotation = colma,
  annotation_colors = anno_colors,
  show_rownames = T,
  show_colnames = T,
  color = colorRampPalette(brewer.pal(11, "RdBu"))(15)
)
p

5 Covariates analysis

When there are multiple factors that can influence the results of a given experiment, it is useful to assess which of them is responsible for the most variance as determined by PCA. This method adapts the method described by Daily et al. for which they integrated a method to correlate covariates with principal components values to determine the importance of each factor.

Code

## Remove non-useful columns output by nf-core
coldat_2 <- data.frame(coldat_for_pca[, !(colnames(coldat_for_pca) %in% c("fastq_1", "fastq_2", "salmon_library_types", "salmon_compatible_fragment_ratio", "samtools_reads_mapped_percent", "samtools_reads_properly_paired_percent", "samtools_mapped_passed_pct", "strandedness", "qualimap_5_3_bias"))])

# Remove missing data
coldat_2 <- na.omit(coldat_2)
degCovariates(vst, metadata = coldat_2)

6 Conclusions

7 Methods

RNA-seq counts were generated by the nf-core rnaseq pipeline [version] using Salmon (Patro et al. 2017). Downstream analyses were performed using R version 4.5.1 (2025-06-13). Counts were imported into R using DESeq2 version 1.46.0 (Love, Huber, and Anders 2014). Gene annotations were obtained from Ensembl. Plots were generated by ggplot2 (Wickham 2016). Heatmaps were generated by pheatmap (Kolde 2019).

7.1 R package references

Code

citation("DESeq2")

To cite package ‘DESeq2’ in publications use:

Love, M.I., Huber, W., Anders, S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2 Genome Biology 15(12):550 (2014)

A BibTeX entry for LaTeX users is

@Article{, title = {Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2}, author = {Michael I. Love and Wolfgang Huber and Simon Anders}, year = {2014}, journal = {Genome Biology}, doi = {10.1186/s13059-014-0550-8}, volume = {15}, issue = {12}, pages = {550}, }

Code

citation("ggplot2")

To cite ggplot2 in publications, please use

H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2016.

A BibTeX entry for LaTeX users is

@Book{, author = {Hadley Wickham}, title = {ggplot2: Elegant Graphics for Data Analysis}, publisher = {Springer-Verlag New York}, year = {2016}, isbn = {978-3-319-24277-4}, url = {https://ggplot2.tidyverse.org}, }

Code

citation("pheatmap")

To cite package ‘pheatmap’ in publications use:

Kolde R (2025). pheatmap: Pretty Heatmaps. doi:10.32614/CRAN.package.pheatmap https://doi.org/10.32614/CRAN.package.pheatmap, R package version 1.0.13, https://CRAN.R-project.org/package=pheatmap.

A BibTeX entry for LaTeX users is

@Manual{, title = {pheatmap: Pretty Heatmaps}, author = {Raivo Kolde}, year = {2025}, note = {R package version 1.0.13}, url = {https://CRAN.R-project.org/package=pheatmap}, doi = {10.32614/CRAN.package.pheatmap}, }

7.2 R session

List and version of tools used for the QC report generation.

Code

sessionInfo()

R version 4.5.1 (2025-06-13)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.5 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: UTC
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] grafify_5.0.0.1             ggprism_1.0.6              
 [3] RColorBrewer_1.1-3          pheatmap_1.0.13            
 [5] DT_0.33                     ggrepel_0.9.6              
 [7] DEGreport_1.42.0            DESeq2_1.46.0              
 [9] rtracklayer_1.66.0          knitr_1.50                 
[11] janitor_2.2.1               SummarizedExperiment_1.36.0
[13] Biobase_2.66.0              GenomicRanges_1.58.0       
[15] GenomeInfoDb_1.42.3         IRanges_2.40.1             
[17] S4Vectors_0.44.0            BiocGenerics_0.52.0        
[19] MatrixGenerics_1.18.1       matrixStats_1.5.0          
[21] GenomeInfoDbData_1.2.13     lubridate_1.9.4            
[23] forcats_1.0.0               stringr_1.5.1              
[25] dplyr_1.1.4                 purrr_1.0.4                
[27] readr_2.1.5                 tidyr_1.3.1                
[29] tibble_3.3.0                ggplot2_3.5.2              
[31] tidyverse_2.0.0            

loaded via a namespace (and not attached):
  [1] ggdendro_0.2.0              rstudioapi_0.17.1          
  [3] jsonlite_2.0.0              shape_1.4.6.1              
  [5] magrittr_2.0.3              estimability_1.5.1         
  [7] nloptr_2.2.1                farver_2.1.2               
  [9] rmarkdown_2.29              GlobalOptions_0.1.2        
 [11] BiocIO_1.16.0               zlibbioc_1.52.0            
 [13] vctrs_0.6.5                 minqa_1.2.8                
 [15] Rsamtools_2.22.0            RCurl_1.98-1.17            
 [17] base64enc_0.1-3             htmltools_0.5.8.1          
 [19] S4Arrays_1.6.0              curl_6.4.0                 
 [21] broom_1.0.8                 SparseArray_1.6.2          
 [23] Formula_1.2-5               sass_0.4.10                
 [25] bslib_0.9.0                 htmlwidgets_1.6.4          
 [27] plyr_1.8.9                  cachem_1.1.0               
 [29] emmeans_1.11.0              GenomicAlignments_1.42.0   
 [31] lifecycle_1.0.4             iterators_1.0.14           
 [33] pkgconfig_2.0.3             Matrix_1.7-3               
 [35] R6_2.6.1                    fastmap_1.2.0              
 [37] rbibutils_2.3               snakecase_0.11.1           
 [39] clue_0.3-66                 numDeriv_2016.8-1.1        
 [41] digest_0.6.37               colorspace_2.1-1           
 [43] reshape_0.8.9               patchwork_1.3.0            
 [45] crosstalk_1.2.1             Hmisc_5.2-3                
 [47] labeling_0.4.3              timechange_0.3.0           
 [49] mgcv_1.9-3                  httr_1.4.7                 
 [51] abind_1.4-8                 compiler_4.5.1             
 [53] bit64_4.6.0-1               withr_3.0.2                
 [55] doParallel_1.0.17           htmlTable_2.4.3            
 [57] ConsensusClusterPlus_1.70.0 backports_1.5.0            
 [59] BiocParallel_1.40.2         carData_3.0-5              
 [61] psych_2.5.3                 MASS_7.3-65                
 [63] DelayedArray_0.32.0         rjson_0.2.23               
 [65] tools_4.5.1                 foreign_0.8-90             
 [67] nnet_7.3-20                 glue_1.8.0                 
 [69] restfulr_0.0.15             nlme_3.1-168               
 [71] grid_4.5.1                  checkmate_2.3.2            
 [73] cluster_2.1.8.1             generics_0.1.4             
 [75] gtable_0.3.6                tzdb_0.5.0                 
 [77] data.table_1.17.0           hms_1.1.3                  
 [79] car_3.1-3                   XVector_0.46.0             
 [81] foreach_1.5.2               pillar_1.10.2              
 [83] vroom_1.6.5                 limma_3.62.2               
 [85] splines_4.5.1               logging_0.10-108           
 [87] circlize_0.4.16             lattice_0.22-7             
 [89] bit_4.6.0                   tidyselect_1.2.1           
 [91] ComplexHeatmap_2.22.0       locfit_1.5-9.12            
 [93] Biostrings_2.74.1           reformulas_0.4.0           
 [95] gridExtra_2.3               edgeR_4.4.2                
 [97] xfun_0.52                   statmod_1.5.0              
 [99] stringi_1.8.7               UCSC.utils_1.2.0           
[101] boot_1.3-31                 yaml_2.3.10                
[103] evaluate_1.0.3              codetools_0.2-20           
[105] cli_3.6.5                   rpart_4.1.24               
[107] Rdpack_2.6.4                xtable_1.8-4               
[109] jquerylib_0.1.4             Rcpp_1.0.14                
[111] png_0.1-8                   XML_3.99-0.18              
[113] parallel_4.5.1              bitops_1.0-9               
[115] lme4_1.1-37                 mvtnorm_1.3-3              
[117] lmerTest_3.1-3              scales_1.4.0               
[119] crayon_1.5.3                GetoptLong_1.0.5           
[121] rlang_1.1.6                 cowplot_1.1.3              
[123] mnormt_2.1.1