Differential Expression

Author

Harvard Chan Bioinformatics Core

Published

July 3, 2025

Template developed with materials from https://hbctraining.github.io/main/.

Code

# NOTE: This code will check version, this is our recommendation, it may work on other versions
# Requires R version 4
stopifnot(R.version$major >= 4)
# If the minor version number is less than .3.1, send a warning that we recommend R version 4.3.1 or higher
if (compareVersion(R.version$minor, "3.1") < 0) warning("We recommend >= R4.3.1")
# Requires BiocManager version 3.18 or later
stopifnot(compareVersion(as.character(BiocManager::version()), "3.18") >= 0)

This code is in this revision.

Code

library(tidyverse)
# 1. Set up input files in this R file (params_de.R)
source(params$params_file)
# 2. Set up project file (already done from QC probably)
source(params$project_file)
# 3. Load custom functions to load data from coldata/metrics/counts
map(list.files(params$functions_file, pattern = "*.R$", full.names = T), source) %>% invisible()

# Assign the genome parameter to the object genome
# IMPORTANT set these values if you are not using the parameters in the header (lines 23-44)
genome <- params$genome
# If the genome name is matching to human or mouse, use their respective annotation, otherwise give a warning
if (grepl("hg|human", genome, ignore.case = TRUE)) {
  library(org.Hs.eg.db)
  ann.org <- org.Hs.eg.db
} else if (grepl("mm|mouse", genome, ignore.case = TRUE)) {
  library(org.Mm.eg.db)
  ann.org <- org.Mm.eg.db
} else {
  warning("If you are working on an organism other than human or mouse: Bioconductor packages are also available for a number of other organisms. This code should work for any of the organisms with an eg.db package listed at https://bioconductor.org/packages/release/BiocViews.html#___Organism -- please search for your organism and assign ann.org manually.")
}

# Assigns the column object from the parameters in the YAML header to the column object
column <- params$column
# Assigns the contrasts object from the parameters in the YAML header to the contrasts object
contrasts <- params$contrasts
# Assigns the subset_column object from the parameters in the YAML header to the subset_column object
subset_column <- params$subset_column
# Assigns the subset_value object from the parameters in the YAML header to the subset_value object
subset_value <- params$subset_value
# Assigns the ruv boolean object from the parameters in the YAML header to the run_ruv object
run_ruv <- params$ruv
# Assigns the combatseq boolean object from the parameters in the YAML header to the run_combatseq object
run_combatseq <- params$combatseq
# If the run_ruv object and/or run_combatseq is TRUE, set the run_rmv boolean to TRUE, otherwise set run_rmv boolean to FALSE
run_rmv <- run_ruv | run_combatseq
# Assign the value of column to factor_of_interest, this will allow you to change factor_of_interest throughout the Rmd if needed, while maintaining the value of column
factor_of_interest <- column

Code

# Load required libraries
library(knitr)
library(tidyverse)
library(glue)
library(janitor)
library(purrr)
library(ggpubr)
library(rtracklayer)
library(DESeq2)
library(vegan)
library(EnhancedVolcano)
library(DEGreport)
library(pheatmap)
library(msigdbr)
library(fgsea)
library(htmltools)
library(ggforce)
library(RColorBrewer)
library(ggprism)
library(ggprism)
library(grafify)
library(viridis)
library(ashr)

# Overwrite the default filter function with dplyr's filter function
filter <- dplyr::filter
# Overwrite the default select function with dplyr's select function
select <- dplyr::select

# Setting the base theme that ggplot2 will use to theme_prism with the standard text size being 12
ggplot2::theme_set(theme_prism(base_size = 12))
# Setting scale color palette to the palette "kelly" from grafify
# You can select different palettes from here:
# https://grafify-vignettes.netlify.app/colour_palettes.html
scale_colour_discrete <- function(...) {
  scale_colour_manual(...,
    values = as.vector(grafify:::graf_palettes[["kelly"]])
  )
}
# Setting scale fill palette to the palette "kelly" from grafify
# You can select different palettes from here:
# https://grafify-vignettes.netlify.app/colour_palettes.html
scale_fill_discrete <- function(...) {
  scale_fill_manual(...,
    values = as.vector(grafify:::graf_palettes[["kelly"]])
  )
}
# Setting color to the palette "kelly" from grafify
# You can select diffferent palettes from here:
# https://grafify-vignettes.netlify.app/colour_palettes.html
colors <- as.vector(grafify:::graf_palettes[["kelly"]])

# Setting options for Rmd code chunks
opts_chunk[["set"]](
  cache = FALSE, # When the cache is turned on, knitr will skip the execution of this code chunk if it has been executed before and nothing in the code chunk has changed since then. For a cached code chunk, its output and objects will be automatically loaded from the previous run, as if the chunk were executed again.
  cache.lazy = FALSE, # Whether to only load cached objects when needed
  dev = c("png", "pdf"), # Specifies the graphical devices to use figures
  error = TRUE, # Whether to stop rendering on an error
  highlight = TRUE, # Whether to enable syntax highlighting for the code in the chunk
  message = FALSE, # Whether to preserve messages emitted by message() in the final output
  prompt = FALSE, # Whether an R prompt (>) is displayed at the beginning of each line of R input code in rendered documents
  tidy = FALSE, # Whether or not R code in a chunk is formatted automatically before rendering, similar to the R package, styler
  warning = FALSE, # Whether to preserve warnings in the output
  echo = TRUE, # Whether to include R source code in the final knitted document
  fig.height = 4) # Sets the default height of plot figures generated in inches

# Set seed for reproducibility
set.seed(1234567890L)

Code

# Create a function to clean up data frames
sanitize_datatable <- function(df, ...) {
  # Remove dashes from row names and column names which cause wrapping
  DT::datatable(df, ...,
    rownames = gsub("-", "_", rownames(df)),
    colnames = gsub("-", "_", colnames(df)),
    options = list(scrollX = TRUE, autoWidth = TRUE),
    class = "stripe hover"
  )
}

Code

# This code will load from nf-core folder
# NOTE make sure to set numerator and denominator
# Use the load_coldata function from the sourced load_data.R within the 00_params directory, to subset the colodata_fn dataframe that was pointed to when sourcing params.R within the 00_params directory
# The factors for this subsetting were set in thr params section of the YAML code chunk.
# The default subsetting is none since subset_column and subset_value are both set to null initially
# The column object here is given to ensure that the column of interest exists in the loaded dataframe
coldata <- load_coldata(
  coldata_fn, column,
  subset_column, subset_value
)
# Set the row names from the coldata dataframe to be a column called sample
# This column should already exist from the Nextflow run
coldata$sample <- row.names(coldata)

# Use the load_counts function from the sourced load_data.R within the 00_params directory, to load the counts data from the dataframe pointed to when sourcing params.R within the 00_params directory
counts <- load_counts(counts_fn)
# If subsetting parameters were provided, this will subset the counts data to match the samples in the coldata dataframe
counts <- counts[, colnames(counts) %in% coldata$sample]

# Use the load_metrics function from the sourced load_data.R within the 00_params directory, to load the metrics from the multiqc, along with the values of se_object and gtf_fn defined in the sourced params.R within the 00_params directory and also the recently created counts data, which is used to estimate the tRNA/rRNA rate (r_and_t_rna_rate).
metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>%
  left_join(coldata, by = c("sample")) %>% # Join the coldata table to the end of object produced from the load_metrics function
  as.data.frame() # Format it as a dataframe
# Assign the sample names to be the row names in the metrics object
rownames(metrics) <- metrics$sample
# If subsetting parameters were provided, this will subset the metrics data to match the samples in the coldata dataframe
metrics <- subset(metrics, metrics$sample %in% coldata$sample)

# If the sample names in the counts (column) don't match in order or string of the metrics file, reorder them in the counts object
counts <- counts[, rownames(metrics)]
# If the sample names in the coldata object (rows) don't match in order or string of the metrics file, reorder them in the counts object
coldata <- coldata[rownames(metrics), ]

# This will relevel the first contrast in your contrasts list with with the "control_name" as the baseline value defined in the params section of the YAML
coldata[[contrasts[[1]][1]]] <- relevel(as.factor(coldata[[contrasts[[1]][1]]]), contrasts[[1]][3])
# Stop if the column names in counts does not match and in the same order as the row names in metrics
stopifnot(all(colnames(counts) == rownames(metrics)))

1 Overview

Project: name_XXXXX
PI: person name
Analyst: person in the core
Experiment: short description
Aim: short description

Code

# Query AnnotationHub and assign it to ah object
ah <- AnnotationHub::AnnotationHub()

# Retrieve the annotations from Annotation Hub dependent on your reference genome
if (grepl("hg|human", genome, ignore.case = TRUE)) { # If the reference genome is human
  ens <- AnnotationHub::query(ah, c("Homo sapiens", "EnsDb")) # Retrieve the recent human annotations
  ens <- ens[["AH119325"]] # Assign a human annotation, may need to change
} else if (grepl("mm|mouse", genome, ignore.case = TRUE)) { # If the reference genome is mouse
  ens <- AnnotationHub::query(ah, c("Mus musculus", "EnsDb")) # Retrieve the recent mouse annotations
  ens <- ens[["AH116340"]] # Assign a mouse annotation, may need to change
} else { # If the reference genome is not human or mouse, prin this warning
  cat("Warning: Please, get a table that contains gene_id, gene_name and entrez to continue using this template.")
}

# Create a gene-level dataframe
# NOTE: Change this code if you are not using AnnotationHub
#       The rdata is a table with gene_id, gene_name and entrez columns
rdata <- ensembldb::genes(ens, return.type = "data.frame") %>% # Extract ens annotation as a dataframe
  dplyr::select(gene_id, gene_name,
    entrez = entrezid,
    gene_biotype, description
  ) %>% # Retrieve certain columns and rename the entrez column
  dplyr::filter(gene_id %in% rownames(counts)) %>% # Retain only gene annotations in the counts matrix
  dplyr::distinct(gene_id, .keep_all = TRUE) # Retain only unique gene_ids and also retain all of the other columns with .keep_all = TRUE

# Keep one entrez Id per gene
rdata$entrez <- purrr::map(rdata$entrez, 1) %>% unlist()
# Filter not needed genes
# rdata <- rdata %>% filter(!grepl("ncRNA", gene_biotype)) %>% # Remove non-coding RNAs
#   filter(!grepl('Mir', gene_name)) %>% # Remove miRNAs
#   filter(!grepl('Snor', gene_name)) # Remove snoRNAs
# counts <- counts[rdata$gene_id,] # Filter the counts matrix after removing these not needed genes

2 Set up

We recommend avoiding manual filtering prior to running DESeq2, as the software performs internal filtering as part of its pipeline. However, pre-filtering may be appropriate in certain contexts to improve computational efficiency or address data imbalances, such as:

A high proportion of dropouts (zero counts), where filtering can reduce computational burden
A large number of samples, where pre-filtering low-expressed features can improve performance
Strongly unbalanced group sizes, where group-specific filtering strategies may help mitigate bias or improve sensitivity

Code

# Establish a null DDS object for DESeq2
dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1)

# Perform a variance-stabilized transformation and assign the DESeqTransform object to vsd_before
vsd_before <- vst(dds_to_use)
# Extract the variance-stabilized transformed (normalized) counts and assign them to norm_matrix
norm_matrix <- assay(vsd_before)

3 PCA and group level variance.

Principal Component Analysis (PCA) is a dimensionality reduction technique that transforms high-dimensional data into a smaller set of uncorrelated variables called principal components. In gene expression analysis, PCA reduces thousands of gene expression measurements into a few components that capture the most variance across samples. This helps visualize overall patterns, identify outliers, and detect batch effects or biological groupings.

Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm.

It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.

This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.

Code

# Run PCA on the normalized counts
pca <- degPCA(norm_matrix,
  metadata = metrics, # Using the metrics object as our metadata
  condition = factor_of_interest, # Specifying the condition as our factor_of_interest from the above code chunk
  name = "sample", # Column from metadata to use to label the points in the PCA plot
  data = TRUE # Include the data
)

pca$plot + # Create plot
  ggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) + # Add title to plot
  theme(plot.title = element_text(hjust = 0.5)) + # Center title on plot
  geom_mark_ellipse(aes(color = .data[[column]])) + # Create ellipses on plot
  scale_color_grafify("kelly") # Set color parameter for plot

3.1 Analysis of the variance by group

Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. Source

Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the betadisper() function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the anova() function and the second from the permutest() function. We also get pairwise p-values for every group-group comparison.

This analysis allows us to confirm that the variance is the same in all groups (non-significant p-value), so that the DE analysis will be accurate. Dispersion estimates are a key part of the DESeq2 analysis. DESeq2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per-gene dispersions to match this distribution. If one group has higher variance than all others, this will affect the dispersion estimates.

Code

# Use vegdist() to compute pair-wise dissimilarity indices between samples from the normalized count matrix
# Note, we need to transpose (use t())) the counts matrix for vegdist() to compute dissimilarity indices between the samples
vare.disa <- vegdist(t(assay(vsd_before)))

# Evaluate the homogeneity of dispersion within the groups
mod <- betadisper(vare.disa, group = metrics[[factor_of_interest]])
# Test the homogeniety of dispersion between groups  with ANOVA.
# ANOVA tests is there is overall significant variation in variance between groups.
anova(mod)

Code

# Test the homogeniety of dispersion between groups  with a permutation test
# The permutation test considers pairwise comparisons so you can see which group(s) is different
permutest(mod, pairwise = TRUE)


Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 719

Response: Distances
          Df     Sum Sq    Mean Sq      F N.Perm Pr(>F)
Groups     1 1.2238e-05 1.2238e-05 0.2501    719 0.7014
Residuals  4 1.9573e-04 4.8932e-05                     

Pairwise comparisons:
(Observed p-value below diagonal, permuted p-value above diagonal)
        normal tumor
normal           0.6
tumor  0.64326

4 Covariate analysis

Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values.

Here, we are using DEGreport::degCovariates() to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. This diagnostic plot helps us determine which variables we may need to add to our DE model.

Code

# Visualize covariates for the PCs
degCovariates(
  counts = norm_matrix, # Normalized counts matrix
  metadata = metrics # Metadata to overlay as covariates
)

5 Data modeling

Code

# Enter your design formula
# The format should be:
# as.formula(paste0("~ ", column_to_control_for, " + ", factor_of_interest))
formula <- as.formula(paste0("~ ", " + ", column))
# Stop if the column names in counts does not match and in the same order as the row names in metrics
stopifnot(all(colnames(counts) == rownames(coldata)))

# Create the DDS object for DESeq2 and assign it to the object dds_to_use
dds_to_use <- DESeqDataSetFromMatrix(
  countData = counts, # Provide the raw counts data
  colData = coldata, # Provide the metadata
  design = formula # Provide the design formula
)

# Carry out a variance-stabilized transformation of the data in the DDS object and assign it to vsd_before
# Note: vsd_before and norm_matrix was previously assigned in the setup code chunk  w/o variables in the formula so this is overwriting that
# NOTE: VST won’t regress out this when normalizing
vsd_before <- vst(dds_to_use)
# Extract the variance-stabilized transformed (normalized) counts and assign them to norm_matrix
norm_matrix <- assay(vsd_before)
# We are going to modify coldata in the RUV section when is turn on, hence want to retain the original
new_cdata <- coldata

For this study, this formula is recommended: ~, +sample_type

5.1 Remove unwanted variation

Removing unwanted variation from RNA-seq analysis is essential to ensure that the results reflect biological rather than technical differences. Methods like ComBat, RUVseq, or surrogate variable analysis (SVA) can be applied to adjust for batch effects, library preparation discrepancies, or other confounders. These techniques model and subtract the unwanted variation, enhancing the ability to detect true biological signals in the data. Proper normalization and careful experimental design are also crucial steps to mitigate such unwanted variation.

5.1.1 Assessing unknown factors

Code

# Print out text discussing is RUVseq will be done
# If the RUV parameter in the params section of the YAML is TRUE, then
if (run_ruv) {
  cat("When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.")
} else { # If the RUV parameter in the params section of the YAML is FALSE, then
  cat("There is no need to assess unknown factor for this study.")
}

There is no need to assess unknown factor for this study.

Code

# Make DDS object
dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula)
# Apply VST
vsd_to_use <- vst(dds_to_use, blind = FALSE)

5.1.2 Remove Batch Effects

There is no need to remove known factors like batch effect in this study.

Code

# Run ComBat-seq
adjusted_counts <- ComBat_seq(
  counts = as.matrix(counts), # Provide the counts matrix
  batch = batch, # Provide the batch information
  group = treatment # Provide the variable of interest information
)

# NOTE: Running ComBat-seq with multiple variables of interest
# adjusted_counts <- ComBat_seq(
#   counts = as.matrix(counts), # Provide the counts matrix
#   batch = batch, # Provide the batch information
#   covar_mod = imp # Provide the variables of interest information
# )

Code

# Re-enter the adjusted count from ComBat-seq into DESeq
# NOTE: Make sure the formula doesn't contain the covariates used in ComBat-seq above
dds_to_use <- DESeqDataSetFromMatrix(
  countData = adjusted_counts, # Provide the adjusted counts from ComBat-seq
  colData = coldata, # Provide coldata
  design = formula # Provide design formula, notably without the covariates used in ComBat-seq
)
# Perform a variance-stabilized transformation on the ComBat-seq adjusted count data
vsd_combat <- vst(dds_to_use, blind = FALSE)

# Retrieve the variance-stabilized transformed counts data and assign it to the object norm_matrix
norm_matrix <- assay(vsd_combat)

# Run degPCA to use prcomp and make a plot
pca_combat <- degPCA(
  counts = norm_matrix, # Variance-stabilized transformed counts data after ComBat-seq
  metadata = coldata, # Provide metadata
  condition = column # Provide the variable of interest to color the plot by
) +
  ggtitle("After Combatseq") # Add title

# Plot PCA from dgePCA
pca_combat +
  scale_color_grafify("kelly") # Apply color scale

6 Differential Expression

Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model.

Before fitting the model, we often look at a metric called dispersion, which is a measure for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene, then ‘shrunken’ to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit.

We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model.

Code

# Perform differential expression analysis on the DDS object
de <- DESeq(dds_to_use)
# Plot the dispersion for the data
DESeq2::plotDispEsts(de)

Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level.

In cases there are multiple groups and conditions across groups is recommended to use dummy variables instead of interaction terms: https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#interactions.

The LRT is useful for testing multiple terms at once, for example testing 3 or more levels of a factor at once, or all interactions between two variables. The LRT for count data is conceptually similar to an analysis of variance (ANOVA) calculation in linear regression, except that in the case of the Negative Binomial GLM, we use an analysis of deviance (ANODEV), where the deviance captures the difference in likelihood between a full and a reduced model.

Code

# NOTE: We recommend LRT for time series
# resultsNames(de) # Check the order is right
# Create a list that mirrors the contrasts in the contrasts list object where the pattern is:
# <column_of_comparison>_<treatment_name>_vs_<control_name>
names_to_use <- lapply(contrasts, function(contrast) {
  coef <- paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3])
})
# Currently the contrasts list object doesn't have names for the items in the list
# Assign the names from names_to_use to the names in the contrasts list object
names(contrasts) <- names_to_use

# Create a list called `de_list` to hold the differential expression information for each contrast
# Go through each contrast in the contrasts list object and
de_list <- lapply(contrasts, function(contrast) {
  # Extract the results (LFC, pvalue, padj, etc.) for the contrast
  resLFC <- results(de, contrast = contrast)
  # Create a string for the contrast following this pattern:
  # <column_of_comparison>_<treatment_name>_vs_<control_name>
  coef <- paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3])
  # Apply LFC shrinkage with `ashr`
  # NOTE: Use `ashr` for comparisons with many groups to be able to pull out all the contrasts; otherwise `apeglm` is fine. It shrinks less.
  resLFCS <- lfcShrink(
    dds = de, # Provide the DDS object
    contrast = contrast, # Provide the contrast
    type = "ash" # Use `ashr` to apply the shrinkage
  )
  # If you want to use `apeglm` instead of `ashr` then use:
  # resLFCS <- lfcShrink(
  #   dds = de, # Provide the DDS object
  #   coef = coef, # Provide the contrast
  #   type = "apeglm" # Use `apeglm` to apply the shrinkage
  # )

  # Create a new results data frame with some extra information
  res <- as.data.frame(resLFCS) %>% # Extract the results from the resLFCS as a dataframe
    rownames_to_column("gene_id") %>% # Move the rownames to a column of data called "gene_id"
    left_join(rdata, by = "gene_id") %>% # Combine the results with the rdata object by the gene_id and rdata holds information about each gene
    relocate(gene_name) %>% # Move the "gene_name" column to be the first column
    dplyr::rename(lfc = log2FoldChange) %>% # Rename the column log2FoldChange to lfc
    mutate(pi = abs(lfc) * -log10(padj)) %>% # Create a new column named pi that multiplies the absolute value of log fold change by the log10 adjusted adjusted p-value
    arrange(-pi) # Arrange the data frame by the highest values of pi

  # Filter out genes that have no expression or were filtered out by DESEQ2
  res <- res[res$baseMean > 0, ] %>% # Remove genes with no expression
    drop_na(padj) %>% # Remove genes who don't have an adjusted p-value (lots of lowly expressed genes)
    drop_na(pvalue) # Remove genes who don't have a p-value (lots of lowly expressed genes)

  # Extract the significant genes
  res_sig <- res %>%
    filter(padj < 0.05) %>% # Retain the values that have an adjusted p-value less than 0.05
    arrange(padj) %>% # Arrange the genes by the lowest adjusted p-value
    mutate(gene_name = ifelse(test = gene_name == "" | is.na(gene_name), # If the gene_name value for a gene is NA or blank then
      yes = gene_id, # Use the gene_id as the gene_name
      no = gene_name # Maintain the gene_name
    ))
  # Create a list object called results
  results <- list(
    lfc = resLFC, # Assign the unshrunken results data to `lfc`
    lfcs = resLFCS, # Assign the shrunken results data to `lfcs`
    all = res, # Assign the full differential expression results to `all`
    sig = res_sig # Assign the significantly differentially expressed genes to `sig`
  )
  # Return the `results` list object to be a list within the object `de_list`
  return(results)
})

# NOTE: If you manually add any other comparison to the list with the following variables, the code below will make the plots for those as well:
# de_list <- c(de_list, new_comparison=list(lfc=resLFC,
#                                           lfcs=resLFCS,
#                                           all=res, sig=res_sig))

6.1 MA plot

This plot can help to:

Identify Differential Expression: Genes that show a significant log-fold change (M value away from 0) indicate changes in expression between conditions.
Assess Data Quality: The plot can help in identifying biases or systematic errors in the data. Ideally, most points should scatter around the M=0 line, indicating that there is no significant systematic difference between the conditions.
Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of expression levels and any patterns or anomalies in the dataset.

6.2 Differentially Expressed Genes

Code

dt_list <- list()
for (contrast in names(de_list)) {
  res_sig <- de_list[[contrast]][["sig"]]
  dt_list <- c(
    dt_list,
    list(h3(contrast)),
    list(DT::datatable(res_sig,
      options = list(
        scrollX = TRUE, autoWidth = TRUE
      ),
      class = "stripe hover"
    ))
  )
}
tagList(dt_list)

sample_type_tumor_vs_normal

sample_type_normal_vs_tumor

6.3 Volcano plot

This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are genes that are significantly differentially expressed with a large effect (padj < 0.05 and model coefficient > ±0.5). Points in pink are significant with a small effect (padj < 0.05 and model coefficient < ±0.5) and points in blue are not significant but have a large effect (padj > 0.05 and model coefficient > ±0.5). Grey points are non-significant. The dashed lines correspond to the cutoff values of padj and model coefficient that we have chosen.

6.4 Heatmap

6.5 Plot Top 16 Genes

7 Pathway Enrichment

From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment.

Code

# Call the appropriate functional analysis databases for your analysis
if (grepl("hg|human", genome, ignore.case = TRUE)) { # If the genome value in the params section of the YAML contains hg or human, then
  all_in_life <- get_databases_v2(sps = "human") # Get the human pathway databases using the get_databases_v2 function form the FA.R file within the 00_libs directory
  run_FA <- TRUE # Assign the value of TRUE to a object called run_FA and it will allow the functional analysis sections to run
} else if (grepl("mm|mouse", genome, ignore.case = TRUE)) { # If the genome value in the params section of the YAML contains mm or mouse, then
  all_in_life <- get_databases_v2(sps = "mouse") # Get the mouse pathway databases using the get_databases_v2 function form the FA.R file within the 00_libs directory
  run_FA <- TRUE # Assign the value of TRUE to a object called run_FA and it will allow the functional analysis sections to run
} else { # If your organism isn't mouse or human then
  warning("If you are working on an organism other than human or mouse: This pathway enrichment is based on annotations from MSigDB https://www.gsea-msigdb.org/gsea/msigdb which only has human and mouse collections. For non-model organism pathway analysis, please see the template rnaseq/03_functional/Nonmodel_Organism_Pathway_Analysis.Rmd") # Print this error message
  run_FA <- FALSE # Assign the value of FALSE to a object called run_FA and it will not allow the functional analysis sections to run
}

8 Pathway Analysis- GSEA

Gene Set Enrichment Analysis (GSEA) is a computational method used to determine whether a predefined set of genes shows statistically significant, concordant differences between two biological states (e.g., disease vs. normal) in RNA-seq data or other types of gene expression data. Advantages of GSEA.

Biological Insight: Helps in understanding the underlying biological processes and pathways affected, rather than focusing on individual genes.
Incorporation of Prior Knowledge: Utilizes predefined gene sets, allowing integration of existing biological knowledge.
Contextual Relevance: Can reveal subtle but coordinated changes in biologically meaningful gene sets that might not be apparent when looking at individual genes.

In the GSEA results, NES (normalized enrichment score) refers to the direction of regulation of the pathway. NES < 0 means downregulated, while NES > 0 means upregulated.

Code

# Go through each contrast in the de_list list object and assign data frames of significantly enriched pathways to to different elements of the list
fa_gsea_list <- lapply(de_list, function(contrast) {
  # For a given contrast pull out the full differential expression results
  res <- contrast[["all"]]
  # Creating the input for the GSEA analysis
  gsea_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
    filter(!is.na(padj)) %>% # Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object
    dplyr::select(gene_id, lfc) # Extract the columns "gene_id" and "lfc"
  # Obtain the ENSEMBL ID and gene symbols for the genes in the gsea_input object
  # NOTE: Change to the correct species if not working in human or mouse
  input_entrezid <- AnnotationDbi::select(
    x = ann.org, # Annotation database to search
    keys = gsea_input$gene_id, # The gene_ids to search for
    keytype = "ENSEMBL", # The gene_ids (keys) are ENSEMBL annotations
    columns = c("ENTREZID", "SYMBOL")
  ) # Return the queried ENSEMBL ID, along with the ENTREZ ID and the gene symbol
  # Combine gene annotation with the lfc information
  input_entrezid <- inner_join(
    x = gsea_input, # Combine the ENSEMBL ID and lfc input with
    y = input_entrezid, # The ENTREZ ID and gene symbol information
    by = c("gene_id" = "ENSEMBL")
  ) %>% # Join these table together by common ENSEMBL IDs
    filter(!is.na(ENTREZID)) %>% # Remove the genes without an ENTREZ ID
    distinct(ENTREZID, .keep_all = TRUE) %>% # Retain the unique ENTREZ IDs and all columns of data
    arrange(desc(lfc)) # Arrange the data frame by descending values of lfc

  # Run the run_fgsea_v2 function found in FA.R within the 00_libs directory to run the GSEA analysis and assign the output to the object tb
  tb <- run_fgsea_v2(
    input = input_entrezid, # Query for the GSEA analysis
    all_in_life = all_in_life
  ) # Annotation databases to query against
  tb %>%
    filter(padj < 0.05) %>%
    arrange(padj) # Filter and sort the GSEA output for results with an adjust p-value less than 0.05 and return them
})

Code

# NOTE: DT::datatables doesn't work with tabset and for loops
# You can use the following code to print dynamically or call manually sanitize_datatable() multiple times
# Create an empty list to assign contrast values and DT data tables to
dt_list <- list()

# For each of the contrasts in the de_list list object to extract the name of the contrast and the GSEA results to be rendered into an HTML table
for (contrast in names(de_list)) {
  # Assign the significantly enriched pathways to the object res_sig
  res_sig <- fa_gsea_list[[contrast]] %>% # Use the GSEA obect for a given contrast. NOTE: These have already been filter for adjusted p-value less than 0.05.
    filter(padj < 0.05) %>% arrange(padj) # Filter and sort for adjusted p-values of less than 0.05
  # Populate the dt_list list
  dt_list <- c(
    dt_list, # With the contents of dt_list from the previous loop
    list(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 size
    list(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly enriched pathways
  )
}

# Render the dt_list into HTML
tagList(dt_list)

sample_type_tumor_vs_normal

sample_type_normal_vs_tumor

9 Pathway Analysis- Over-representation

Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially expressed genes (DEGs) from RNA-seq data. Adventages of ORA:

Simplicity: Easy to perform and interpret.
Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied.
Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets.

Code

# Go through each contrast in the de_list list object and
fa_list <- lapply(de_list, function(contrast) {
  # For a given contrast pull out the full differential expression results
  res <- contrast[["all"]]

  # Extract the universe of genes to the ORA on and assign it to the object universe
  universe <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
    filter(!is.na(padj)) %>% # Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object
    pull(gene_id) # Extract the gene_id column
  # Subset the full annotations to only contain universe we will evaluate within
  universe_mapping <- rdata %>% # Start with the full annotations
    filter(gene_id %in% universe, !is.na(entrez)) # Only retain the the annotations that are in the universe of genes (baseMean greater than 0, along with defined p-value and adjusted p-value) and have an ENTREZ ID
  # AnnotationDbi::select(ann.org, universe, 'ENSEMBL', columns=c('ENTREZID', 'SYMBOL'))

  # Create the input vector of genes for the ORA with a positive or negative lfc
  ora_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
    filter(!is.na(padj), padj < 0.01, abs(lfc) > 0.3) %>% # Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object, and only keep genes with adjusted p-values less than 0.01 and the absolute value of the lfc is greater than 0.3
    pull(gene_id) # Extract the gene ID
  # Create the ENTREZ ID input for the ORA
  # NOTE: Change to the correct species if not working in human or mouse
  input_entrezid <- rdata %>% # Start with the full annotations
    filter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (ajusted p-value less than 0.01, lfc greater than the absolute value of 0.3) and have an ENTREZ ID

  # AnnotationDbi::select(ann.org, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))

  # Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory
  all <- run_fora_v2(
    input = input_entrezid, # Data frame of genes that are being evaluated for enrichment
    uni = universe_mapping, # Universe of gene to look for enrichment within
    all_in_life = all_in_life
  ) # Annotation databases to query against

  # Create the input vector of genes for the ORA with a positive lfc
  ora_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
    filter(!is.na(padj), padj < 0.01, lfc > 0.3) %>% # Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object, and only keep genes with adjusted p-values less than 0.01 and the lfc is greater than 0.3
    pull(gene_id) # Extract the gene ID
  # Create the ENTREZ ID input for the ORA
  # NOTE: Change to the correct species if not working in human or mouse
  input_entrezid <- rdata %>% # Start with the full annotations
    filter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (adjusted p-value less than 0.01 and lfc greater than 0.3) and have an ENTREZ ID

  # Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory
  up <- run_fora_v2(
    input = input_entrezid, # Data frame of genes that are being evaluated for enrichment
    uni = universe_mapping, # Universe of gene to look for enrichment within
    all_in_life = all_in_life
  ) # Annotation databases to query against

  # Create the input vector of genes for the ORA with a negative lfc
  ora_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
    filter(!is.na(padj), padj < 0.01, lfc < -0.3) %>% # Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object, and only keep genes with adjusted p-values less than 0.01 and the lfc is less than -0.3
    pull(gene_id) # Extract the gene ID
  # Create the ENTREZ ID input for the ORA
  # NOTE: Change to the correct species if not working in human or mouse
  input_entrezid <- rdata %>% # Start with the full annotations
    filter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (adjusted p-value less than 0.01 and lfc less than -0.3) and have an ENTREZ ID

  # Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory
  down <- run_fora_v2(
    input = input_entrezid, # Data frame of genes that are being evaluated for enrichment
    uni = universe_mapping, # Universe of gene to look for enrichment within
    all_in_life = all_in_life
  ) # Annotation databases to query against

  # Create a list to hold the ORA results for a given contrast
  list(
    all = all, # Assign the positive and negative lfc combined ORA results to "all"
    up = up, # Assign the positive lfc combined ORA results to "up"
    down = down # Assign the negative lfc combined ORA results to "down"
  )
})

9.1 All significant genes

Code

# NOTE DT::datatables doesn't work with tabset and for loops
# You can use the following code to print dynamically or call manually sanitize_datatable() multiple times

dt_list <- list()
for (contrast in names(de_list)) {
  res_sig <- fa_list[[contrast]][["all"]] %>%
    filter(padj < 0.05) %>%
    arrange(padj) # Filter and sort
  dt_list <- c(
    dt_list,
    list(h3(contrast)),
    list(sanitize_datatable(res_sig))
  )
}
tagList(dt_list)

sample_type_tumor_vs_normal

sample_type_normal_vs_tumor

9.2 Down-regulated genes

Code

# Create an empty list to assign contrast values and DT data tables to
dt_list <- list()
# For each of the contrasts in the de_list list object to extract the name of the contrast and the ORA results to be rendered into an HTML table
for (contrast in names(de_list)) {
  # Create a dataframe to hold the significantly over-represented terms for genes with a negative lfc
  res_sig <- fa_list[[contrast]][["down"]] %>% # Start with the complete list of terms used in the ORA evaluated for negative lfc
    filter(padj < 0.05) %>% arrange(padj) # Filter and sort only the results with adjusted p-values of 0.05

  # Populate the dt_list list
  dt_list <- c(
    dt_list, # With the contents of dt_list from the previous loop
    list(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 size
    list(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly over-represented terms for genes with a negative lfc
  )
}

# Render the dt_list into HTML for the significantly over-represented terms for genes with a negative  lfc
tagList(dt_list)

sample_type_tumor_vs_normal

sample_type_normal_vs_tumor

9.3 Up-regulated genes

Code

# Create an empty list to assign contrast values and DT data tables to
dt_list <- list()
# For each of the contrasts in the de_list list object to extract the name of the contrast and the ORA results to be rendered into an HTML table
for (contrast in names(de_list)) {
  # Create a dataframe to hold the significantly over-represented terms for genes with a positive lfc
  res_sig <- fa_list[[contrast]][["up"]] %>% # Start with the complete list of terms used in the ORA evaluated for positive lfc
    filter(padj < 0.05) %>% arrange(padj) # Filter and sort only the results with adjusted p-values of 0.05

  # Populate the dt_list list
  dt_list <- c(
    dt_list, # With the contents of dt_list from the previous loop
    list(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 size
    list(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly over-represented terms for genes with a positive lfc
  )
}

# Render the dt_list into HTML for the significantly over-represented terms for genes with a positive lfc
tagList(dt_list)

sample_type_tumor_vs_normal

sample_type_normal_vs_tumor

10 Save files

Code

# If the subset value from the params in the YAML is not null then
if (!is.null(subset_value) & !is.null(subset_column)) {
  # Interpret the value of subset_value and assign it to the object called filenames
  filenames <- str_interp("${subset_value}")
} else { # If the subset value from the params in the YAML is null then
  # Assign the calue of "full" to the object called filenames
  filenames <- "full"
}

# Assign the value of filenames to a new object called filename
filename <- paste0(filenames)

# Assign the file path to save the file path for the expression data CSV to the object name_expression_fn
name_expression_fn <- file.path( # Construct a file path
  basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directory
  str_interp("${filename}_expression.csv") # Interpret the value of "filename" and insert that in front of "_expression.csv"
)

# Extract the variance stabilized transformed counts matrix into a data frame to be saved
counts_norm <- norm_matrix %>% # Start with the variance stabilized transformed counts matrix
  as.data.frame() %>% # Convert the matrix to a data frame
  # Convert the row names (ENSEMBL IDs) to a column called "gene_id"
  rownames_to_column("gene_id") %>%
  left_join(rdata, by = "gene_id")

# Write the variance stabilized  transformed counts matrix to a CSV file with the name_expression_fn path
write_csv(counts_norm, name_expression_fn)

# For each of the contrasts in the de_list list object to save the differential expression analysis and ORA to their respective CSV files
for (contrast in names(contrasts)) {
  # Create a file name for the differential expression analysis
  name_deg_fn <- file.path( # Construct a file path
    basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directory
    str_interp("${filename}_${contrast}_deg.csv") # Interpret the value of "filename" and "contrast", then insert that in front of "_deg.csv"
  )

  # Create a file name for the ORA analysis
  name_pathways_fn <- file.path( # Construct a file path
    basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directory
    str_interp("${filename}_${contrast}_pathways.csv") # Interpret the value of "filename" and "contrast", then insert that in front of "_pathways.csv"
  )

  # Assign the full differential expression analysis for a given contrast to the object called res_for_writing
  res_for_writing <- de_list[[contrast]][["all"]] %>% # Start with the full differential expression analysis dataframe for the given contrast
    mutate(comparison = contrast) # Add a column to the table with the given contrast

  # Assign the full ORA for positive and negative lfc for a given contrast to the object called pathways_for_writing
  # NOTE: Choose what ORA to save (all, down or up). To save everything, add more lines of this code
  pathways_for_writing <- fa_list[[contrast]][["all"]] %>% # Start with the full ORA dataframe evaluating negative and positive lfc for the given contrast
    mutate(comparison = contrast) # Add a column to the table with the given contrast

  # Write the full differential expression analysis for a given contrast to a CSV file with the name_deg_fn path
  write_csv(res_for_writing, name_deg_fn)
  # Write the full ORA for a given contrast to a CSV file with the name_pathways_fn path
  write_csv(pathways_for_writing, name_pathways_fn)
}

If you would like to use DEGpatterns.Rmd template to visualize trends of gene changes across conditions/groups, please save the following output:

Code

# Save the DESeq object to an RDS file
saveRDS(dds_to_use, "DEGpattern_deseq_obj.rds")
# Save the metadata to an RDS file
saveRDS(coldata, "deseq_coldata.rds")

# Set the adjusted p-value threshold to 0.05
padj.cutoff <- 0.05
# Assign a value for the top n genes to XXX
topN <- 1000

# Go through each contrast in the de_list list object and extract the significance values for the significant genes for top N significant genes
# This will go through the de_list object, assign the name of the contrast in the de_list to contrast and assign its associated differential expression list (containing lfc, lfcs, all and sig) to the object contrast_de
deg_List <- imap(de_list, \(contrast_de, contrast){
  # Extract the significant genes from a given contrast and assign them to res_LRT
  res_LRT <- contrast_de[["sig"]]
  # Create a named vector where the values in the vector are the adjusted p-values and the names are the index of the value
  gene_padj <- setNames(res_LRT$padj, rownames(res_LRT))
  # Subset the gene_padj vector by the adjusted p-value threshold established in the top of this code block
  gene_sig <- gene_padj[gene_padj < padj.cutoff]
  # Arrange the gene significance values with the most significant first (they should already be arranged this way though), then keep either the first N genes defined by topN at the top of this code block or the total number of significant genes, whichever is fewer and assign it to the object called degs
  degs <- sort(gene_sig)[1:min(length(gene_sig), topN)]
  # Save the RDS object for the significance values for a given contrast, adjusted p-value threshold and number of top N genes
  saveRDS(degs, glue("DEGpattern_deseq_{contrast}_DEGs_padj{padj.cutoff}_topN{topN}.rds"))
  # Populate the deg_List with the degs vector for each contrast
  return(degs)
})
# Please note, you can choose to combine all significant results across contrast as well

11 Conclusion

12 Methods

12.1 R package references

Code

# Provide citations
citation("DESeq2")

To cite package ‘DESeq2’ in publications use:

Love, M.I., Huber, W., Anders, S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2 Genome Biology 15(12):550 (2014)

A BibTeX entry for LaTeX users is

@Article{, title = {Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2}, author = {Michael I. Love and Wolfgang Huber and Simon Anders}, year = {2014}, journal = {Genome Biology}, doi = {10.1186/s13059-014-0550-8}, volume = {15}, issue = {12}, pages = {550}, }

Code

citation("ggplot2")

To cite ggplot2 in publications, please use

H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2016.

A BibTeX entry for LaTeX users is

@Book{, author = {Hadley Wickham}, title = {ggplot2: Elegant Graphics for Data Analysis}, publisher = {Springer-Verlag New York}, year = {2016}, isbn = {978-3-319-24277-4}, url = {https://ggplot2.tidyverse.org}, }

Code

citation("pheatmap")

To cite package ‘pheatmap’ in publications use:

Kolde R (2025). pheatmap: Pretty Heatmaps. doi:10.32614/CRAN.package.pheatmap https://doi.org/10.32614/CRAN.package.pheatmap, R package version 1.0.13, https://CRAN.R-project.org/package=pheatmap.

A BibTeX entry for LaTeX users is

@Manual{, title = {pheatmap: Pretty Heatmaps}, author = {Raivo Kolde}, year = {2025}, note = {R package version 1.0.13}, url = {https://CRAN.R-project.org/package=pheatmap}, doi = {10.32614/CRAN.package.pheatmap}, }

12.2 R session

List and version of tools used for the DE report generation.

Code

# Print sessionInfo()
sessionInfo()

R version 4.5.1 (2025-06-13)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.5 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: UTC
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] ensembldb_2.30.0            AnnotationFilter_1.30.0    
 [3] GenomicFeatures_1.58.0      ashr_2.2-63                
 [5] viridis_0.6.5               viridisLite_0.4.2          
 [7] grafify_5.0.0.1             ggprism_1.0.6              
 [9] RColorBrewer_1.1-3          ggforce_0.5.0              
[11] htmltools_0.5.8.1           fgsea_1.32.4               
[13] pheatmap_1.0.13             DEGreport_1.42.0           
[15] EnhancedVolcano_1.24.0      ggrepel_0.9.6              
[17] vegan_2.7-1                 permute_0.9-7              
[19] DESeq2_1.46.0               rtracklayer_1.66.0         
[21] ggpubr_0.6.0                glue_1.8.0                 
[23] knitr_1.50                  org.Hs.eg.db_3.20.0        
[25] AnnotationDbi_1.68.0        janitor_2.2.1              
[27] SummarizedExperiment_1.36.0 Biobase_2.66.0             
[29] GenomicRanges_1.58.0        GenomeInfoDb_1.42.3        
[31] IRanges_2.40.1              S4Vectors_0.44.0           
[33] BiocGenerics_0.52.0         MatrixGenerics_1.18.1      
[35] matrixStats_1.5.0           GenomeInfoDbData_1.2.13    
[37] clusterProfiler_4.14.6      msigdbr_24.1.0             
[39] lubridate_1.9.4             forcats_1.0.0              
[41] stringr_1.5.1               dplyr_1.1.4                
[43] purrr_1.0.4                 readr_2.1.5                
[45] tidyr_1.3.1                 tibble_3.3.0               
[47] ggplot2_3.5.2               tidyverse_2.0.0            

loaded via a namespace (and not attached):
  [1] ProtGenerics_1.38.0         fs_1.6.6                   
  [3] bitops_1.0-9                enrichplot_1.26.6          
  [5] httr_1.4.7                  doParallel_1.0.17          
  [7] numDeriv_2016.8-1.1         tools_4.5.1                
  [9] backports_1.5.0             DT_0.33                    
 [11] R6_2.6.1                    lazyeval_0.2.2             
 [13] mgcv_1.9-3                  GetoptLong_1.0.5           
 [15] withr_3.0.2                 gridExtra_2.3              
 [17] cli_3.6.5                   logging_0.10-108           
 [19] sass_0.4.10                 labeling_0.4.3             
 [21] SQUAREM_2021.1              mvtnorm_1.3-3              
 [23] mixsqp_0.3-54               Rsamtools_2.22.0           
 [25] yulab.utils_0.2.0           gson_0.1.0                 
 [27] foreign_0.8-90              DOSE_4.0.1                 
 [29] R.utils_2.13.0              invgamma_1.1               
 [31] limma_3.62.2                rstudioapi_0.17.1          
 [33] RSQLite_2.4.1               generics_0.1.4             
 [35] gridGraphics_0.5-1          shape_1.4.6.1              
 [37] BiocIO_1.16.0               crosstalk_1.2.1            
 [39] vroom_1.6.5                 car_3.1-3                  
 [41] GO.db_3.20.0                Matrix_1.7-3               
 [43] abind_1.4-8                 R.methodsS3_1.8.2          
 [45] lifecycle_1.0.4             yaml_2.3.10                
 [47] edgeR_4.4.2                 snakecase_0.11.1           
 [49] carData_3.0-5               BiocFileCache_2.14.0       
 [51] qvalue_2.38.0               SparseArray_1.6.2          
 [53] grid_4.5.1                  blob_1.2.4                 
 [55] crayon_1.5.3                ggtangle_0.0.6             
 [57] lattice_0.22-7              cowplot_1.1.3              
 [59] KEGGREST_1.46.0             pillar_1.10.2              
 [61] ComplexHeatmap_2.22.0       rjson_0.2.23               
 [63] boot_1.3-31                 estimability_1.5.1         
 [65] codetools_0.2-20            fastmatch_1.1-6            
 [67] ggfun_0.1.8                 data.table_1.17.0          
 [69] vctrs_0.6.5                 png_0.1-8                  
 [71] treeio_1.30.0               Rdpack_2.6.4               
 [73] gtable_0.3.6                assertthat_0.2.1           
 [75] cachem_1.1.0                xfun_0.52                  
 [77] mime_0.13                   rbibutils_2.3              
 [79] S4Arrays_1.6.0              ConsensusClusterPlus_1.70.0
 [81] reformulas_0.4.0            iterators_1.0.14           
 [83] statmod_1.5.0               nlme_3.1-168               
 [85] ggtree_3.14.0               bit64_4.6.0-1              
 [87] filelock_1.0.3              bslib_0.9.0                
 [89] irlba_2.3.5.1               rpart_4.1.24               
 [91] colorspace_2.1-1            DBI_1.2.3                  
 [93] Hmisc_5.2-3                 nnet_7.3-20                
 [95] mnormt_2.1.1                tidyselect_1.2.1           
 [97] emmeans_1.11.0              bit_4.6.0                  
 [99] compiler_4.5.1              curl_6.4.0                 
[101] htmlTable_2.4.3             ggdendro_0.2.0             
[103] DelayedArray_0.32.0         checkmate_2.3.2            
[105] scales_1.4.0                psych_2.5.3                
[107] rappdirs_0.3.3              digest_0.6.37              
[109] minqa_1.2.8                 rmarkdown_2.29             
[111] XVector_0.46.0              pkgconfig_2.0.3            
[113] base64enc_0.1-3             lme4_1.1-37                
[115] dbplyr_2.5.0                fastmap_1.2.0              
[117] rlang_1.1.6                 GlobalOptions_0.1.2        
[119] htmlwidgets_1.6.4           UCSC.utils_1.2.0           
[121] jquerylib_0.1.4             farver_2.1.2               
[123] jsonlite_2.0.0              BiocParallel_1.40.2        
[125] GOSemSim_2.32.0             R.oo_1.27.1                
[127] RCurl_1.98-1.17             magrittr_2.0.3             
[129] Formula_1.2-5               ggplotify_0.1.2            
[131] patchwork_1.3.0             Rcpp_1.0.14                
[133] ape_5.8-1                   babelgene_22.9             
[135] stringi_1.8.7               zlibbioc_1.52.0            
[137] MASS_7.3-65                 AnnotationHub_3.14.0       
[139] plyr_1.8.9                  parallel_4.5.1             
[141] Biostrings_2.74.1           splines_4.5.1              
[143] hms_1.1.3                   circlize_0.4.16            
[145] locfit_1.5-9.12             igraph_2.1.4               
[147] ggsignif_0.6.4              reshape2_1.4.4             
[149] BiocVersion_3.20.0          XML_3.99-0.18              
[151] evaluate_1.0.3              BiocManager_1.30.26        
[153] nloptr_2.2.1                tzdb_0.5.0                 
[155] foreach_1.5.2               tweenr_2.0.3               
[157] polyclip_1.10-7             reshape_0.8.9              
[159] clue_0.3-66                 broom_1.0.8                
[161] xtable_1.8-4                restfulr_0.0.15            
[163] tidytree_0.4.6              rstatix_0.7.2              
[165] truncnorm_1.0-9             lmerTest_3.1-3             
[167] aplot_0.2.5                 memoise_2.0.1              
[169] GenomicAlignments_1.42.0    cluster_2.1.8.1            
[171] timechange_0.3.0

--- title: "Differential Expression" author: "Harvard Chan Bioinformatics Core" date: "`r Sys.Date()`" format: html: code-fold: true code-tools: true code-overflow: wrap df-print: paged highlight-style: pygments number-sections: true self-contained: true theme: default toc: true toc-location: left toc-expand: false lightbox: true page-layout: full params: ## Combatseq and ruv can both be false or ONLY ONE can be true # column is the factor of interest in your data that you are trying to evaluate # For example, this might be "condition" or "treatment" column: "sample_type" # This is a list of contrasts that you would like to evaluate # The syntax for each item in the list is: c("column_name", "treatment_name", "control_name") contrasts: value: - ["sample_type", "tumor", "normal"] - ["sample_type", "normal", "tumor"] # These next two subset lines are useful if you would like to subset your data in someway before you run the analysis # The column of data in you would like to subset, perhaps: "cell_type" or "batch" subset_column: null # Then, which values for that subset_column would you like to keep, like "macrophages" or "batch_2" subset_value: null # Reference genome used genome: hg38 # If you would like to run RUV, set to TRUE, otherwise FALSE ruv: false # If you would like to run ComBatseq, set to TRUE, otherwise FALSE combatseq: false params_file: ../00_params/params-example.R # example data # Path to the information.R file project_file: ../information.R # Path to the directory with the functions functions_file: ../00_libs --- Template developed with materials from https://hbctraining.github.io/main/. ```{r libraries} # NOTE: This code will check version, this is our recommendation, it may work on other versions # Requires R version 4 stopifnot(R.version$major >= 4) # If the minor version number is less than .3.1, send a warning that we recommend R version 4.3.1 or higher if (compareVersion(R.version$minor, "3.1") < 0) warning("We recommend >= R4.3.1") # Requires BiocManager version 3.18 or later stopifnot(compareVersion(as.character(BiocManager::version()), "3.18") >= 0) ``` This code is in this ![](https://img.shields.io/badge/status-stable-green) revision. ```{r load-params, cache = FALSE, message = FALSE, warning=FALSE} library(tidyverse) # 1. Set up input files in this R file (params_de.R) source(params$params_file) # 2. Set up project file (already done from QC probably) source(params$project_file) # 3. Load custom functions to load data from coldata/metrics/counts map(list.files(params$functions_file, pattern = "*.R$", full.names = T), source) %>% invisible() # Assign the genome parameter to the object genome # IMPORTANT set these values if you are not using the parameters in the header (lines 23-44) genome <- params$genome # If the genome name is matching to human or mouse, use their respective annotation, otherwise give a warning if (grepl("hg|human", genome, ignore.case = TRUE)) { library(org.Hs.eg.db) ann.org <- org.Hs.eg.db } else if (grepl("mm|mouse", genome, ignore.case = TRUE)) { library(org.Mm.eg.db) ann.org <- org.Mm.eg.db } else { warning("If you are working on an organism other than human or mouse: Bioconductor packages are also available for a number of other organisms. This code should work for any of the organisms with an eg.db package listed at https://bioconductor.org/packages/release/BiocViews.html#___Organism -- please search for your organism and assign ann.org manually.") } # Assigns the column object from the parameters in the YAML header to the column object column <- params$column # Assigns the contrasts object from the parameters in the YAML header to the contrasts object contrasts <- params$contrasts # Assigns the subset_column object from the parameters in the YAML header to the subset_column object subset_column <- params$subset_column # Assigns the subset_value object from the parameters in the YAML header to the subset_value object subset_value <- params$subset_value # Assigns the ruv boolean object from the parameters in the YAML header to the run_ruv object run_ruv <- params$ruv # Assigns the combatseq boolean object from the parameters in the YAML header to the run_combatseq object run_combatseq <- params$combatseq # If the run_ruv object and/or run_combatseq is TRUE, set the run_rmv boolean to TRUE, otherwise set run_rmv boolean to FALSE run_rmv <- run_ruv | run_combatseq # Assign the value of column to factor_of_interest, this will allow you to change factor_of_interest throughout the Rmd if needed, while maintaining the value of column factor_of_interest <- column ``` ```{r load-libraries, cache = FALSE, message = FALSE, warning=FALSE} # Load required libraries library(knitr) library(tidyverse) library(glue) library(janitor) library(purrr) library(ggpubr) library(rtracklayer) library(DESeq2) library(vegan) library(EnhancedVolcano) library(DEGreport) library(pheatmap) library(msigdbr) library(fgsea) library(htmltools) library(ggforce) library(RColorBrewer) library(ggprism) library(ggprism) library(grafify) library(viridis) library(ashr) # Overwrite the default filter function with dplyr's filter function filter <- dplyr::filter # Overwrite the default select function with dplyr's select function select <- dplyr::select # Setting the base theme that ggplot2 will use to theme_prism with the standard text size being 12 ggplot2::theme_set(theme_prism(base_size = 12)) # Setting scale color palette to the palette "kelly" from grafify # You can select different palettes from here: # https://grafify-vignettes.netlify.app/colour_palettes.html scale_colour_discrete <- function(...) { scale_colour_manual(..., values = as.vector(grafify:::graf_palettes[["kelly"]]) ) } # Setting scale fill palette to the palette "kelly" from grafify # You can select different palettes from here: # https://grafify-vignettes.netlify.app/colour_palettes.html scale_fill_discrete <- function(...) { scale_fill_manual(..., values = as.vector(grafify:::graf_palettes[["kelly"]]) ) } # Setting color to the palette "kelly" from grafify # You can select diffferent palettes from here: # https://grafify-vignettes.netlify.app/colour_palettes.html colors <- as.vector(grafify:::graf_palettes[["kelly"]]) # Setting options for Rmd code chunks opts_chunk[["set"]]( cache = FALSE, # When the cache is turned on, knitr will skip the execution of this code chunk if it has been executed before and nothing in the code chunk has changed since then. For a cached code chunk, its output and objects will be automatically loaded from the previous run, as if the chunk were executed again. cache.lazy = FALSE, # Whether to only load cached objects when needed dev = c("png", "pdf"), # Specifies the graphical devices to use figures error = TRUE, # Whether to stop rendering on an error highlight = TRUE, # Whether to enable syntax highlighting for the code in the chunk message = FALSE, # Whether to preserve messages emitted by message() in the final output prompt = FALSE, # Whether an R prompt (>) is displayed at the beginning of each line of R input code in rendered documents tidy = FALSE, # Whether or not R code in a chunk is formatted automatically before rendering, similar to the R package, styler warning = FALSE, # Whether to preserve warnings in the output echo = TRUE, # Whether to include R source code in the final knitted document fig.height = 4) # Sets the default height of plot figures generated in inches # Set seed for reproducibility set.seed(1234567890L) ``` ```{r sanitize-datatable} # Create a function to clean up data frames sanitize_datatable <- function(df, ...) { # Remove dashes from row names and column names which cause wrapping DT::datatable(df, ..., rownames = gsub("-", "_", rownames(df)), colnames = gsub("-", "_", colnames(df)), options = list(scrollX = TRUE, autoWidth = TRUE), class = "stripe hover" ) } ``` ```{r load-data, message=F, warning=F} # This code will load from nf-core folder # NOTE make sure to set numerator and denominator # Use the load_coldata function from the sourced load_data.R within the 00_params directory, to subset the colodata_fn dataframe that was pointed to when sourcing params.R within the 00_params directory # The factors for this subsetting were set in thr params section of the YAML code chunk. # The default subsetting is none since subset_column and subset_value are both set to null initially # The column object here is given to ensure that the column of interest exists in the loaded dataframe coldata <- load_coldata( coldata_fn, column, subset_column, subset_value ) # Set the row names from the coldata dataframe to be a column called sample # This column should already exist from the Nextflow run coldata$sample <- row.names(coldata) # Use the load_counts function from the sourced load_data.R within the 00_params directory, to load the counts data from the dataframe pointed to when sourcing params.R within the 00_params directory counts <- load_counts(counts_fn) # If subsetting parameters were provided, this will subset the counts data to match the samples in the coldata dataframe counts <- counts[, colnames(counts) %in% coldata$sample] # Use the load_metrics function from the sourced load_data.R within the 00_params directory, to load the metrics from the multiqc, along with the values of se_object and gtf_fn defined in the sourced params.R within the 00_params directory and also the recently created counts data, which is used to estimate the tRNA/rRNA rate (r_and_t_rna_rate). metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% left_join(coldata, by = c("sample")) %>% # Join the coldata table to the end of object produced from the load_metrics function as.data.frame() # Format it as a dataframe # Assign the sample names to be the row names in the metrics object rownames(metrics) <- metrics$sample # If subsetting parameters were provided, this will subset the metrics data to match the samples in the coldata dataframe metrics <- subset(metrics, metrics$sample %in% coldata$sample) # If the sample names in the counts (column) don't match in order or string of the metrics file, reorder them in the counts object counts <- counts[, rownames(metrics)] # If the sample names in the coldata object (rows) don't match in order or string of the metrics file, reorder them in the counts object coldata <- coldata[rownames(metrics), ] # This will relevel the first contrast in your contrasts list with with the "control_name" as the baseline value defined in the params section of the YAML coldata[[contrasts[[1]][1]]] <- relevel(as.factor(coldata[[contrasts[[1]][1]]]), contrasts[[1]][3]) # Stop if the column names in counts does not match and in the same order as the row names in metrics stopifnot(all(colnames(counts) == rownames(metrics))) ``` # Overview - Project: `r project` - PI: `r PI` - Analyst: `r analyst` - Experiment: `r experiment` - Aim: `r aim` ```{r load-counts-data} # Query AnnotationHub and assign it to ah object ah <- AnnotationHub::AnnotationHub() # Retrieve the annotations from Annotation Hub dependent on your reference genome if (grepl("hg|human", genome, ignore.case = TRUE)) { # If the reference genome is human ens <- AnnotationHub::query(ah, c("Homo sapiens", "EnsDb")) # Retrieve the recent human annotations ens <- ens[["AH119325"]] # Assign a human annotation, may need to change } else if (grepl("mm|mouse", genome, ignore.case = TRUE)) { # If the reference genome is mouse ens <- AnnotationHub::query(ah, c("Mus musculus", "EnsDb")) # Retrieve the recent mouse annotations ens <- ens[["AH116340"]] # Assign a mouse annotation, may need to change } else { # If the reference genome is not human or mouse, prin this warning cat("Warning: Please, get a table that contains gene_id, gene_name and entrez to continue using this template.") } # Create a gene-level dataframe # NOTE: Change this code if you are not using AnnotationHub # The rdata is a table with gene_id, gene_name and entrez columns rdata <- ensembldb::genes(ens, return.type = "data.frame") %>% # Extract ens annotation as a dataframe dplyr::select(gene_id, gene_name, entrez = entrezid, gene_biotype, description ) %>% # Retrieve certain columns and rename the entrez column dplyr::filter(gene_id %in% rownames(counts)) %>% # Retain only gene annotations in the counts matrix dplyr::distinct(gene_id, .keep_all = TRUE) # Retain only unique gene_ids and also retain all of the other columns with .keep_all = TRUE # Keep one entrez Id per gene rdata$entrez <- purrr::map(rdata$entrez, 1) %>% unlist() # Filter not needed genes # rdata <- rdata %>% filter(!grepl("ncRNA", gene_biotype)) %>% # Remove non-coding RNAs # filter(!grepl('Mir', gene_name)) %>% # Remove miRNAs # filter(!grepl('Snor', gene_name)) # Remove snoRNAs # counts <- counts[rdata$gene_id,] # Filter the counts matrix after removing these not needed genes ``` # Set up We recommend avoiding manual filtering prior to running DESeq2, as the software performs internal filtering as part of its pipeline. However, pre-filtering may be appropriate in certain contexts to improve computational efficiency or address data imbalances, such as: - A high proportion of dropouts (zero counts), where filtering can reduce computational burden - A large number of samples, where pre-filtering low-expressed features can improve performance - Strongly unbalanced group sizes, where group-specific filtering strategies may help mitigate bias or improve sensitivity ```{r setup} # Establish a null DDS object for DESeq2 dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1) # Perform a variance-stabilized transformation and assign the DESeqTransform object to vsd_before vsd_before <- vst(dds_to_use) # Extract the variance-stabilized transformed (normalized) counts and assign them to norm_matrix norm_matrix <- assay(vsd_before) ``` # PCA and group level variance. **Principal Component Analysis (PCA)** is a dimensionality reduction technique that transforms high-dimensional data into a smaller set of uncorrelated variables called principal components. In gene expression analysis, PCA reduces thousands of gene expression measurements into a few components that capture the most variance across samples. This helps visualize overall patterns, identify outliers, and detect batch effects or biological groupings. Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm. **It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.** **This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.** ```{r set-group, eval=FALSE, echo=FALSE} ## Example of creating a group covariate # meta$group <- paste0(meta$sex, "_", meta$age, "_", meta$treatment) # Re-assign the factor of interest in case it has change # You can modify this to test for different factors of interest with: # factor_of_interest <- "insert column name for covariate of interest" factor_of_interest <- column ``` ```{r plot-pca} # Run PCA on the normalized counts pca <- degPCA(norm_matrix, metadata = metrics, # Using the metrics object as our metadata condition = factor_of_interest, # Specifying the condition as our factor_of_interest from the above code chunk name = "sample", # Column from metadata to use to label the points in the PCA plot data = TRUE # Include the data ) pca$plot + # Create plot ggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) + # Add title to plot theme(plot.title = element_text(hjust = 0.5)) + # Center title on plot geom_mark_ellipse(aes(color = .data[[column]])) + # Create ellipses on plot scale_color_grafify("kelly") # Set color parameter for plot ``` ## Analysis of the variance by group Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/) Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the `betadisper()` function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the `anova()` function and the second from the `permutest()` function. We also get pairwise p-values for every group-group comparison. **This analysis allows us to confirm that the variance is the same in all groups (non-significant p-value), so that the DE analysis will be accurate.** Dispersion estimates are a key part of the DESeq2 analysis. DESeq2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per-gene dispersions to match this distribution. If one group has higher variance than all others, this will affect the dispersion estimates. ```{r PERMDISP} # Use vegdist() to compute pair-wise dissimilarity indices between samples from the normalized count matrix # Note, we need to transpose (use t())) the counts matrix for vegdist() to compute dissimilarity indices between the samples vare.disa <- vegdist(t(assay(vsd_before))) # Evaluate the homogeneity of dispersion within the groups mod <- betadisper(vare.disa, group = metrics[[factor_of_interest]]) # Test the homogeniety of dispersion between groups with ANOVA. # ANOVA tests is there is overall significant variation in variance between groups. anova(mod) # Test the homogeniety of dispersion between groups with a permutation test # The permutation test considers pairwise comparisons so you can see which group(s) is different permutest(mod, pairwise = TRUE) ``` # Covariate analysis Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values. Here, we are using `DEGreport::degCovariates()` to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. **This diagnostic plot helps us determine which variables we may need to add to our DE model.** ```{r run-covariates, fig.height = 6, fig.width = 10} # Visualize covariates for the PCs degCovariates( counts = norm_matrix, # Normalized counts matrix metadata = metrics # Metadata to overlay as covariates ) ``` # Data modeling ```{r setup-deseq2} # Enter your design formula # The format should be: # as.formula(paste0("~ ", column_to_control_for, " + ", factor_of_interest)) formula <- as.formula(paste0("~ ", " + ", column)) # Stop if the column names in counts does not match and in the same order as the row names in metrics stopifnot(all(colnames(counts) == rownames(coldata))) # Create the DDS object for DESeq2 and assign it to the object dds_to_use dds_to_use <- DESeqDataSetFromMatrix( countData = counts, # Provide the raw counts data colData = coldata, # Provide the metadata design = formula # Provide the design formula ) # Carry out a variance-stabilized transformation of the data in the DDS object and assign it to vsd_before # Note: vsd_before and norm_matrix was previously assigned in the setup code chunk w/o variables in the formula so this is overwriting that # NOTE: VST won’t regress out this when normalizing vsd_before <- vst(dds_to_use) # Extract the variance-stabilized transformed (normalized) counts and assign them to norm_matrix norm_matrix <- assay(vsd_before) # We are going to modify coldata in the RUV section when is turn on, hence want to retain the original new_cdata <- coldata ``` For this study, this formula is recommended: `r as.character(formula)` ```{r note-ruv, eval=F, echo=FALSE} #### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION or remove this section ``` ## Remove unwanted variation Removing unwanted variation from RNA-seq analysis is essential to ensure that the results reflect biological rather than technical differences. Methods like ComBat, RUVseq, or surrogate variable analysis (SVA) can be applied to adjust for batch effects, library preparation discrepancies, or other confounders. These techniques model and subtract the unwanted variation, enhancing the ability to detect true biological signals in the data. Proper normalization and careful experimental design are also crucial steps to mitigate such unwanted variation. ### Assessing unknown factors ```{r note-ruv-print, results='asis'} # Print out text discussing is RUVseq will be done # If the RUV parameter in the params section of the YAML is TRUE, then if (run_ruv) { cat("When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.") } else { # If the RUV parameter in the params section of the YAML is FALSE, then cat("There is no need to assess unknown factor for this study.") } ``` ```{r do_RUV, eval=run_ruv, echo=run_ruv} # TOFIX Add to template: check correlation of dummy variables produced by ruvseq with existing covariates in metadata # NOTE: RUVseq is used when you don’t know where the unwanted variation is coming from. The package utilizes dummy variable(s), 1-5 used, start with 1, look at PCA, decide if you want more separation. Add any known-created RUV variables to DESeq2 formula. The normalized matrix produced – only for visualization, not for input into DESeq2. library(RUVSeq) # If you want to skip the code, just set up formula to be your model in the next chunk of code # Vector containing the variable of interest design <- coldata[[column]] # Formats the variable of interest into a matrix # Levels are the rows and columns are the number of elements of a given level # The value is the index of the positions in the vector diffs <- makeGroups(design) # Re-assigning the variance stabilized transformation count matrix to a object called dat dat <- norm_matrix # By default RUVSeq is running one variable, ruvset_2 <- RUVs( x = dat, # Variance stabilized transformed counts data cIdx = rownames(dat), # Vector of the genes k = 1, # Number of unknown covariates to find scIdx = diffs, # Matrix formatted variable of interest isLog = TRUE, # The variance stabilized transformation is approximately a log transformation round = FALSE # Whether to round the normalized measures to form pseudocounts ) # W is the estimation of the unknown source of variation that we can now add to DESeq2 vars <- ruvset$W # Create a new colData with W added to it new_cdata <- cbind(coldata, vars) # New formula with W formula <- as.formula(paste0( "~ ", paste0( colnames(new_cdata)[grepl("W", colnames(new_cdata))], collapse = " + " ), " + ", column )) # Overwrite the normalized count matrix with the normalized counts from RUVSeq # NOTE: Only use this for visualization norm_matrix <- ruvset$normalizedCounts # Create a PCA pca2 <- degPCA( counts = norm_matrix, # Provide the adjusted counts from RUVSeq metadata = new_cdata, # Provide metadata condition = column # Provide the variable of interest to color the plot by ) + ggtitle("After RUV") # Add a title # Plot PCA pca2 + scale_color_grafify("kelly") # Apply color scale ``` ```{r after_RUV, eval=run_ruv} # Make DDS object dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula) # Apply VST vsd_to_use <- vst(dds_to_use, blind = FALSE) ``` ### Remove Batch Effects ```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq} # NOTE Combatseq (part of the SVA package) - corrected count, removes the effects while retaining the structure of the data. Used in a scenario where you know what covariate/batch is. Do not add known-removed covariates to DESeq2 formula. Also, don’t attempt to remove biological effect (e.g. donor), this is not conceptually valid; best for technical variation. library(sva) cat("Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest. Combat-seq uses a negative binomial regression to model batch effects, providing adjusted data by mapping the original data to an expected distribution if there were no batch effects. The adjusted data preserves the integer nature of counts, so that it is compatible with the assumptions of state-of-the-art differential expression software (e.g. edgeR, DESeq2, which specifically request untransformed count data).") ``` ```{r print-no-ruv-or-batch, eval=!run_combatseq, results='asis', echo=run_combatseq} cat("There is no need to remove known factors like batch effect in this study.") ``` ```{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq} # NOTE: work on this code if you need to run ComBat-seq # Set your batch effect variable here this is the variable that ComBat-seq will try to remove ## Column name of your batch variable to_remove <- "batch" ## Column name of of your variable(s) of interest to_keep <- "sample_type" # Re-assign the column of coldata with the batch information as a factor coldata[[to_remove]] <- as.factor(coldata[[to_remove]]) # Re-assign the column of coldata with the variable of interest as a factor coldata[[to_keep]] <- as.factor(coldata[[to_keep]]) # Assign the column with batch information to the batch object batch <- coldata[[to_remove]] # Assign the column with variable of interest information to the treatment object treatment <- coldata[[to_keep]] # If you have multiple variables of interest you will need to assign each to a vector # treatment1 <- metrics[[to_keep]] # treatment2 <- metrics[[to_keep]] # treatment3 <- metrics[[to_keep]] # Column bind all of the vectors with columns containing variables of interest into one object # imp <- cbind(as.numeric(as.character(treatment1)),as.numeric(as.character(treatment2)), as.numeric(as.character(treatment3))) ``` ```{r do_combatseq, eval=run_combatseq} # Run ComBat-seq adjusted_counts <- ComBat_seq( counts = as.matrix(counts), # Provide the counts matrix batch = batch, # Provide the batch information group = treatment # Provide the variable of interest information ) # NOTE: Running ComBat-seq with multiple variables of interest # adjusted_counts <- ComBat_seq( # counts = as.matrix(counts), # Provide the counts matrix # batch = batch, # Provide the batch information # covar_mod = imp # Provide the variables of interest information # ) ``` ```{r after_combatseq, eval=run_combatseq} # Re-enter the adjusted count from ComBat-seq into DESeq # NOTE: Make sure the formula doesn't contain the covariates used in ComBat-seq above dds_to_use <- DESeqDataSetFromMatrix( countData = adjusted_counts, # Provide the adjusted counts from ComBat-seq colData = coldata, # Provide coldata design = formula # Provide design formula, notably without the covariates used in ComBat-seq ) # Perform a variance-stabilized transformation on the ComBat-seq adjusted count data vsd_combat <- vst(dds_to_use, blind = FALSE) # Retrieve the variance-stabilized transformed counts data and assign it to the object norm_matrix norm_matrix <- assay(vsd_combat) # Run degPCA to use prcomp and make a plot pca_combat <- degPCA( counts = norm_matrix, # Variance-stabilized transformed counts data after ComBat-seq metadata = coldata, # Provide metadata condition = column # Provide the variable of interest to color the plot by ) + ggtitle("After Combatseq") # Add title # Plot PCA from dgePCA pca_combat + scale_color_grafify("kelly") # Apply color scale ``` # Differential Expression Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model. Before fitting the model, we often look at a metric called dispersion, which is a measure for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene, then 'shrunken' to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit. We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model. ```{r run-deseq2} # Perform differential expression analysis on the DDS object de <- DESeq(dds_to_use) # Plot the dispersion for the data DESeq2::plotDispEsts(de) ``` Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level. In cases there are multiple groups and conditions across groups is recommended to use dummy variables instead of interaction terms: https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#interactions. The LRT is useful for testing multiple terms at once, for example testing 3 or more levels of a factor at once, or all interactions between two variables. The LRT for count data is conceptually similar to an analysis of variance (ANOVA) calculation in linear regression, except that in the case of the Negative Binomial GLM, we use an analysis of deviance (ANODEV), where the deviance captures the difference in likelihood between a full and a reduced model. ```{r lfc-shrink} # NOTE: We recommend LRT for time series # resultsNames(de) # Check the order is right # Create a list that mirrors the contrasts in the contrasts list object where the pattern is: # <column_of_comparison>_<treatment_name>_vs_<control_name> names_to_use <- lapply(contrasts, function(contrast) { coef <- paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3]) }) # Currently the contrasts list object doesn't have names for the items in the list # Assign the names from names_to_use to the names in the contrasts list object names(contrasts) <- names_to_use # Create a list called `de_list` to hold the differential expression information for each contrast # Go through each contrast in the contrasts list object and de_list <- lapply(contrasts, function(contrast) { # Extract the results (LFC, pvalue, padj, etc.) for the contrast resLFC <- results(de, contrast = contrast) # Create a string for the contrast following this pattern: # <column_of_comparison>_<treatment_name>_vs_<control_name> coef <- paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3]) # Apply LFC shrinkage with `ashr` # NOTE: Use `ashr` for comparisons with many groups to be able to pull out all the contrasts; otherwise `apeglm` is fine. It shrinks less. resLFCS <- lfcShrink( dds = de, # Provide the DDS object contrast = contrast, # Provide the contrast type = "ash" # Use `ashr` to apply the shrinkage ) # If you want to use `apeglm` instead of `ashr` then use: # resLFCS <- lfcShrink( # dds = de, # Provide the DDS object # coef = coef, # Provide the contrast # type = "apeglm" # Use `apeglm` to apply the shrinkage # ) # Create a new results data frame with some extra information res <- as.data.frame(resLFCS) %>% # Extract the results from the resLFCS as a dataframe rownames_to_column("gene_id") %>% # Move the rownames to a column of data called "gene_id" left_join(rdata, by = "gene_id") %>% # Combine the results with the rdata object by the gene_id and rdata holds information about each gene relocate(gene_name) %>% # Move the "gene_name" column to be the first column dplyr::rename(lfc = log2FoldChange) %>% # Rename the column log2FoldChange to lfc mutate(pi = abs(lfc) * -log10(padj)) %>% # Create a new column named pi that multiplies the absolute value of log fold change by the log10 adjusted adjusted p-value arrange(-pi) # Arrange the data frame by the highest values of pi # Filter out genes that have no expression or were filtered out by DESEQ2 res <- res[res$baseMean > 0, ] %>% # Remove genes with no expression drop_na(padj) %>% # Remove genes who don't have an adjusted p-value (lots of lowly expressed genes) drop_na(pvalue) # Remove genes who don't have a p-value (lots of lowly expressed genes) # Extract the significant genes res_sig <- res %>% filter(padj < 0.05) %>% # Retain the values that have an adjusted p-value less than 0.05 arrange(padj) %>% # Arrange the genes by the lowest adjusted p-value mutate(gene_name = ifelse(test = gene_name == "" | is.na(gene_name), # If the gene_name value for a gene is NA or blank then yes = gene_id, # Use the gene_id as the gene_name no = gene_name # Maintain the gene_name )) # Create a list object called results results <- list( lfc = resLFC, # Assign the unshrunken results data to `lfc` lfcs = resLFCS, # Assign the shrunken results data to `lfcs` all = res, # Assign the full differential expression results to `all` sig = res_sig # Assign the significantly differentially expressed genes to `sig` ) # Return the `results` list object to be a list within the object `de_list` return(results) }) # NOTE: If you manually add any other comparison to the list with the following variables, the code below will make the plots for those as well: # de_list <- c(de_list, new_comparison=list(lfc=resLFC, # lfcs=resLFCS, # all=res, sig=res_sig)) ``` ## MA plot This plot can help to: - Identify Differential Expression: Genes that show a significant log-fold change (M value away from 0) indicate changes in expression between conditions. - Assess Data Quality: The plot can help in identifying biases or systematic errors in the data. Ideally, most points should scatter around the M=0 line, indicating that there is no significant systematic difference between the conditions. - Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of expression levels and any patterns or anomalies in the dataset. ::: {.panel-tabset} ```{r after_lfc_shrink, message=F, warning=F} #| results: asis #| fig-width: 4 #| fig-height: 4 # For each of the contrasts in the de_list list object to create MA plots for (contrast in names(de_list)) { # Create a tabset of the contrast cat("### ", contrast, "\n\n") # Use the unshrunken (lfc) counts for creating an MA plot p1 <- degMA(as.DEGSet(de_list[[contrast]]$lfc)) + ggtitle("Before LFC Shrinking") # Add title # Print the unshrunken counts MA plot print(p1) # Use the shrunken (lfc) counts for creating an MA plot p2 <- degMA(as.DEGSet(de_list[[contrast]]$lfcs), limit = 2) + ggtitle("After LFC Shrinking") # Add title print(p2) # Print the shrunken counts MA plot cat("\n\n") # Print two new lines } ``` ::: ## Differentially Expressed Genes ```{r} #| results: asis dt_list <- list() for (contrast in names(de_list)) { res_sig <- de_list[[contrast]][["sig"]] dt_list <- c( dt_list, list(h3(contrast)), list(DT::datatable(res_sig, options = list( scrollX = TRUE, autoWidth = TRUE ), class = "stripe hover" )) ) } tagList(dt_list) ``` ## Volcano plot This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are genes that are significantly differentially expressed with a large effect (padj < 0.05 and model coefficient > ±0.5). Points in pink are significant with a small effect (padj < 0.05 and model coefficient < ±0.5) and points in blue are not significant but have a large effect (padj > 0.05 and model coefficient > ±0.5). Grey points are non-significant. The dashed lines correspond to the cutoff values of padj and model coefficient that we have chosen. ::: {.panel-tabset} ```{r} #| results: asis #| fig-height: 6 for (contrast in names(de_list)) { cat("### ", contrast, "\n\n") res <- de_list[[contrast]][["all"]] res_mod <- res show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")]) p1 <- EnhancedVolcano(res_mod, lab = res_mod$gene_name, pCutoff = 0.05, selectLab = c(show$gene_name), FCcutoff = 0.5, x = "lfc", y = "padj", title = contrast, col = c("darkgrey", "lightblue", "plum1", "purple"), subtitle = "", drawConnectors = TRUE, max.overlaps = Inf ) print(p1) cat("\n\n") } ``` ::: ## Heatmap ::: {.panel-tabset} ```{r} #| results: asis for (contrast in names(de_list)) { cat("### ", contrast, "\n\n") res_sig <- de_list[[contrast]][["sig"]] ma <- norm_matrix[res_sig$gene_id, ] colma <- coldata[, c(column), drop = FALSE] ma_colors <- lapply(colnames(colma), function(c) { l.col <- colors[1:length(unique(colma[[c]]))] names(l.col) <- sort(unique(colma[[c]])) l.col }) names(ma_colors) <- colnames(colma) p1 <- pheatmap(ma, color = inferno(10), cluster_rows = TRUE, show_rownames = FALSE, annotation = colma, annotation_colors = ma_colors, border_color = NA, fontsize = 10, scale = "row", fontsize_row = 10, height = 20 ) print(p1) cat("\n\n") } ``` ::: ## Plot Top 16 Genes ::: {.panel-tabset} ```{r} #| results: asis #| fig-height: 6 #| fig-width: 8 n <- 16 for (contrast in names(de_list)) { cat("### ", contrast, "\n\n") res_sig <- de_list[[contrast]][["sig"]] top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = FALSE) %>% dplyr::select(gene_name, gene_id) top_n_exp <- norm_matrix %>% as.data.frame() %>% rownames_to_column("gene_id") %>% pivot_longer(cols = !gene_id, names_to = "sample", values_to = "log2_expression") %>% right_join(y = top_n, relationship = "many-to-many") %>% left_join(y = coldata, by = "sample") p1 <- ggplot(top_n_exp, aes_string(x = column, y = "log2_expression")) + geom_boxplot(outlier.shape = NA, linewidth = 0.5, color = "grey") + geom_point() + facet_wrap(~gene_name) + ggtitle(str_interp("Expression of Top ${n} DEGs")) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) print(p1) cat("\n\n") } ``` ::: # Pathway Enrichment From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment. ```{r setup-fa} # Call the appropriate functional analysis databases for your analysis if (grepl("hg|human", genome, ignore.case = TRUE)) { # If the genome value in the params section of the YAML contains hg or human, then all_in_life <- get_databases_v2(sps = "human") # Get the human pathway databases using the get_databases_v2 function form the FA.R file within the 00_libs directory run_FA <- TRUE # Assign the value of TRUE to a object called run_FA and it will allow the functional analysis sections to run } else if (grepl("mm|mouse", genome, ignore.case = TRUE)) { # If the genome value in the params section of the YAML contains mm or mouse, then all_in_life <- get_databases_v2(sps = "mouse") # Get the mouse pathway databases using the get_databases_v2 function form the FA.R file within the 00_libs directory run_FA <- TRUE # Assign the value of TRUE to a object called run_FA and it will allow the functional analysis sections to run } else { # If your organism isn't mouse or human then warning("If you are working on an organism other than human or mouse: This pathway enrichment is based on annotations from MSigDB https://www.gsea-msigdb.org/gsea/msigdb which only has human and mouse collections. For non-model organism pathway analysis, please see the template rnaseq/03_functional/Nonmodel_Organism_Pathway_Analysis.Rmd") # Print this error message run_FA <- FALSE # Assign the value of FALSE to a object called run_FA and it will not allow the functional analysis sections to run } ``` # Pathway Analysis- GSEA Gene Set Enrichment Analysis (GSEA) is a computational method used to determine whether a predefined set of genes shows statistically significant, concordant differences between two biological states (e.g., disease vs. normal) in RNA-seq data or other types of gene expression data. Advantages of GSEA. - Biological Insight: Helps in understanding the underlying biological processes and pathways affected, rather than focusing on individual genes. - Incorporation of Prior Knowledge: Utilizes predefined gene sets, allowing integration of existing biological knowledge. - Contextual Relevance: Can reveal subtle but coordinated changes in biologically meaningful gene sets that might not be apparent when looking at individual genes. **In the GSEA results, NES (normalized enrichment score) refers to the direction of regulation of the pathway. NES < 0 means downregulated, while NES > 0 means upregulated.** ```{r run-gsea, warning=F, message=F, eval=run_FA} # Go through each contrast in the de_list list object and assign data frames of significantly enriched pathways to to different elements of the list fa_gsea_list <- lapply(de_list, function(contrast) { # For a given contrast pull out the full differential expression results res <- contrast[["all"]] # Creating the input for the GSEA analysis gsea_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value) filter(!is.na(padj)) %>% # Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object dplyr::select(gene_id, lfc) # Extract the columns "gene_id" and "lfc" # Obtain the ENSEMBL ID and gene symbols for the genes in the gsea_input object # NOTE: Change to the correct species if not working in human or mouse input_entrezid <- AnnotationDbi::select( x = ann.org, # Annotation database to search keys = gsea_input$gene_id, # The gene_ids to search for keytype = "ENSEMBL", # The gene_ids (keys) are ENSEMBL annotations columns = c("ENTREZID", "SYMBOL") ) # Return the queried ENSEMBL ID, along with the ENTREZ ID and the gene symbol # Combine gene annotation with the lfc information input_entrezid <- inner_join( x = gsea_input, # Combine the ENSEMBL ID and lfc input with y = input_entrezid, # The ENTREZ ID and gene symbol information by = c("gene_id" = "ENSEMBL") ) %>% # Join these table together by common ENSEMBL IDs filter(!is.na(ENTREZID)) %>% # Remove the genes without an ENTREZ ID distinct(ENTREZID, .keep_all = TRUE) %>% # Retain the unique ENTREZ IDs and all columns of data arrange(desc(lfc)) # Arrange the data frame by descending values of lfc # Run the run_fgsea_v2 function found in FA.R within the 00_libs directory to run the GSEA analysis and assign the output to the object tb tb <- run_fgsea_v2( input = input_entrezid, # Query for the GSEA analysis all_in_life = all_in_life ) # Annotation databases to query against tb %>% filter(padj < 0.05) %>% arrange(padj) # Filter and sort the GSEA output for results with an adjust p-value less than 0.05 and return them }) ``` ```{r print-gsea, results='asis', eval=run_FA} # NOTE: DT::datatables doesn't work with tabset and for loops # You can use the following code to print dynamically or call manually sanitize_datatable() multiple times # Create an empty list to assign contrast values and DT data tables to dt_list <- list() # For each of the contrasts in the de_list list object to extract the name of the contrast and the GSEA results to be rendered into an HTML table for (contrast in names(de_list)) { # Assign the significantly enriched pathways to the object res_sig res_sig <- fa_gsea_list[[contrast]] %>% # Use the GSEA obect for a given contrast. NOTE: These have already been filter for adjusted p-value less than 0.05. filter(padj < 0.05) %>% arrange(padj) # Filter and sort for adjusted p-values of less than 0.05 # Populate the dt_list list dt_list <- c( dt_list, # With the contents of dt_list from the previous loop list(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 size list(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly enriched pathways ) } # Render the dt_list into HTML tagList(dt_list) ``` # Pathway Analysis- Over-representation Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially expressed genes (DEGs) from RNA-seq data. Adventages of ORA: - Simplicity: Easy to perform and interpret. - Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied. - Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets. ```{r run-ora, warning=F, message=F, eval=run_FA} # Go through each contrast in the de_list list object and fa_list <- lapply(de_list, function(contrast) { # For a given contrast pull out the full differential expression results res <- contrast[["all"]] # Extract the universe of genes to the ORA on and assign it to the object universe universe <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value) filter(!is.na(padj)) %>% # Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object pull(gene_id) # Extract the gene_id column # Subset the full annotations to only contain universe we will evaluate within universe_mapping <- rdata %>% # Start with the full annotations filter(gene_id %in% universe, !is.na(entrez)) # Only retain the the annotations that are in the universe of genes (baseMean greater than 0, along with defined p-value and adjusted p-value) and have an ENTREZ ID # AnnotationDbi::select(ann.org, universe, 'ENSEMBL', columns=c('ENTREZID', 'SYMBOL')) # Create the input vector of genes for the ORA with a positive or negative lfc ora_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value) filter(!is.na(padj), padj < 0.01, abs(lfc) > 0.3) %>% # Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object, and only keep genes with adjusted p-values less than 0.01 and the absolute value of the lfc is greater than 0.3 pull(gene_id) # Extract the gene ID # Create the ENTREZ ID input for the ORA # NOTE: Change to the correct species if not working in human or mouse input_entrezid <- rdata %>% # Start with the full annotations filter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (ajusted p-value less than 0.01, lfc greater than the absolute value of 0.3) and have an ENTREZ ID # AnnotationDbi::select(ann.org, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) # Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory all <- run_fora_v2( input = input_entrezid, # Data frame of genes that are being evaluated for enrichment uni = universe_mapping, # Universe of gene to look for enrichment within all_in_life = all_in_life ) # Annotation databases to query against # Create the input vector of genes for the ORA with a positive lfc ora_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value) filter(!is.na(padj), padj < 0.01, lfc > 0.3) %>% # Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object, and only keep genes with adjusted p-values less than 0.01 and the lfc is greater than 0.3 pull(gene_id) # Extract the gene ID # Create the ENTREZ ID input for the ORA # NOTE: Change to the correct species if not working in human or mouse input_entrezid <- rdata %>% # Start with the full annotations filter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (adjusted p-value less than 0.01 and lfc greater than 0.3) and have an ENTREZ ID # Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory up <- run_fora_v2( input = input_entrezid, # Data frame of genes that are being evaluated for enrichment uni = universe_mapping, # Universe of gene to look for enrichment within all_in_life = all_in_life ) # Annotation databases to query against # Create the input vector of genes for the ORA with a negative lfc ora_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value) filter(!is.na(padj), padj < 0.01, lfc < -0.3) %>% # Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object, and only keep genes with adjusted p-values less than 0.01 and the lfc is less than -0.3 pull(gene_id) # Extract the gene ID # Create the ENTREZ ID input for the ORA # NOTE: Change to the correct species if not working in human or mouse input_entrezid <- rdata %>% # Start with the full annotations filter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (adjusted p-value less than 0.01 and lfc less than -0.3) and have an ENTREZ ID # Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory down <- run_fora_v2( input = input_entrezid, # Data frame of genes that are being evaluated for enrichment uni = universe_mapping, # Universe of gene to look for enrichment within all_in_life = all_in_life ) # Annotation databases to query against # Create a list to hold the ORA results for a given contrast list( all = all, # Assign the positive and negative lfc combined ORA results to "all" up = up, # Assign the positive lfc combined ORA results to "up" down = down # Assign the negative lfc combined ORA results to "down" ) }) ``` ## All significant genes ```{r} #| results: asis #| eval: !expr run_FA # NOTE DT::datatables doesn't work with tabset and for loops # You can use the following code to print dynamically or call manually sanitize_datatable() multiple times dt_list <- list() for (contrast in names(de_list)) { res_sig <- fa_list[[contrast]][["all"]] %>% filter(padj < 0.05) %>% arrange(padj) # Filter and sort dt_list <- c( dt_list, list(h3(contrast)), list(sanitize_datatable(res_sig)) ) } tagList(dt_list) ``` ## Down-regulated genes ```{r print-ora-down} #| results: asis #| eval: !expr run_FA # Create an empty list to assign contrast values and DT data tables to dt_list <- list() # For each of the contrasts in the de_list list object to extract the name of the contrast and the ORA results to be rendered into an HTML table for (contrast in names(de_list)) { # Create a dataframe to hold the significantly over-represented terms for genes with a negative lfc res_sig <- fa_list[[contrast]][["down"]] %>% # Start with the complete list of terms used in the ORA evaluated for negative lfc filter(padj < 0.05) %>% arrange(padj) # Filter and sort only the results with adjusted p-values of 0.05 # Populate the dt_list list dt_list <- c( dt_list, # With the contents of dt_list from the previous loop list(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 size list(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly over-represented terms for genes with a negative lfc ) } # Render the dt_list into HTML for the significantly over-represented terms for genes with a negative lfc tagList(dt_list) ``` ## Up-regulated genes ```{r print-ora-up} #| results: asis #| eval: !expr run_FA # Create an empty list to assign contrast values and DT data tables to dt_list <- list() # For each of the contrasts in the de_list list object to extract the name of the contrast and the ORA results to be rendered into an HTML table for (contrast in names(de_list)) { # Create a dataframe to hold the significantly over-represented terms for genes with a positive lfc res_sig <- fa_list[[contrast]][["up"]] %>% # Start with the complete list of terms used in the ORA evaluated for positive lfc filter(padj < 0.05) %>% arrange(padj) # Filter and sort only the results with adjusted p-values of 0.05 # Populate the dt_list list dt_list <- c( dt_list, # With the contents of dt_list from the previous loop list(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 size list(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly over-represented terms for genes with a positive lfc ) } # Render the dt_list into HTML for the significantly over-represented terms for genes with a positive lfc tagList(dt_list) ``` # Save files ```{r write-files} # If the subset value from the params in the YAML is not null then if (!is.null(subset_value) & !is.null(subset_column)) { # Interpret the value of subset_value and assign it to the object called filenames filenames <- str_interp("${subset_value}") } else { # If the subset value from the params in the YAML is null then # Assign the calue of "full" to the object called filenames filenames <- "full" } # Assign the value of filenames to a new object called filename filename <- paste0(filenames) # Assign the file path to save the file path for the expression data CSV to the object name_expression_fn name_expression_fn <- file.path( # Construct a file path basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directory str_interp("${filename}_expression.csv") # Interpret the value of "filename" and insert that in front of "_expression.csv" ) # Extract the variance stabilized transformed counts matrix into a data frame to be saved counts_norm <- norm_matrix %>% # Start with the variance stabilized transformed counts matrix as.data.frame() %>% # Convert the matrix to a data frame # Convert the row names (ENSEMBL IDs) to a column called "gene_id" rownames_to_column("gene_id") %>% left_join(rdata, by = "gene_id") # Write the variance stabilized transformed counts matrix to a CSV file with the name_expression_fn path write_csv(counts_norm, name_expression_fn) # For each of the contrasts in the de_list list object to save the differential expression analysis and ORA to their respective CSV files for (contrast in names(contrasts)) { # Create a file name for the differential expression analysis name_deg_fn <- file.path( # Construct a file path basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directory str_interp("${filename}_${contrast}_deg.csv") # Interpret the value of "filename" and "contrast", then insert that in front of "_deg.csv" ) # Create a file name for the ORA analysis name_pathways_fn <- file.path( # Construct a file path basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directory str_interp("${filename}_${contrast}_pathways.csv") # Interpret the value of "filename" and "contrast", then insert that in front of "_pathways.csv" ) # Assign the full differential expression analysis for a given contrast to the object called res_for_writing res_for_writing <- de_list[[contrast]][["all"]] %>% # Start with the full differential expression analysis dataframe for the given contrast mutate(comparison = contrast) # Add a column to the table with the given contrast # Assign the full ORA for positive and negative lfc for a given contrast to the object called pathways_for_writing # NOTE: Choose what ORA to save (all, down or up). To save everything, add more lines of this code pathways_for_writing <- fa_list[[contrast]][["all"]] %>% # Start with the full ORA dataframe evaluating negative and positive lfc for the given contrast mutate(comparison = contrast) # Add a column to the table with the given contrast # Write the full differential expression analysis for a given contrast to a CSV file with the name_deg_fn path write_csv(res_for_writing, name_deg_fn) # Write the full ORA for a given contrast to a CSV file with the name_pathways_fn path write_csv(pathways_for_writing, name_pathways_fn) } ``` If you would like to use `DEGpatterns.Rmd` template to visualize trends of gene changes across conditions/groups, please save the following output: ```{r save} # Save the DESeq object to an RDS file saveRDS(dds_to_use, "DEGpattern_deseq_obj.rds") # Save the metadata to an RDS file saveRDS(coldata, "deseq_coldata.rds") # Set the adjusted p-value threshold to 0.05 padj.cutoff <- 0.05 # Assign a value for the top n genes to XXX topN <- 1000 # Go through each contrast in the de_list list object and extract the significance values for the significant genes for top N significant genes # This will go through the de_list object, assign the name of the contrast in the de_list to contrast and assign its associated differential expression list (containing lfc, lfcs, all and sig) to the object contrast_de deg_List <- imap(de_list, \(contrast_de, contrast){ # Extract the significant genes from a given contrast and assign them to res_LRT res_LRT <- contrast_de[["sig"]] # Create a named vector where the values in the vector are the adjusted p-values and the names are the index of the value gene_padj <- setNames(res_LRT$padj, rownames(res_LRT)) # Subset the gene_padj vector by the adjusted p-value threshold established in the top of this code block gene_sig <- gene_padj[gene_padj < padj.cutoff] # Arrange the gene significance values with the most significant first (they should already be arranged this way though), then keep either the first N genes defined by topN at the top of this code block or the total number of significant genes, whichever is fewer and assign it to the object called degs degs <- sort(gene_sig)[1:min(length(gene_sig), topN)] # Save the RDS object for the significance values for a given contrast, adjusted p-value threshold and number of top N genes saveRDS(degs, glue("DEGpattern_deseq_{contrast}_DEGs_padj{padj.cutoff}_topN{topN}.rds")) # Populate the deg_List with the degs vector for each contrast return(degs) }) # Please note, you can choose to combine all significant results across contrast as well ``` # Conclusion # Methods ## R package references ```{r citations, results='asis'} # Provide citations citation("DESeq2") citation("ggplot2") citation("pheatmap") ``` ## R session List and version of tools used for the DE report generation. ```{r} # Print sessionInfo() sessionInfo() ```