---
title: "Differential Expression"
author: "Harvard Chan Bioinformatics Core"
date: "`r Sys.Date()`"
format:
  html:
    code-fold: true
    code-tools: true
    code-overflow: wrap
    df-print: paged
    highlight-style: pygments
    number-sections: true
    self-contained: true
    theme: default
    toc: true
    toc-location: left
    toc-expand: false
    lightbox: true
    page-layout: full
params:
  ## Combatseq and ruv can both be false or ONLY ONE can be true
  # column is the factor of interest in your data that you are trying to evaluate
  # For example, this might be "condition" or "treatment"
  column: "sample_type"
  # This is a list of contrasts that you would like to evaluate
  # The syntax for each item in the list is: c("column_name", "treatment_name", "control_name")
  contrasts:
    value:
      - ["sample_type", "tumor", "normal"]
      - ["sample_type", "normal", "tumor"]
  # These next two subset lines are useful if you would like to subset your data in some way before you run the analysis
  # The column of data you would like to subset on, perhaps: "cell_type" or "batch"
  subset_column: null
  # Then, which values for that subset_column would you like to keep, like "macrophages" or "batch_2"
  subset_value: null
  # Reference genome used
  genome: hg38
  # If you would like to run RUV, set to TRUE, otherwise FALSE
  ruv: false
  # If you would like to run ComBatseq, set to TRUE, otherwise FALSE
  combatseq: false
  params_file: ../00_params/params-example.R # example data
  # Path to the information.R file
  project_file: ../information.R
  # Path to the directory with the functions
  functions_file: ../00_libs
---
Template developed with materials from https://hbctraining.github.io/main/.
```{r libraries}
# NOTE: This code checks the R and Bioconductor versions; these are our recommendations, and the template may work on other versions
# Requires R version 4
stopifnot(R.version$major >= 4)
# If the minor version number is less than .3.1, send a warning that we recommend R version 4.3.1 or higher
if (compareVersion(R.version$minor, "3.1") < 0) warning("We recommend >= R4.3.1")
# Requires BiocManager version 3.18 or later
stopifnot(compareVersion(as.character(BiocManager::version()), "3.18") >= 0)
```
This code is in this revision.
```{r load-params, cache = FALSE, message = FALSE, warning=FALSE}
library(tidyverse)
library(purrr)
# 1. Set up input files in this R file (params_de.R)
source(params$params_file)
# 2. Set up project file (already done from QC probably)
source(params$project_file)
# 3. Load custom functions to load data from coldata/metrics/counts
purrr::map(list.files(params$functions_file, pattern = "*.R$", full.names = T), source) %>% invisible()
# Assign the genome parameter to the object genome
# IMPORTANT set these values if you are not using the parameters in the header (lines 23-44)
genome <- params$genome
# If the genome name is matching to human or mouse, use their respective annotation, otherwise give a warning
if (grepl("hg|human", genome, ignore.case = TRUE)) {
library(org.Hs.eg.db)
ann.org <- org.Hs.eg.db
} else if (grepl("mm|mouse", genome, ignore.case = TRUE)) {
library(org.Mm.eg.db)
ann.org <- org.Mm.eg.db
} else {
warning("If you are working on an organism other than human or mouse: Bioconductor packages are also available for a number of other organisms. This code should work for any of the organisms with an eg.db package listed at https://bioconductor.org/packages/release/BiocViews.html#___Organism -- please search for your organism and assign ann.org manually.")
}
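# For example (hypothetical; adjust to your organism), for rat you could use the org.Rn.eg.db package:
# library(org.Rn.eg.db)
# ann.org <- org.Rn.eg.db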
# Assigns the column object from the parameters in the YAML header to the column object
column <- params$column
# Assigns the contrasts object from the parameters in the YAML header to the contrasts object
contrasts <- params$contrasts
# Assigns the subset_column object from the parameters in the YAML header to the subset_column object
subset_column <- params$subset_column
# Assigns the subset_value object from the parameters in the YAML header to the subset_value object
subset_value <- params$subset_value
# Assigns the ruv boolean object from the parameters in the YAML header to the run_ruv object
run_ruv <- params$ruv
# Assigns the combatseq boolean object from the parameters in the YAML header to the run_combatseq object
run_combatseq <- params$combatseq
# If the run_ruv object and/or run_combatseq is TRUE, set the run_rmv boolean to TRUE, otherwise set run_rmv boolean to FALSE
run_rmv <- run_ruv | run_combatseq
# Assign the value of column to factor_of_interest, this will allow you to change factor_of_interest throughout the Rmd if needed, while maintaining the value of column
factor_of_interest <- column
```
```{r load-libraries, cache = FALSE, message = FALSE, warning=FALSE}
# Load required libraries
library(knitr)
library(tidyverse)
library(glue)
library(janitor)
library(purrr)
library(ggpubr)
library(rtracklayer)
library(DESeq2)
library(vegan)
library(EnhancedVolcano)
library(DEGreport)
library(pheatmap)
library(msigdbr)
library(fgsea)
library(htmltools)
library(ggforce)
library(RColorBrewer)
library(ggprism)
library(grafify)
library(viridis)
library(ashr)
# Overwrite the default filter function with dplyr's filter function
filter <- dplyr::filter
# Overwrite the default select function with dplyr's select function
select <- dplyr::select
# Setting the base theme that ggplot2 will use to theme_prism with the standard text size being 12
ggplot2::theme_set(theme_prism(base_size = 12))
# Setting scale color palette to the palette "kelly" from grafify
# You can select different palettes from here:
# https://grafify-vignettes.netlify.app/colour_palettes.html
scale_colour_discrete <- function(...) {
scale_colour_manual(...,
values = as.vector(grafify:::graf_palettes[["kelly"]])
)
}
# Setting scale fill palette to the palette "kelly" from grafify
# You can select different palettes from here:
# https://grafify-vignettes.netlify.app/colour_palettes.html
scale_fill_discrete <- function(...) {
scale_fill_manual(...,
values = as.vector(grafify:::graf_palettes[["kelly"]])
)
}
# Setting color to the palette "kelly" from grafify
# You can select different palettes from here:
# https://grafify-vignettes.netlify.app/colour_palettes.html
colors <- as.vector(grafify:::graf_palettes[["kelly"]])
# Setting options for Rmd code chunks
opts_chunk[["set"]](
cache = FALSE, # When the cache is turned on, knitr will skip the execution of this code chunk if it has been executed before and nothing in the code chunk has changed since then. For a cached code chunk, its output and objects will be automatically loaded from the previous run, as if the chunk were executed again.
cache.lazy = FALSE, # Whether to only load cached objects when needed
dev = c("png", "pdf"), # Specifies the graphical devices to use figures
error = TRUE, # Whether to stop rendering on an error
highlight = TRUE, # Whether to enable syntax highlighting for the code in the chunk
message = FALSE, # Whether to preserve messages emitted by message() in the final output
prompt = FALSE, # Whether an R prompt (>) is displayed at the beginning of each line of R input code in rendered documents
tidy = FALSE, # Whether or not R code in a chunk is formatted automatically before rendering, similar to the R package, styler
warning = FALSE, # Whether to preserve warnings in the output
echo = TRUE, # Whether to include R source code in the final knitted document
fig.height = 4) # Sets the default height of plot figures generated in inches
# Set seed for reproducibility
set.seed(1234567890L)
```
```{r sanitize-datatable}
# Create a function to clean up data frames
sanitize_datatable <- function(df, ...) {
# Replace dashes in row and column names with underscores, since dashes cause wrapping
DT::datatable(df, ...,
rownames = gsub("-", "_", rownames(df)),
colnames = gsub("-", "_", colnames(df)),
options = list(scrollX = TRUE, autoWidth = TRUE),
class = "stripe hover"
)
}
```
```{r load-data, message=F, warning=F}
# This code will load from nf-core folder
# NOTE make sure to set numerator and denominator
# Use the load_coldata function from load_data.R (sourced from the 00_libs directory) to load and, if requested, subset the coldata_fn dataframe that was pointed to when sourcing params.R within the 00_params directory
# The factors for this subsetting were set in the params section of the YAML header.
# The default subsetting is none since subset_column and subset_value are both set to null initially
# The column object here is given to ensure that the column of interest exists in the loaded dataframe
coldata <- load_coldata(
coldata_fn, column,
subset_column, subset_value
)
# Copy the row names of the coldata dataframe into a column called sample
# (this column may already exist from the Nextflow run; this ensures it matches the row names)
coldata$sample <- row.names(coldata)
# Use the load_counts function from load_data.R (sourced from the 00_libs directory) to load the counts data from the file pointed to when sourcing params.R within the 00_params directory
counts <- load_counts(counts_fn)
# If subsetting parameters were provided, this will subset the counts data to match the samples in the coldata dataframe
counts <- counts[, colnames(counts) %in% coldata$sample]
# Use the load_metrics function from load_data.R (sourced from the 00_libs directory) to load the MultiQC metrics, along with the values of se_object and gtf_fn defined in the sourced params.R within the 00_params directory and the recently created counts data, which is used to estimate the tRNA/rRNA rate (r_and_t_rna_rate).
metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>%
left_join(coldata, by = c("sample")) %>% # Join the coldata table to the end of object produced from the load_metrics function
as.data.frame() # Format it as a dataframe
# Assign the sample names to be the row names in the metrics object
rownames(metrics) <- metrics$sample
# If subsetting parameters were provided, this will subset the metrics data to match the samples in the coldata dataframe
metrics <- subset(metrics, metrics$sample %in% coldata$sample)
# Reorder the columns of the counts object so the sample order matches the row order of the metrics object
counts <- counts[, rownames(metrics)]
# Reorder the rows of the coldata object so the sample order matches the row order of the metrics object
coldata <- coldata[rownames(metrics), ]
# This will relevel the first contrast in your contrasts list with the "control_name" defined in the params section of the YAML as the baseline level
coldata[[contrasts[[1]][1]]] <- relevel(as.factor(coldata[[contrasts[[1]][1]]]), contrasts[[1]][3])
# Stop if the column names in counts do not match, in the same order, the row names in metrics
stopifnot(all(colnames(counts) == rownames(metrics)))
```
# Overview
- Project: `r project`
- PI: `r PI`
- Analyst: `r analyst`
- Experiment: `r experiment`
- Aim: `r aim`
```{r load-counts-data}
# Query AnnotationHub and assign it to ah object
ah <- AnnotationHub::AnnotationHub()
# Retrieve the annotations from Annotation Hub dependent on your reference genome
if (grepl("hg|human", genome, ignore.case = TRUE)) { # If the reference genome is human
ens <- AnnotationHub::query(ah, c("Homo sapiens", "EnsDb")) # Retrieve the recent human annotations
ens <- ens[["AH119325"]] # Assign a human annotation, may need to change
} else if (grepl("mm|mouse", genome, ignore.case = TRUE)) { # If the reference genome is mouse
ens <- AnnotationHub::query(ah, c("Mus musculus", "EnsDb")) # Retrieve the recent mouse annotations
ens <- ens[["AH116340"]] # Assign a mouse annotation, may need to change
} else { # If the reference genome is not human or mouse, print this warning
cat("Warning: please provide a table that contains gene_id, gene_name and entrez columns to continue using this template.")
}
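# Hypothetical sketch for a non-human/mouse organism: build rdata manually from an Ensembl-style GTF
# (assumes the GTF carries gene_id, gene_name and gene_biotype attributes; ENTREZ IDs would still
# need to be added from an org.db package or biomaRt)
# gtf_df <- as.data.frame(rtracklayer::import(gtf_fn))
# rdata <- gtf_df %>%
#   dplyr::filter(type == "gene") %>%
#   dplyr::distinct(gene_id, .keep_all = TRUE) %>%
#   dplyr::select(gene_id, gene_name, gene_biotype)
# rdata$entrez <- NA # fill in from an org.db package or biomaRt if ENTREZ IDs are needed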
# Create a gene-level dataframe
# NOTE: Change this code if you are not using AnnotationHub
# The rdata is a table with gene_id, gene_name and entrez columns
rdata <- ensembldb::genes(ens, return.type = "data.frame") %>% # Extract ens annotation as a dataframe
dplyr::select(gene_id, gene_name,
entrez = entrezid,
gene_biotype, description
) %>% # Retrieve certain columns and rename the entrez column
dplyr::filter(gene_id %in% rownames(counts)) %>% # Retain only gene annotations in the counts matrix
dplyr::distinct(gene_id, .keep_all = TRUE) # Retain only unique gene_ids and also retain all of the other columns with .keep_all = TRUE
# Keep one entrez Id per gene
rdata$entrez <- purrr::map(rdata$entrez, 1) %>% unlist()
# Filter not needed genes
# rdata <- rdata %>% filter(!grepl("ncRNA", gene_biotype)) %>% # Remove non-coding RNAs
# filter(!grepl('Mir', gene_name)) %>% # Remove miRNAs
# filter(!grepl('Snor', gene_name)) # Remove snoRNAs
# counts <- counts[rdata$gene_id,] # Filter the counts matrix after removing these not needed genes
```
# Set up
We recommend avoiding manual filtering prior to running DESeq2, as the software performs internal filtering as part of its pipeline. However, pre-filtering may be appropriate in certain contexts to improve computational efficiency or address data imbalances (a minimal sketch follows the list below), such as:
- A high proportion of dropouts (zero counts), where filtering can reduce computational burden
- A large number of samples, where pre-filtering low-expressed features can improve performance
- Strongly unbalanced group sizes, where group-specific filtering strategies may help mitigate bias or improve sensitivity
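If pre-filtering is warranted for one of the reasons above, a minimal sketch is shown below (not evaluated by default; the threshold of 10 counts in at least 3 samples is purely illustrative and should be tuned to your design):
```{r prefilter-example, eval=FALSE}
# OPTIONAL pre-filtering sketch: keep genes with at least 10 counts in at least 3 samples
# (thresholds are illustrative; DESeq2 still performs its own independent filtering)
keep <- rowSums(counts >= 10) >= 3
counts <- counts[keep, ]
```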
```{r setup}
# Establish a null DDS object for DESeq2
dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1)
# Perform a variance-stabilized transformation and assign the DESeqTransform object to vsd_before
vsd_before <- vst(dds_to_use)
# Extract the variance-stabilized transformed (normalized) counts and assign them to norm_matrix
norm_matrix <- assay(vsd_before)
```
# PCA and group-level variance
**Principal Component Analysis (PCA)** is a dimensionality reduction technique that transforms high-dimensional data into a smaller set of uncorrelated variables called principal components. In gene expression analysis, PCA reduces thousands of gene expression measurements into a few components that capture the most variance across samples. This helps visualize overall patterns, identify outliers, and detect batch effects or biological groupings.
Dispersion estimates are a key part of the DESeq2 analysis. DESeq2 uses data from all samples and all genes to generate a relationship between expression level and variance and then shrinks per-gene dispersions to match this distribution. If one group has higher variance than all others, this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal-volume enclosing ellipses computed with the Khachiyan algorithm.
**It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.**
**This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.**
```{r set-group, eval=FALSE, echo=FALSE}
## Example of creating a group covariate
# meta$group <- paste0(meta$sex, "_", meta$age, "_", meta$treatment)
# Re-assign the factor of interest in case it has changed
# You can modify this to test for different factors of interest with:
# factor_of_interest <- "insert column name for covariate of interest"
factor_of_interest <- column
```
```{r plot-pca}
# Run PCA on the normalized counts
pca <- degPCA(norm_matrix,
metadata = metrics, # Using the metrics object as our metadata
condition = factor_of_interest, # Specifying the condition as our factor_of_interest from the above code chunk
name = "sample", # Column from metadata to use to label the points in the PCA plot
data = TRUE # Include the data
)
pca$plot + # Create plot
ggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) + # Add title to plot
theme(plot.title = element_text(hjust = 0.5)) + # Center title on plot
geom_mark_ellipse(aes(color = .data[[column]])) + # Create ellipses on plot
scale_color_grafify("kelly") # Set color parameter for plot
```
## Analysis of the variance by group
Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/)
Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the `betadisper()` function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the `anova()` function and the second from the `permutest()` function. We also get pairwise p-values for every group-group comparison.
**This analysis allows us to confirm that the variance is the same in all groups (non-significant p-value), so that the DE analysis will be accurate.** Dispersion estimates are a key part of the DESeq2 analysis. DESeq2 uses data from all samples and all genes to generate a relationship between expression level and variance and then shrinks per-gene dispersions to match this distribution. If one group has higher variance than all others, this will affect the dispersion estimates.
```{r PERMDISP}
# Use vegdist() to compute pair-wise dissimilarity indices between samples from the normalized count matrix
# Note: we need to transpose (using t()) the counts matrix for vegdist() to compute dissimilarity indices between the samples
vare.disa <- vegdist(t(assay(vsd_before)))
# Evaluate the homogeneity of dispersion within the groups
mod <- betadisper(vare.disa, group = metrics[[factor_of_interest]])
# Test the homogeneity of dispersion between groups with ANOVA.
# ANOVA tests whether there is overall significant variation in dispersion between groups.
anova(mod)
# Test the homogeneity of dispersion between groups with a permutation test
# The permutation test considers pairwise comparisons so you can see which group(s) is different
permutest(mod, pairwise = TRUE)
```
# Covariate analysis
Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values.
Here, we are using `DEGreport::degCovariates()` to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. **This diagnostic plot helps us determine which variables we may need to add to our DE model.**
```{r run-covariates, fig.height = 6, fig.width = 10}
# Visualize covariates for the PCs
degCovariates(
counts = norm_matrix, # Normalized counts matrix
metadata = metrics # Metadata to overlay as covariates
)
```
# Data modeling
```{r setup-deseq2}
# Enter your design formula
# The format should be:
# as.formula(paste0("~ ", column_to_control_for, " + ", factor_of_interest))
formula <- as.formula(paste0("~ ", " + ", column))
# Stop if the column names in counts does not match and in the same order as the row names in metrics
stopifnot(all(colnames(counts) == rownames(coldata)))
# Create the DDS object for DESeq2 and assign it to the object dds_to_use
dds_to_use <- DESeqDataSetFromMatrix(
countData = counts, # Provide the raw counts data
colData = coldata, # Provide the metadata
design = formula # Provide the design formula
)
# Carry out a variance-stabilized transformation of the data in the DDS object and assign it to vsd_before
# Note: vsd_before and norm_matrix were previously assigned in the setup code chunk without variables in the formula, so this overwrites them
# NOTE: VST does not regress out the design variables when normalizing
vsd_before <- vst(dds_to_use)
# Extract the variance-stabilized transformed (normalized) counts and assign them to norm_matrix
norm_matrix <- assay(vsd_before)
# We will modify coldata in the RUV section when it is turned on, so we retain the original here
new_cdata <- coldata
```
For this study, this formula is recommended: `r format(formula)`
```{r note-ruv, eval=F, echo=FALSE}
#### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION or remove this section
```
## Remove unwanted variation
Removing unwanted variation from RNA-seq analysis is essential to ensure that the results reflect biological rather than technical differences. Methods like ComBat, RUVseq, or surrogate variable analysis (SVA) can be applied to adjust for batch effects, library preparation discrepancies, or other confounders. These techniques model and subtract the unwanted variation, enhancing the ability to detect true biological signals in the data. Proper normalization and careful experimental design are also crucial steps to mitigate such unwanted variation.
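SVA is not wired into this template's parameters, but as a rough sketch (assuming the `sva` package is installed), surrogate variables can be estimated from the raw counts and added to the model in the same way the RUV factors are below:
```{r sva-example, eval=FALSE}
# OPTIONAL sketch: estimate surrogate variables with svaseq (not part of this template's params)
library(sva)
# Full model includes the factor of interest; null model is intercept-only
mod <- model.matrix(as.formula(paste0("~ ", column)), data = coldata)
mod0 <- model.matrix(~1, data = coldata)
# svaseq expects a count matrix; drop all-zero genes first
dat_sva <- as.matrix(counts)
dat_sva <- dat_sva[rowSums(dat_sva) > 0, ]
svs <- svaseq(dat_sva, mod, mod0)
# Add the surrogate variables to the metadata and include them in the DESeq2 design,
# e.g. design = ~ SV1 + sample_type
sv_mat <- as.matrix(svs$sv)
colnames(sv_mat) <- paste0("SV", seq_len(ncol(sv_mat)))
coldata_sva <- cbind(coldata, sv_mat)
```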
### Assessing unknown factors
```{r note-ruv-print, results='asis'}
# Print out text stating whether RUVseq will be run
# If the RUV parameter in the params section of the YAML is TRUE, then
if (run_ruv) {
cat("When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.")
} else { # If the RUV parameter in the params section of the YAML is FALSE, then
cat("There is no need to assess unknown factor for this study.")
}
```
```{r do_RUV, eval=run_ruv, echo=run_ruv}
# TOFIX Add to template: check correlation of dummy variables produced by ruvseq with existing covariates in metadata
# NOTE: RUVseq is used when you don't know where the unwanted variation is coming from. It estimates one or more latent factors (typically 1-5; start with 1, look at the PCA, and decide whether you need more separation). Add the estimated RUV variables (W columns) to the DESeq2 formula. The normalized matrix it produces is only for visualization, not for input into DESeq2.
library(RUVSeq)
# If you want to skip the code, just set up formula to be your model in the next chunk of code
# Vector containing the variable of interest
design <- coldata[[column]]
# Formats the variable of interest into a matrix
# Levels are the rows and columns are the number of elements of a given level
# The value is the index of the positions in the vector
diffs <- makeGroups(design)
# Re-assign the variance-stabilized transformed count matrix to an object called dat
dat <- norm_matrix
# By default we estimate a single unwanted factor (k = 1) with RUVs
ruvset <- RUVs(
x = dat, # Variance stabilized transformed counts data
cIdx = rownames(dat), # Vector of the genes
k = 1, # Number of unknown covariates to find
scIdx = diffs, # Matrix formatted variable of interest
isLog = TRUE, # The variance stabilized transformation is approximately a log transformation
round = FALSE # Whether to round the normalized measures to form pseudocounts
)
# W is the estimation of the unknown source of variation that we can now add to DESeq2
vars <- ruvset$W
# Create a new colData with W added to it
new_cdata <- cbind(coldata, vars)
# New formula with W
formula <- as.formula(paste0(
"~ ",
paste0(
colnames(new_cdata)[grepl("W", colnames(new_cdata))],
collapse = " + "
), " + ", column
))
# Overwrite the normalized count matrix with the normalized counts from RUVSeq
# NOTE: Only use this for visualization
norm_matrix <- ruvset$normalizedCounts
# Create a PCA
pca2 <- degPCA(
counts = norm_matrix, # Provide the adjusted counts from RUVSeq
metadata = new_cdata, # Provide metadata
condition = column # Provide the variable of interest to color the plot by
) + ggtitle("After RUV") # Add a title
# Plot PCA
pca2 +
scale_color_grafify("kelly") # Apply color scale
```
```{r after_RUV, eval=run_ruv}
# Make DDS object
dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula)
# Apply VST
vsd_to_use <- vst(dds_to_use, blind = FALSE)
```
### Remove Batch Effects
```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq}
# NOTE: ComBat-seq (part of the sva package) produces corrected counts, removing batch effects while retaining the structure of the data. Use it when you know what the covariate/batch is. Do not add the covariates already removed by ComBat-seq to the DESeq2 formula. Also, do not attempt to remove a biological effect (e.g. donor); this is not conceptually valid. ComBat-seq is best suited for technical variation.
library(sva)
cat("Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest.
Combat-seq uses a negative binomial regression to model batch effects, providing adjusted data by mapping the original data to an expected distribution if there were no batch effects. The adjusted data preserves the integer nature of counts, so that it is compatible with the assumptions of state-of-the-art differential expression software (e.g. edgeR, DESeq2, which specifically request untransformed count data).")
```
```{r print-no-ruv-or-batch, eval=!run_combatseq, results='asis', echo=run_combatseq}
cat("There is no need to remove known factors like batch effect in this study.")
```
```{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq}
# NOTE: work on this code if you need to run ComBat-seq
# Set your batch effect variable here; this is the variable that ComBat-seq will try to remove
## Column name of your batch variable
to_remove <- "batch"
## Column name of your variable(s) of interest
to_keep <- "sample_type"
# Re-assign the column of coldata with the batch information as a factor
coldata[[to_remove]] <- as.factor(coldata[[to_remove]])
# Re-assign the column of coldata with the variable of interest as a factor
coldata[[to_keep]] <- as.factor(coldata[[to_keep]])
# Assign the column with batch information to the batch object
batch <- coldata[[to_remove]]
# Assign the column with variable of interest information to the treatment object
treatment <- coldata[[to_keep]]
# If you have multiple variables of interest you will need to assign each to a vector
# treatment1 <- metrics[[to_keep]]
# treatment2 <- metrics[[to_keep]]
# treatment3 <- metrics[[to_keep]]
# Column bind all of the vectors with columns containing variables of interest into one object
# imp <- cbind(as.numeric(as.character(treatment1)),as.numeric(as.character(treatment2)), as.numeric(as.character(treatment3)))
```
```{r do_combatseq, eval=run_combatseq}
# Run ComBat-seq
adjusted_counts <- ComBat_seq(
counts = as.matrix(counts), # Provide the counts matrix
batch = batch, # Provide the batch information
group = treatment # Provide the variable of interest information
)
# NOTE: Running ComBat-seq with multiple variables of interest
# adjusted_counts <- ComBat_seq(
# counts = as.matrix(counts), # Provide the counts matrix
# batch = batch, # Provide the batch information
# covar_mod = imp # Provide the variables of interest information
# )
```
```{r after_combatseq, eval=run_combatseq}
# Re-enter the adjusted counts from ComBat-seq into DESeq2
# NOTE: Make sure the formula doesn't contain the covariates used in ComBat-seq above
dds_to_use <- DESeqDataSetFromMatrix(
countData = adjusted_counts, # Provide the adjusted counts from ComBat-seq
colData = coldata, # Provide coldata
design = formula # Provide design formula, notably without the covariates used in ComBat-seq
)
# Perform a variance-stabilized transformation on the ComBat-seq adjusted count data
vsd_combat <- vst(dds_to_use, blind = FALSE)
# Retrieve the variance-stabilized transformed counts data and assign it to the object norm_matrix
norm_matrix <- assay(vsd_combat)
# Run degPCA to use prcomp and make a plot
pca_combat <- degPCA(
counts = norm_matrix, # Variance-stabilized transformed counts data after ComBat-seq
metadata = coldata, # Provide metadata
condition = column # Provide the variable of interest to color the plot by
) +
ggtitle("After Combatseq") # Add title
# Plot PCA from dgePCA
pca_combat +
scale_color_grafify("kelly") # Apply color scale
```
# Differential Expression
Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model.
Before fitting the model, we often look at a metric called dispersion, which is a measure of variance that also takes mean expression into account. A dispersion value is estimated for each individual gene, then 'shrunken' to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit.
We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model.
```{r run-deseq2}
# Perform differential expression analysis on the DDS object
de <- DESeq(dds_to_use)
# Plot the dispersion for the data
DESeq2::plotDispEsts(de)
```
Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level.
When there are multiple groups and conditions across groups, it is recommended to use a combined grouping (dummy) variable instead of interaction terms, as sketched below: https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#interactions.
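As a sketch of that approach (the `genotype` and `treatment` columns below are placeholders, not columns in this dataset), the levels of two factors can be pasted into a single grouping variable and used as the design, after which any pair of groups can be pulled out as a contrast:
```{r grouped-design-example, eval=FALSE}
# Sketch: combine two factors into a single group variable instead of an interaction term
# ("genotype" and "treatment" are placeholder column names; substitute your own)
coldata$group <- factor(paste(coldata$genotype, coldata$treatment, sep = "_"))
dds_group <- DESeqDataSetFromMatrix(counts, coldata, design = ~group)
dds_group <- DESeq(dds_group)
# Any pairwise comparison between group levels can then be extracted, e.g.:
results(dds_group, contrast = c("group", "mutant_treated", "mutant_untreated"))
```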
The LRT is useful for testing multiple terms at once, for example testing 3 or more levels of a factor at once, or all interactions between two variables. The LRT for count data is conceptually similar to an analysis of variance (ANOVA) calculation in linear regression, except that in the case of the Negative Binomial GLM, we use an analysis of deviance (ANODEV), where the deviance captures the difference in likelihood between a full and a reduced model.
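A minimal LRT sketch (not run here; the reduced formula is a placeholder to adapt to your design) looks like this:
```{r lrt-example, eval=FALSE}
# Sketch: likelihood ratio test comparing the full model to a reduced model
# Useful for time series, or for testing a multi-level factor as a whole
dds_lrt <- DESeq(dds_to_use, test = "LRT", reduced = ~1) # the reduced model drops the factor of interest
res_lrt <- results(dds_lrt) # p-values reflect the LRT across all levels of the dropped term
```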
```{r lfc-shrink}
# NOTE: We recommend LRT for time series
# resultsNames(de) # Check the order is right
# Create a list that mirrors the contrasts in the contrasts list object where the pattern is:
# <column_of_comparison>_<treatment_name>_vs_<control_name>
names_to_use <- lapply(contrasts, function(contrast) {
coef <- paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3])
})
# Currently the contrasts list object doesn't have names for the items in the list
# Assign the names from names_to_use to the names in the contrasts list object
names(contrasts) <- names_to_use
# Create a list called `de_list` to hold the differential expression information for each contrast
# Go through each contrast in the contrasts list object and compute the differential expression results for it
de_list <- lapply(contrasts, function(contrast) {
# Extract the results (LFC, pvalue, padj, etc.) for the contrast
resLFC <- results(de, contrast = contrast)
# Create a string for the contrast following this pattern:
# <column_of_comparison>_<treatment_name>_vs_<control_name>
coef <- paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3])
# Apply LFC shrinkage with `ashr`
# NOTE: Use `ashr` for comparisons with many groups to be able to pull out all the contrasts; otherwise `apeglm` is fine. It shrinks less.
resLFCS <- lfcShrink(
dds = de, # Provide the DDS object
contrast = contrast, # Provide the contrast
type = "ash" # Use `ashr` to apply the shrinkage
)
# If you want to use `apeglm` instead of `ashr` then use:
# resLFCS <- lfcShrink(
# dds = de, # Provide the DDS object
# coef = coef, # Provide the coefficient name
# type = "apeglm" # Use `apeglm` to apply the shrinkage
# )
# Create a new results data frame with some extra information
res <- as.data.frame(resLFCS) %>% # Extract the results from the resLFCS as a dataframe
rownames_to_column("gene_id") %>% # Move the rownames to a column of data called "gene_id"
left_join(rdata, by = "gene_id") %>% # Combine the results with the rdata object by the gene_id and rdata holds information about each gene
relocate(gene_name) %>% # Move the "gene_name" column to be the first column
dplyr::rename(lfc = log2FoldChange) %>% # Rename the column log2FoldChange to lfc
mutate(pi = abs(lfc) * -log10(padj)) %>% # Create a new column named pi that multiplies the absolute value of the log fold change by the -log10 of the adjusted p-value
arrange(-pi) # Arrange the data frame by the highest values of pi
# Filter out genes that have no expression or were filtered out by DESEQ2
res <- res[res$baseMean > 0, ] %>% # Remove genes with no expression
drop_na(padj) %>% # Remove genes that don't have an adjusted p-value (mostly lowly expressed genes)
drop_na(pvalue) # Remove genes that don't have a p-value (mostly lowly expressed genes)
# Extract the significant genes
res_sig <- res %>%
filter(padj < 0.05) %>% # Retain the values that have an adjusted p-value less than 0.05
arrange(padj) %>% # Arrange the genes by the lowest adjusted p-value
mutate(gene_name = ifelse(test = gene_name == "" | is.na(gene_name), # If the gene_name value for a gene is NA or blank then
yes = gene_id, # Use the gene_id as the gene_name
no = gene_name # Maintain the gene_name
))
# Create a list object called results
results <- list(
lfc = resLFC, # Assign the unshrunken results data to `lfc`
lfcs = resLFCS, # Assign the shrunken results data to `lfcs`
all = res, # Assign the full differential expression results to `all`
sig = res_sig # Assign the significantly differentially expressed genes to `sig`
)
# Return the `results` list object to be a list within the object `de_list`
return(results)
})
# NOTE: If you manually add any other comparison to the list with the following variables, the code below will make the plots for those as well:
# de_list <- c(de_list, new_comparison=list(lfc=resLFC,
# lfcs=resLFCS,
# all=res, sig=res_sig))
```
## MA plot
This plot can help to:
- Identify Differential Expression: Genes that show a significant log-fold change (M value away from 0) indicate changes in expression between conditions.
- Assess Data Quality: The plot can help in identifying biases or systematic errors in the data. Ideally, most points should scatter around the M=0 line, indicating that there is no significant systematic difference between the conditions.
- Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of expression levels and any patterns or anomalies in the dataset.
::: {.panel-tabset}
```{r after_lfc_shrink, message=F, warning=F}
#| results: asis
#| fig-width: 4
#| fig-height: 4
# For each contrast in the de_list list object, create MA plots
for (contrast in names(de_list)) {
# Create a tabset of the contrast
cat("### ", contrast, "\n\n")
# Use the unshrunken results (lfc) to create an MA plot
p1 <- degMA(as.DEGSet(de_list[[contrast]]$lfc)) +
ggtitle("Before LFC Shrinking") # Add title
# Print the MA plot of the unshrunken results
print(p1)
# Use the shrunken results (lfcs) to create an MA plot
p2 <- degMA(as.DEGSet(de_list[[contrast]]$lfcs), limit = 2) +
ggtitle("After LFC Shrinking") # Add title
print(p2) # Print the MA plot of the shrunken results
cat("\n\n") # Print two new lines
}
```
:::
## Differentially Expressed Genes
```{r}
#| results: asis
dt_list <- list()
for (contrast in names(de_list)) {
res_sig <- de_list[[contrast]][["sig"]]
dt_list <- c(
dt_list,
list(h3(contrast)),
list(DT::datatable(res_sig,
options = list(
scrollX = TRUE, autoWidth = TRUE
),
class = "stripe hover"
))
)
}
tagList(dt_list)
```
## Volcano plot
This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are genes that are significantly differentially expressed with a large effect (padj < 0.05 and |log2 fold change| > 0.5). Points in pink are significant with a small effect (padj < 0.05 and |log2 fold change| < 0.5), and points in blue are not significant but have a large effect (padj > 0.05 and |log2 fold change| > 0.5). Grey points are neither significant nor large-effect. The dashed lines correspond to the padj and fold-change cutoffs that we have chosen.
::: {.panel-tabset}
```{r}
#| results: asis
#| fig-height: 6
for (contrast in names(de_list)) {
cat("### ", contrast, "\n\n")
res <- de_list[[contrast]][["all"]]
res_mod <- res
show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")])
p1 <- EnhancedVolcano(res_mod,
lab = res_mod$gene_name,
pCutoff = 0.05,
selectLab = c(show$gene_name),
FCcutoff = 0.5,
x = "lfc",
y = "padj",
title = contrast,
col = c("darkgrey", "lightblue", "plum1", "purple"),
subtitle = "",
drawConnectors = TRUE,
max.overlaps = Inf
)
print(p1)
cat("\n\n")
}
```
:::
## Heatmap
::: {.panel-tabset}
```{r}
#| results: asis
for (contrast in names(de_list)) {
cat("### ", contrast, "\n\n")
res_sig <- de_list[[contrast]][["sig"]]
con <- contrasts[[contrast]]
to_test <- subset(coldata, coldata[[column]] %in% c(con[2], con[3]))
ma <- norm_matrix[res_sig$gene_id, to_test$sample]
colma <- coldata[, c(column), drop = FALSE]
ma_colors <- lapply(colnames(colma), function(c) {
l.col <- colors[1:length(unique(colma[[c]]))]
names(l.col) <- sort(unique(colma[[c]]))
l.col
})
names(ma_colors) <- colnames(colma)
p1 <- pheatmap(ma,
color = inferno(10),
cluster_rows = TRUE,
show_rownames = FALSE,
annotation = colma,
annotation_colors = ma_colors,
border_color = NA,
fontsize = 10,
scale = "row",
fontsize_row = 10,
height = 20
)
print(p1)
cat("\n\n")
}
```
:::
## Plot Top 16 Genes
::: {.panel-tabset}
```{r}
#| results: asis
#| fig-height: 6
#| fig-width: 8
n <- 16
for (contrast in names(de_list)) {
cat("### ", contrast, "\n\n")
res_sig <- de_list[[contrast]][["sig"]]
top_n <- res_sig %>%
slice_min(order_by = padj, n = n, with_ties = FALSE) %>%
dplyr::select(gene_name, gene_id)
top_n_exp <- norm_matrix %>%
as.data.frame() %>%
rownames_to_column("gene_id") %>%
pivot_longer(cols = !gene_id, names_to = "sample", values_to = "log2_expression") %>%
right_join(y = top_n, relationship = "many-to-many") %>%
left_join(y = coldata, by = "sample")
con <- contrasts[[contrast]]
top_n_exp <- subset(top_n_exp, top_n_exp[[column]] %in% c(con[2], con[3]))
p1 <- ggplot(top_n_exp, aes_string(x = column, y = "log2_expression")) +
geom_boxplot(outlier.shape = NA, linewidth = 0.5, color = "grey") +
geom_point() +
facet_wrap(~gene_name) +
ggtitle(str_interp("Expression of Top ${n} DEGs")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
print(p1)
cat("\n\n")
}
```
:::
# Pathway Enrichment
From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment.
```{r setup-fa}
# Call the appropriate functional analysis databases for your analysis
if (grepl("hg|human", genome, ignore.case = TRUE)) { # If the genome value in the params section of the YAML contains hg or human, then
all_in_life <- get_databases_v2(sps = "human") # Get the human pathway databases using the get_databases_v2 function form the FA.R file within the 00_libs directory
run_FA <- TRUE # Assign the value of TRUE to a object called run_FA and it will allow the functional analysis sections to run
} else if (grepl("mm|mouse", genome, ignore.case = TRUE)) { # If the genome value in the params section of the YAML contains mm or mouse, then
all_in_life <- get_databases_v2(sps = "mouse") # Get the mouse pathway databases using the get_databases_v2 function form the FA.R file within the 00_libs directory
run_FA <- TRUE # Assign the value of TRUE to a object called run_FA and it will allow the functional analysis sections to run
} else { # If your organism isn't mouse or human then
warning("If you are working on an organism other than human or mouse: This pathway enrichment is based on annotations from MSigDB https://www.gsea-msigdb.org/gsea/msigdb which only has human and mouse collections. For non-model organism pathway analysis, please see the template rnaseq/03_functional/Nonmodel_Organism_Pathway_Analysis.Rmd") # Print this error message
run_FA <- FALSE # Assign the value of FALSE to an object called run_FA, which prevents the functional analysis sections from running
}
```
# Pathway Analysis- GSEA
Gene Set Enrichment Analysis (GSEA) is a computational method used to determine whether a predefined set of genes shows statistically significant, concordant differences between two biological states (e.g., disease vs. normal) in RNA-seq data or other types of gene expression data. Advantages of GSEA:
- Biological Insight: Helps in understanding the underlying biological processes and pathways affected, rather than focusing on individual genes.
- Incorporation of Prior Knowledge: Utilizes predefined gene sets, allowing integration of existing biological knowledge.
- Contextual Relevance: Can reveal subtle but coordinated changes in biologically meaningful gene sets that might not be apparent when looking at individual genes.
**In the GSEA results, NES (normalized enrichment score) refers to the direction of regulation of the pathway. NES < 0 means downregulated, while NES > 0 means upregulated.**
```{r run-gsea, warning=F, message=F, eval=run_FA}
# Go through each contrast in the de_list list object and assign data frames of significantly enriched pathways to different elements of the list
fa_gsea_list <- lapply(de_list, function(contrast) {
# For a given contrast pull out the full differential expression results
res <- contrast[["all"]]
# Creating the input for the GSEA analysis
gsea_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
filter(!is.na(padj)) %>% # Remove any adjusted p-values that are NA (there should not be any, since they were filtered when making the de_list object)
dplyr::select(gene_id, lfc) # Extract the columns "gene_id" and "lfc"
# Obtain the ENTREZ IDs and gene symbols for the ENSEMBL gene IDs in the gsea_input object
# NOTE: Change to the correct species if not working in human or mouse
input_entrezid <- AnnotationDbi::select(
x = ann.org, # Annotation database to search
keys = gsea_input$gene_id, # The gene_ids to search for
keytype = "ENSEMBL", # The gene_ids (keys) are ENSEMBL annotations
columns = c("ENTREZID", "SYMBOL")
) # Return the queried ENSEMBL ID, along with the ENTREZ ID and the gene symbol
# Combine gene annotation with the lfc information
input_entrezid <- inner_join(
x = gsea_input, # Combine the ENSEMBL ID and lfc input with
y = input_entrezid, # The ENTREZ ID and gene symbol information
by = c("gene_id" = "ENSEMBL")
) %>% # Join these table together by common ENSEMBL IDs
filter(!is.na(ENTREZID)) %>% # Remove the genes without an ENTREZ ID
distinct(ENTREZID, .keep_all = TRUE) %>% # Retain the unique ENTREZ IDs and all columns of data
arrange(desc(lfc)) # Arrange the data frame by descending values of lfc
# Run the run_fgsea_v2 function found in FA.R within the 00_libs directory to run the GSEA analysis and assign the output to the object tb
tb <- run_fgsea_v2(
input = input_entrezid, # Query for the GSEA analysis
all_in_life = all_in_life
) # Annotation databases to query against
tb %>%
filter(padj < 0.05) %>%
arrange(padj) # Filter and sort the GSEA output for results with an adjusted p-value less than 0.05 and return them
})
```
```{r print-gsea, results='asis', eval=run_FA}
# NOTE: DT::datatables doesn't work with tabset and for loops
# You can use the following code to print dynamically or call manually sanitize_datatable() multiple times
# Create an empty list to assign contrast values and DT data tables to
dt_list <- list()
# For each contrast in the de_list list object, extract the name of the contrast and the GSEA results to be rendered into an HTML table
for (contrast in names(de_list)) {
# Assign the significantly enriched pathways to the object res_sig
res_sig <- fa_gsea_list[[contrast]] %>% # Use the GSEA object for a given contrast. NOTE: These have already been filtered for adjusted p-value less than 0.05.
filter(padj < 0.05) %>% arrange(padj) # Filter and sort for adjusted p-values less than 0.05
# Populate the dt_list list
dt_list <- c(
dt_list, # With the contents of dt_list from the previous loop
list(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 size
list(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly enriched pathways
)
}
# Render the dt_list into HTML
tagList(dt_list)
```
# Pathway Analysis- Over-representation
Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially expressed genes (DEGs) from RNA-seq data. Advantages of ORA:
- Simplicity: Easy to perform and interpret.
- Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied.
- Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets.
```{r run-ora, warning=F, message=F, eval=run_FA}
# Go through each contrast in the de_list list object and assign the ORA results (all, up and down) to different elements of the list
fa_list <- lapply(de_list, function(contrast) {
# For a given contrast pull out the full differential expression results
res <- contrast[["all"]]
# Extract the universe of genes to run the ORA on and assign it to the object universe
universe <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
filter(!is.na(padj)) %>% # Remove any adjusted p-values that are NA (there should not be any, since they were filtered when making the de_list object)
pull(gene_id) # Extract the gene_id column
# Subset the full annotations to only contain universe we will evaluate within
universe_mapping <- rdata %>% # Start with the full annotations
filter(gene_id %in% universe, !is.na(entrez)) # Only retain the annotations that are in the universe of genes (baseMean greater than 0, along with defined p-value and adjusted p-value) and have an ENTREZ ID
# AnnotationDbi::select(ann.org, universe, 'ENSEMBL', columns=c('ENTREZID', 'SYMBOL'))
# Create the input vector of genes for the ORA with a positive or negative lfc
ora_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
filter(!is.na(padj), padj < 0.01, abs(lfc) > 0.3) %>% # Keep genes with a defined adjusted p-value less than 0.01 and an absolute lfc greater than 0.3
pull(gene_id) # Extract the gene ID
# Create the ENTREZ ID input for the ORA
# NOTE: Change to the correct species if not working in human or mouse
input_entrezid <- rdata %>% # Start with the full annotations
filter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the annotations that are in the ora_input object (adjusted p-value less than 0.01, absolute lfc greater than 0.3) and have an ENTREZ ID
# AnnotationDbi::select(ann.org, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))
# Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory
all <- run_fora_v2(
input = input_entrezid, # Data frame of genes that are being evaluated for enrichment
uni = universe_mapping, # Universe of genes to look for enrichment within
all_in_life = all_in_life
) # Annotation databases to query against
# Create the input vector of genes for the ORA with a positive lfc
ora_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
filter(!is.na(padj), padj < 0.01, lfc > 0.3) %>% # Keep genes with a defined adjusted p-value less than 0.01 and an lfc greater than 0.3
pull(gene_id) # Extract the gene ID
# Create the ENTREZ ID input for the ORA
# NOTE: Change to the correct species if not working in human or mouse
input_entrezid <- rdata %>% # Start with the full annotations
filter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the annotations that are in the ora_input object (adjusted p-value less than 0.01 and lfc greater than 0.3) and have an ENTREZ ID
# Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory
up <- run_fora_v2(
input = input_entrezid, # Data frame of genes that are being evaluated for enrichment
uni = universe_mapping, # Universe of genes to look for enrichment within
all_in_life = all_in_life
) # Annotation databases to query against
# Create the input vector of genes for the ORA with a negative lfc
ora_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
filter(!is.na(padj), padj < 0.01, lfc < -0.3) %>% # Keep genes with a defined adjusted p-value less than 0.01 and an lfc less than -0.3
pull(gene_id) # Extract the gene ID
# Create the ENTREZ ID input for the ORA
# NOTE: Change to the correct species if not working in human or mouse
input_entrezid <- rdata %>% # Start with the full annotations
filter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the annotations that are in the ora_input object (adjusted p-value less than 0.01 and lfc less than -0.3) and have an ENTREZ ID
# Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory
down <- run_fora_v2(
input = input_entrezid, # Data frame of genes that are being evaluated for enrichment
uni = universe_mapping, # Universe of genes to look for enrichment within
all_in_life = all_in_life
) # Annotation databases to query against
# Create a list to hold the ORA results for a given contrast
list(
all = all, # Assign the ORA results for both positive and negative lfc to "all"
up = up, # Assign the ORA results for positive lfc to "up"
down = down # Assign the ORA results for negative lfc to "down"
)
})
```
## All significant genes
```{r}
#| results: asis
#| eval: !expr run_FA
# NOTE DT::datatables doesn't work with tabset and for loops
# You can use the following code to print dynamically or call manually sanitize_datatable() multiple times
dt_list <- list()
for (contrast in names(de_list)) {
res_sig <- fa_list[[contrast]][["all"]] %>%
filter(padj < 0.05) %>%
arrange(padj) # Filter and sort
dt_list <- c(
dt_list,
list(h3(contrast)),
list(sanitize_datatable(res_sig))
)
}
tagList(dt_list)
```
## Down-regulated genes
```{r print-ora-down}
#| results: asis
#| eval: !expr run_FA
# Create an empty list to assign contrast values and DT data tables to
dt_list <- list()
# For each contrast in the de_list list object, extract the name of the contrast and the ORA results to be rendered into an HTML table
for (contrast in names(de_list)) {
# Create a dataframe to hold the significantly over-represented terms for genes with a negative lfc
res_sig <- fa_list[[contrast]][["down"]] %>% # Start with the complete list of terms used in the ORA evaluated for negative lfc
filter(padj < 0.05) %>% arrange(padj) # Filter and sort only the results with adjusted p-values less than 0.05
# Populate the dt_list list
dt_list <- c(
dt_list, # With the contents of dt_list from the previous loop
list(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 size
list(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly over-represented terms for genes with a negative lfc
)
}
# Render the dt_list into HTML for the significantly over-represented terms for genes with a negative lfc
tagList(dt_list)
```
## Up-regulated genes
```{r print-ora-up}
#| results: asis
#| eval: !expr run_FA
# Create an empty list to assign contrast values and DT data tables to
dt_list <- list()
# For each contrast in the de_list list object, extract the name of the contrast and the ORA results to be rendered into an HTML table
for (contrast in names(de_list)) {
# Create a dataframe to hold the significantly over-represented terms for genes with a positive lfc
res_sig <- fa_list[[contrast]][["up"]] %>% # Start with the complete list of terms used in the ORA evaluated for positive lfc
filter(padj < 0.05) %>% arrange(padj) # Filter and sort only the results with adjusted p-values less than 0.05
# Populate the dt_list list
dt_list <- c(
dt_list, # With the contents of dt_list from the previous loop
list(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 size
list(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly over-represented terms for genes with a positive lfc
)
}
# Render the dt_list into HTML for the significantly over-represented terms for genes with a positive lfc
tagList(dt_list)
```
# Save files
```{r write-files}
# If the subset_value and subset_column params in the YAML are both not null then
if (!is.null(subset_value) & !is.null(subset_column)) {
# Interpret the value of subset_value and assign it to the object called filenames
filenames <- str_interp("${subset_value}")
} else { # If the subset value from the params in the YAML is null then
# Assign the calue of "full" to the object called filenames
filenames <- "full"
}
# Assign the value of filenames to a new object called filename
filename <- paste0(filenames)
# Assign the file path to save the file path for the expression data CSV to the object name_expression_fn
name_expression_fn <- file.path( # Construct a file path
basedir, # Base directory path to write output files to, specified in the params.R file within the 00_params directory
str_interp("${filename}_expression.csv") # Interpret the value of "filename" and insert that in front of "_expression.csv"
)
# Extract the variance stabilized transformed counts matrix into a data frame to be saved
counts_norm <- norm_matrix %>% # Start with the variance stabilized transformed counts matrix
as.data.frame() %>% # Convert the matrix to a data frame
# Convert the row names (ENSEMBL IDs) to a column called "gene_id"
rownames_to_column("gene_id") %>%
left_join(rdata, by = "gene_id")
# Write the variance stabilized transformed counts matrix to a CSV file with the name_expression_fn path
write_csv(counts_norm, name_expression_fn)
# For each of the contrasts in the contrasts list object, save the differential expression analysis and ORA results to their respective CSV files
for (contrast in names(contrasts)) {
# Create a file name for the differential expression analysis
name_deg_fn <- file.path( # Construct a file path
basedir, # Base directory path to write output files to, specified in the params.R file within the 00_params directory
str_interp("${filename}_${contrast}_deg.csv") # Interpret the value of "filename" and "contrast", then insert that in front of "_deg.csv"
)
# Create a file name for the ORA analysis
name_pathways_fn <- file.path( # Construct a file path
basedir, # Base directory path to write output files to, specified in the params.R file within the 00_params directory
str_interp("${filename}_${contrast}_pathways.csv") # Interpret the value of "filename" and "contrast", then insert that in front of "_pathways.csv"
)
# Assign the full differential expression analysis for a given contrast to the object called res_for_writing
res_for_writing <- de_list[[contrast]][["all"]] %>% # Start with the full differential expression analysis dataframe for the given contrast
mutate(comparison = contrast) # Add a column to the table with the given contrast
# Assign the full ORA for positive and negative lfc for a given contrast to the object called pathways_for_writing
# NOTE: Choose what ORA to save (all, down or up). To save everything, add more lines of this code
pathways_for_writing <- fa_list[[contrast]][["all"]] %>% # Start with the full ORA dataframe evaluating negative and positive lfc for the given contrast
mutate(comparison = contrast) # Add a column to the table with the given contrast
# Write the full differential expression analysis for a given contrast to a CSV file with the name_deg_fn path
write_csv(res_for_writing, name_deg_fn)
# Write the full ORA for a given contrast to a CSV file with the name_pathways_fn path
write_csv(pathways_for_writing, name_pathways_fn)
}
```
If you would like to use the `DEGpatterns.Rmd` template to visualize trends of gene changes across conditions/groups, please save the following output:
```{r save}
# Save the DESeq object to an RDS file
saveRDS(dds_to_use, "DEGpattern_deseq_obj.rds")
# Save the metadata to an RDS file
saveRDS(coldata, "deseq_coldata.rds")
# Set the adjusted p-value threshold to 0.05
padj.cutoff <- 0.05
# Assign the number of top genes to keep to topN
topN <- 1000
# Go through each contrast in the de_list list object and extract the adjusted p-values for the top N significant genes
# This will go through the de_list object, assign the name of the contrast in the de_list to contrast and assign its associated differential expression list (containing lfc, lfcs, all and sig) to the object contrast_de
deg_List <- imap(de_list, \(contrast_de, contrast){
# Extract the significant genes from a given contrast and assign them to res_LRT
res_LRT <- contrast_de[["sig"]]
# Create a named vector where the values are the adjusted p-values and the names are the gene IDs
gene_padj <- setNames(res_LRT$padj, res_LRT$gene_id)
# Subset the gene_padj vector by the adjusted p-value threshold established in the top of this code block
gene_sig <- gene_padj[gene_padj < padj.cutoff]
# Arrange the gene significance values with the most significant first (they should already be arranged this way though), then keep either the first N genes defined by topN at the top of this code block or the total number of significant genes, whichever is fewer and assign it to the object called degs
degs <- sort(gene_sig)[1:min(length(gene_sig), topN)]
# Save the RDS object for the significance values for a given contrast, adjusted p-value threshold and number of top N genes
saveRDS(degs, glue("DEGpattern_deseq_{contrast}_DEGs_padj{padj.cutoff}_topN{topN}.rds"))
# Populate the deg_List with the degs vector for each contrast
return(degs)
})
# Please note, you can choose to combine all significant results across contrasts as well
```
# Conclusion
# Methods
## R package references
```{r citations, results='asis'}
# Provide citations
citation("DESeq2")
citation("ggplot2")
citation("pheatmap")
```
## R session
List and version of tools used for the DE report generation.
```{r}
# Print sessionInfo()
sessionInfo()
```